author    | Rico Sennrich <rico.sennrich@gmx.ch> | 2015-07-17 21:39:42 +0300
committer | Rico Sennrich <rico.sennrich@gmx.ch> | 2015-07-17 21:39:42 +0300
commit    | a7da1b618082964152054b00c142e5962e4ca692 (patch)
tree      | 45872fc848d3729e8632af0ffdc431726e39e7a2
parent    | 28bdadf328c63ee086e8aa5de23cfe0c11728c5b (diff)
parent    | c461c4ad7232274dab8405b736bb1ac55cc7874d (diff)
Merge pull request #5 from graehl/master
c++11
-rw-r--r-- | .gitignore                 |    2
-rw-r--r-- | src/Activation_function.h  |  129
-rw-r--r-- | src/Makefile               |    4
-rw-r--r-- | src/SoftmaxLoss.h          |  159
-rw-r--r-- | src/USCMatrix.h            |  227
-rw-r--r-- | src/find_string.hpp        |   89
-rw-r--r-- | src/graphClasses.h         |   89
-rw-r--r-- | src/model.cpp              |  482
-rw-r--r-- | src/neuralClasses.h        | 1794
-rw-r--r-- | src/neuralLM.h             |  213
-rw-r--r-- | src/neuralNetwork.h        |  319
-rw-r--r-- | src/neuralTM.h             |  222
-rw-r--r-- | src/prepareNeuralLM.cpp    | 1057
-rw-r--r-- | src/propagator.h           |  641
-rw-r--r-- | src/replace_digits.hpp     |   62
-rw-r--r-- | src/testNeuralLM.cpp       |  279
-rw-r--r-- | src/trainNeuralNetwork.cpp |  229
-rw-r--r-- | src/types.hpp              |   41
-rw-r--r-- | src/util.h                 |  281
-rw-r--r-- | src/vocabulary.h           |  130
20 files changed, 3304 insertions, 3145 deletions
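
Among the changes below, the new src/find_string.hpp lets a boost::unordered_map keyed by std::string be probed with a [begin, end) character range, avoiding a temporary std::string per lookup. A minimal usage sketch (the map contents and token buffer here are illustrative, and Boost headers are assumed to be on the include path):

    #include <iostream>
    #include <string>
    #include <utility>
    #include <boost/unordered_map.hpp>
    #include "find_string.hpp"

    int main() {
        boost::unordered_map<std::string, int> vocab;
        vocab["hello"] = 42;

        const char* buf = "hello world";
        // Probe with the slice [buf, buf+5) -- no temporary std::string is built.
        std::pair<const char*, const char*> token(buf, buf + 5);
        boost::unordered_map<std::string, int>::const_iterator it = find_string(vocab, token);
        if (it != vocab.end())
            std::cout << it->first << " -> " << it->second << "\n";
        return 0;
    }

This relies on Boost.Unordered's compatible-key find(key, hash, equal) overload, which the header wraps with slice_or_string_hash and slice_or_string_eq.
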
@@ -8,3 +8,5 @@ src/prepareNeuralTM src/testNeuralLM src/testNeuralNetwork src/trainNeuralNetwork +.history +src/make.sh diff --git a/src/Activation_function.h b/src/Activation_function.h index 66342bb..742c2fc 100644 --- a/src/Activation_function.h +++ b/src/Activation_function.h @@ -3,7 +3,6 @@ #include <cmath> #include <string> -//#include <../3rdparty/Eigen/Dense> #include <Eigen/Dense> #include "util.h" @@ -19,28 +18,28 @@ enum activation_function_type { Tanh, HardTanh, Rectifier, Identity, InvalidFunc inline activation_function_type string_to_activation_function (const std::string &s) { - if (s == "identity") - return Identity; - else if (s == "rectifier") - return Rectifier; - else if (s == "tanh") - return Tanh; - else if (s == "hardtanh") - return HardTanh; - else - return InvalidFunction; + if (s == "identity") + return Identity; + else if (s == "rectifier") + return Rectifier; + else if (s == "tanh") + return Tanh; + else if (s == "hardtanh") + return HardTanh; + else + return InvalidFunction; } inline std::string activation_function_to_string (activation_function_type f) { - if (f == Identity) - return "identity"; - else if (f == Rectifier) - return "rectifier"; - else if (f == Tanh) - return "tanh"; - else if (f == HardTanh) - return "hardtanh"; + if (f == Identity) + return "identity"; + else if (f == Rectifier) + return "rectifier"; + else if (f == Tanh) + return "tanh"; + else if (f == HardTanh) + return "hardtanh"; } struct hardtanh_functor { @@ -69,51 +68,53 @@ struct drectifier_functor { class Activation_function { - int size; - activation_function_type f; - - public: - Activation_function() : size(0), f(Rectifier) { } - - void resize(int size) { this->size = size; } - void set_activation_function(activation_function_type f) { this->f = f; } - - template <typename Engine> - void initialize(Engine &engine, bool init_normal, double init_range) { } - - int n_inputs () const { return size; } - int n_outputs () const { return size; } - - template <typename DerivedIn, typename DerivedOut> - void fProp(const MatrixBase<DerivedIn> &input, const MatrixBase<DerivedOut> &output) const - { - UNCONST(DerivedOut, output, my_output); - - switch (f) - { - case Identity: my_output = input; break; - case Rectifier: my_output = input.unaryExpr(rectifier_functor()); break; - case Tanh: my_output = input.unaryExpr(tanh_functor()); break; - case HardTanh: my_output = input.unaryExpr(hardtanh_functor()); break; - } - } - - template <typename DerivedGOut, typename DerivedGIn, typename DerivedIn, typename DerivedOut> - void bProp(const MatrixBase<DerivedGOut> &input, - MatrixBase<DerivedGIn> &output, - const MatrixBase<DerivedIn> &finput, - const MatrixBase<DerivedOut> &foutput) const - { - UNCONST(DerivedGIn, output, my_output); - - switch (f) - { - case Identity: my_output = input; break; - case Rectifier: my_output = finput.array().unaryExpr(drectifier_functor()) * input.array(); break; - case Tanh: my_output = foutput.array().unaryExpr(tanh_functor()) * input.array(); break; - case HardTanh: my_output = finput.array().unaryExpr(hardtanh_functor()) * input.array(); break; - } - } + int size; + activation_function_type f; + + public: + Activation_function() : size(0), f(Rectifier) { } + + void resize(int size) { this->size = size; } + void set_activation_function(activation_function_type f) { this->f = f; } + + template <typename Engine> + void initialize(Engine &engine, bool init_normal, double init_range) { } + + int n_inputs () const { return size; } + int n_outputs () const { return size; 
} + + template <typename DerivedIn, typename DerivedOut> + void fProp(const MatrixBase<DerivedIn> &input, const MatrixBase<DerivedOut> &output) const + { + UNCONST(DerivedOut, output, my_output); + + switch (f) + { + case Identity: my_output = input; break; + case Rectifier: my_output = input.unaryExpr(rectifier_functor()); break; + case Tanh: my_output = input.unaryExpr(tanh_functor()); break; + case HardTanh: my_output = input.unaryExpr(hardtanh_functor()); break; + case InvalidFunction: std::abort(); + } + } + + template <typename DerivedGOut, typename DerivedGIn, typename DerivedIn, typename DerivedOut> + void bProp(const MatrixBase<DerivedGOut> &input, + MatrixBase<DerivedGIn> &output, + const MatrixBase<DerivedIn> &finput, + const MatrixBase<DerivedOut> &foutput) const + { + UNCONST(DerivedGIn, output, my_output); + + switch (f) + { + case Identity: my_output = input; break; + case Rectifier: my_output = finput.array().unaryExpr(drectifier_functor()) * input.array(); break; + case Tanh: my_output = foutput.array().unaryExpr(tanh_functor()) * input.array(); break; + case HardTanh: my_output = finput.array().unaryExpr(hardtanh_functor()) * input.array(); break; + case InvalidFunction: std::abort(); + } + } }; } // namespace nplm diff --git a/src/Makefile b/src/Makefile index 1611ccb..2a27405 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,12 +1,12 @@ ### Compilation options. # C++ compiler. Tested with g++ and Intel icpc. -CXX=/usr/bin/g++ +CXX=g++ #CXX=icpc # Compiler options. Note that -DEIGEN_NO_DEBUG is essential for good performance! #CFLAGS=-g -CFLAGS=-O3 -DEIGEN_NO_DEBUG -DNDEBUG +CFLAGS=-O3 -DEIGEN_NO_DEBUG -DNDEBUG $(CXXFLAGS) # Architecture. Set to x86_64 or i686 to override. ARCH:=$(shell uname -m) diff --git a/src/SoftmaxLoss.h b/src/SoftmaxLoss.h index bc55762..d89cde6 100644 --- a/src/SoftmaxLoss.h +++ b/src/SoftmaxLoss.h @@ -1,7 +1,6 @@ - #ifndef SOFTMAXLOSS_H +#ifndef SOFTMAXLOSS_H #define SOFTMAXLOSS_H -//#include <../3rdparty/Eigen/Dense> #include <Eigen/Dense> #include "multinomial.h" #include "util.h" @@ -20,55 +19,55 @@ enum loss_function_type { LogLoss, NCELoss, InvalidLoss }; inline loss_function_type string_to_loss_function (const std::string &s) { - if (s == "log") - return LogLoss; - else if (s == "nce") - return NCELoss; - else - return InvalidLoss; + if (s == "log") + return LogLoss; + else if (s == "nce") + return NCELoss; + else + return InvalidLoss; } inline std::string loss_function_to_string (loss_function_type f) { - if (f == LogLoss) - return "log"; - else if (f == NCELoss) - return "nce"; + if (f == LogLoss) + return "log"; + else if (f == NCELoss) + return "nce"; } /// Note: Outputs log-probabilities. 
struct SoftmaxLogLoss { - template <typename DerivedI, typename DerivedW, typename DerivedO> - void fProp(const MatrixBase<DerivedI> &input, const MatrixBase<DerivedW> &output_words, const MatrixBase<DerivedO> &output_const, double &loss) + template <typename DerivedI, typename DerivedW, typename DerivedO> + void fProp(const MatrixBase<DerivedI> &input, const MatrixBase<DerivedW> &output_words, const MatrixBase<DerivedO> &output_const, double &loss) + { + UNCONST(DerivedO, output_const, output); + + double log_likelihood = 0.0; + +#pragma omp parallel for reduction(+:log_likelihood) + for (int train_id = 0; train_id < input.cols(); train_id++) { - UNCONST(DerivedO, output_const, output); - - double log_likelihood = 0.0; - - #pragma omp parallel for reduction(+:log_likelihood) - for (int train_id = 0; train_id < input.cols(); train_id++) - { - double normalization = logsum(input.col(train_id)); - output.col(train_id).array() = input.col(train_id).array() - normalization; - log_likelihood += output(output_words(train_id), train_id); - } - loss = log_likelihood; + double normalization = logsum(input.col(train_id)); + output.col(train_id).array() = input.col(train_id).array() - normalization; + log_likelihood += output(output_words(train_id), train_id); } - - template <typename DerivedW, typename DerivedO, typename DerivedI> - void bProp(const MatrixBase<DerivedW> &output_words, const MatrixBase<DerivedO> &output, const MatrixBase<DerivedI> &grad_input_const) + loss = log_likelihood; + } + + template <typename DerivedW, typename DerivedO, typename DerivedI> + void bProp(const MatrixBase<DerivedW> &output_words, const MatrixBase<DerivedO> &output, const MatrixBase<DerivedI> &grad_input_const) + { + UNCONST(DerivedI, grad_input_const, grad_input); + grad_input.setZero(); +#pragma omp parallel for + for (int train_id = 0; train_id < output.cols(); train_id++) { - UNCONST(DerivedI, grad_input_const, grad_input); - grad_input.setZero(); - #pragma omp parallel for - for (int train_id = 0; train_id < output.cols(); train_id++) - { - grad_input(output_words(train_id), train_id) += 1.; - grad_input.col(train_id) -= output.col(train_id).array().exp().matrix(); - } + grad_input(output_words(train_id), train_id) += 1.; + grad_input.col(train_id) -= output.col(train_id).array().exp().matrix(); } + } }; ///// Softmax layer plus NCE loss function. @@ -81,55 +80,55 @@ struct SoftmaxLogLoss template <typename Multinomial> class SoftmaxNCELoss { - const Multinomial &unigram; + const Multinomial &unigram; -public: - SoftmaxNCELoss(const Multinomial &unigram) + public: + SoftmaxNCELoss(const Multinomial &unigram) : unigram(unigram) + { + } + + template <typename DerivedI, typename DerivedW, typename DerivedO> + void fProp(const MatrixBase<DerivedI> &scores, + const MatrixBase<DerivedW> &minibatch_samples, + const MatrixBase<DerivedO> &output_const, double &loss) + { + UNCONST(DerivedO, output_const, output); + double log_likelihood = 0.0; + int num_noise_samples = minibatch_samples.rows()-1; + double log_num_noise_samples = std::log(num_noise_samples); +#pragma omp parallel for reduction(+:log_likelihood) schedule(static) + for (int train_id = 0; train_id < scores.cols(); train_id++) { + for (int sample_id = 0;sample_id < minibatch_samples.rows(); sample_id++) + { + int sample = minibatch_samples(sample_id, train_id); + // To avoid zero or infinite probabilities, + // never take exp of score without normalizing first, + // even if it's a little slower... 
+ double score = scores(sample_id, train_id); + double score_noise = log_num_noise_samples + unigram.logprob(sample); + double z = logadd(score, score_noise); + double logprob = score - z; + double logprob_noise = score_noise - z; + output(sample_id, train_id) = std::exp(logprob); + log_likelihood += sample_id == 0 ? logprob : logprob_noise; + } } - - template <typename DerivedI, typename DerivedW, typename DerivedO> - void fProp(const MatrixBase<DerivedI> &scores, - const MatrixBase<DerivedW> &minibatch_samples, - const MatrixBase<DerivedO> &output_const, double &loss) - { - UNCONST(DerivedO, output_const, output); - double log_likelihood = 0.0; - int num_noise_samples = minibatch_samples.rows()-1; - double log_num_noise_samples = std::log(num_noise_samples); - #pragma omp parallel for reduction(+:log_likelihood) schedule(static) - for (int train_id = 0; train_id < scores.cols(); train_id++) - { - for (int sample_id = 0;sample_id < minibatch_samples.rows(); sample_id++) - { - int sample = minibatch_samples(sample_id, train_id); - // To avoid zero or infinite probabilities, - // never take exp of score without normalizing first, - // even if it's a little slower... - double score = scores(sample_id, train_id); - double score_noise = log_num_noise_samples + unigram.logprob(sample); - double z = logadd(score, score_noise); - double logprob = score - z; - double logprob_noise = score_noise - z; - output(sample_id, train_id) = std::exp(logprob); - log_likelihood += sample_id == 0 ? logprob : logprob_noise; - } - } - loss = log_likelihood; - } - - template <typename DerivedO, typename DerivedI> - void bProp(const MatrixBase<DerivedO> &probs, const MatrixBase<DerivedI> &output_const) + loss = log_likelihood; + } + + template <typename DerivedO, typename DerivedI> + void bProp(const MatrixBase<DerivedO> &probs, const MatrixBase<DerivedI> &output_const) + { + UNCONST(DerivedI, output_const, output); +#pragma omp parallel for schedule(static) + for (int train_id = 0; train_id < probs.cols(); train_id++) { - UNCONST(DerivedI, output_const, output); - #pragma omp parallel for schedule(static) - for (int train_id = 0; train_id < probs.cols(); train_id++) - { - output.col(train_id) = -probs.col(train_id); - output(0, train_id) += 1.0; - } + output.col(train_id) = -probs.col(train_id); + output(0, train_id) += 1.0; } + } }; } // namespace nplm diff --git a/src/USCMatrix.h b/src/USCMatrix.h index 02aeb33..784fa1b 100644 --- a/src/USCMatrix.h +++ b/src/USCMatrix.h @@ -1,7 +1,6 @@ #ifndef USCMATRIX_H #define USCMATRIX_H -//#include <../3rdparty/Eigen/Dense> #include <Eigen/Dense> #include "maybe_omp.h" #include "util.h" @@ -34,108 +33,108 @@ template <typename Scalar, typename Index=int> // should be EIGEN_DEFAULT_DENSE_ class USCMatrix { -public: - Matrix<Index,Dynamic,Dynamic> indexes; - Matrix<Scalar,Dynamic,Dynamic> values; - int m_rows; + public: + Matrix<Index,Dynamic,Dynamic> indexes; + Matrix<Scalar,Dynamic,Dynamic> values; + int m_rows; - USCMatrix() : m_rows(0) { } + USCMatrix() : m_rows(0) { } - template <typename Indexes, typename Values> - USCMatrix(Index rows, const MatrixBase<Indexes> &indexes, const MatrixBase<Values> &values) - : - indexes(indexes), - values(values), - m_rows(rows) - { } + template <typename Indexes, typename Values> + USCMatrix(Index rows, const MatrixBase<Indexes> &indexes, const MatrixBase<Values> &values) + : + indexes(indexes), + values(values), + m_rows(rows) + { } - USCMatrix(Index rows, Index nnz, Index cols) - : - indexes(Matrix<Index,Dynamic,Dynamic>(nnz, cols)), 
+ USCMatrix(Index rows, Index nnz, Index cols) + : + indexes(Matrix<Index,Dynamic,Dynamic>(nnz, cols)), values(Matrix<Scalar,Dynamic,Dynamic>(nnz, cols)), m_rows(rows) - { - this->indexes.fill(-1); - } - - Index rows() const { return m_rows; } - Index cols() const { return indexes.cols(); } - - void resize(Index rows, Index nnz, Index cols) { - indexes.resize(nnz, cols); - values.resize(nnz, cols); - m_rows = rows; - } + { + this->indexes.fill(-1); + } + + Index rows() const { return m_rows; } + Index cols() const { return indexes.cols(); } + + void resize(Index rows, Index nnz, Index cols) { + indexes.resize(nnz, cols); + values.resize(nnz, cols); + m_rows = rows; + } }; // Dense matrix - sparse matrix product // a is presumably very wide template <typename DerivedA, typename ScalarB, typename Index, typename DerivedC> -void uscgemm(double alpha, const MatrixBase<DerivedA> &a, - const USCMatrix<ScalarB,Index> &b, - const MatrixBase<DerivedC> &c_const) +void uscgemm(double alpha, const MatrixBase<DerivedA> &a, + const USCMatrix<ScalarB,Index> &b, + const MatrixBase<DerivedC> &c_const) { - UNCONST(DerivedC, c_const, c); - eigen_assert(a.rows() == c.rows()); - eigen_assert(a.cols() == b.rows()); - eigen_assert(b.cols() == c.cols()); - - #pragma omp parallel for - for (Index k=0; k<b.cols(); k++) - for (Index r=0; r<b.indexes.rows(); r++) - { - Index j = b.indexes(r,k); - eigen_assert(j >= 0); - eigen_assert(j < a.cols()); - c.col(k) += alpha * a.col(j) * b.values(r,k); - } + UNCONST(DerivedC, c_const, c); + eigen_assert(a.rows() == c.rows()); + eigen_assert(a.cols() == b.rows()); + eigen_assert(b.cols() == c.cols()); + +#pragma omp parallel for + for (Index k=0; k<b.cols(); k++) + for (Index r=0; r<b.indexes.rows(); r++) + { + Index j = b.indexes(r,k); + eigen_assert(j >= 0); + eigen_assert(j < a.cols()); + c.col(k) += alpha * a.col(j) * b.values(r,k); + } } // sparse matrix - dense matrix product template <typename ScalarA, typename Index, typename DerivedB, typename DerivedC> -void uscgemm(double alpha, - const USCMatrix<ScalarA,Index> &a, - const MatrixBase<DerivedB> &b, - const MatrixBase<DerivedC> &c_const) +void uscgemm(double alpha, + const USCMatrix<ScalarA,Index> &a, + const MatrixBase<DerivedB> &b, + const MatrixBase<DerivedC> &c_const) { - UNCONST(DerivedC, c_const, c); - eigen_assert(a.rows() == c.rows()); - eigen_assert(a.cols() == b.rows()); - eigen_assert(b.cols() == c.cols()); - - // This needs to be tuned for each system, unfortunately, - // and seems to vary a lot. A lot. - int i_blocks = omp_get_num_threads()*16; - - // Assume only one block in k direction. - // We don't need to explicitly block in the j direction. - #pragma omp parallel for - for (Index ib=0; ib<i_blocks; ib++) - for (Index j=0; j<a.cols(); j++) - for (Index r=0; r<a.indexes.rows(); r++) - { - Index i = a.indexes(r,j); - eigen_assert(i >= 0); - eigen_assert(i < c.rows()); - if (i % i_blocks == ib) - c.row(i) += alpha * a.values(r,j) * b.row(j); - } - - /* + UNCONST(DerivedC, c_const, c); + eigen_assert(a.rows() == c.rows()); + eigen_assert(a.cols() == b.rows()); + eigen_assert(b.cols() == c.cols()); + + // This needs to be tuned for each system, unfortunately, + // and seems to vary a lot. A lot. + int i_blocks = omp_get_num_threads()*16; + + // Assume only one block in k direction. + // We don't need to explicitly block in the j direction. 
+#pragma omp parallel for + for (Index ib=0; ib<i_blocks; ib++) + for (Index j=0; j<a.cols(); j++) + for (Index r=0; r<a.indexes.rows(); r++) + { + Index i = a.indexes(r,j); + eigen_assert(i >= 0); + eigen_assert(i < c.rows()); + if (i % i_blocks == ib) + c.row(i) += alpha * a.values(r,j) * b.row(j); + } + + /* If c.cols() is really large, then theoretically it seems like we should do: parallel for blocks in i direction - for blocks in j direction - pack block of a into smaller sparse matrix - for blocks in k direction - for k - for i (sparse) - for j - c(i,k) += a(i,j) * b(j,k) + for blocks in j direction + pack block of a into smaller sparse matrix + for blocks in k direction + for k + for i (sparse) + for j + c(i,k) += a(i,j) * b(j,k) However, the copying of blocks of a doesn't seem practical for any realistic sizes of c.cols(). - */ + */ } // Dense matrix - dense matrix product, but masked by a sparse matrix, @@ -147,45 +146,45 @@ void uscgemm(double alpha, template <typename DerivedA, typename DerivedB, typename ScalarC, typename Index> void uscgemm_masked(double alpha, - const MatrixBase<DerivedA> &a, - const MatrixBase<DerivedB> &b, - USCMatrix<ScalarC,Index> &c) + const MatrixBase<DerivedA> &a, + const MatrixBase<DerivedB> &b, + USCMatrix<ScalarC,Index> &c) { - eigen_assert(a.rows() == c.rows()); - eigen_assert(a.cols() == b.rows()); - eigen_assert(b.cols() == c.cols()); - - #pragma omp parallel for - for (Index k=0; k<b.cols(); k++) - for (Index r=0; r<c.indexes.rows(); r++) - { - Index i = c.indexes(r, k); - eigen_assert(i >= 0); - eigen_assert(i < a.rows()); - c.values(r, k) += alpha * a.row(i) * b.col(k); - } + eigen_assert(a.rows() == c.rows()); + eigen_assert(a.cols() == b.rows()); + eigen_assert(b.cols() == c.cols()); + +#pragma omp parallel for + for (Index k=0; k<b.cols(); k++) + for (Index r=0; r<c.indexes.rows(); r++) + { + Index i = c.indexes(r, k); + eigen_assert(i >= 0); + eigen_assert(i < a.rows()); + c.values(r, k) += alpha * a.row(i) * b.col(k); + } } // sparse matrix - dense vector product template <typename ScalarA, typename Index, typename DerivedB, typename DerivedC> -void uscgemv(double alpha, - const USCMatrix<ScalarA,Index> &a, - const MatrixBase<DerivedB> &b, - const MatrixBase<DerivedC> &c_const) +void uscgemv(double alpha, + const USCMatrix<ScalarA,Index> &a, + const MatrixBase<DerivedB> &b, + const MatrixBase<DerivedC> &c_const) { - UNCONST(DerivedC, c_const, c); - eigen_assert(a.rows() == c.rows()); - eigen_assert(a.cols() == b.rows()); - eigen_assert(b.cols() == 1 && c.cols() == 1); - - for (Index j=0; j<a.cols(); j++) - for (Index r=0; r<a.indexes.rows(); r++) - { - Index i = a.indexes(r,j); - eigen_assert(i >= 0); - eigen_assert(i < c.rows()); - c(i) += alpha * a.values(r,j) * b(j); - } + UNCONST(DerivedC, c_const, c); + eigen_assert(a.rows() == c.rows()); + eigen_assert(a.cols() == b.rows()); + eigen_assert(b.cols() == 1 && c.cols() == 1); + + for (Index j=0; j<a.cols(); j++) + for (Index r=0; r<a.indexes.rows(); r++) + { + Index i = a.indexes(r,j); + eigen_assert(i >= 0); + eigen_assert(i < c.rows()); + c(i) += alpha * a.values(r,j) * b(j); + } } } diff --git a/src/find_string.hpp b/src/find_string.hpp new file mode 100644 index 0000000..d26f6fe --- /dev/null +++ b/src/find_string.hpp @@ -0,0 +1,89 @@ +/** \file \author Jonathan Graehl <graehl@gmail.com> + + find_string(boost::unordered_map<std::string, ...>, pair<char const*, char + const*>) pair is [begin, end), a key: map.find(std:string(key.first, + key.second)) read-only since unordered_map 
doesn't support lazy construction + of string from a pair key. + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. +*/ + +#ifndef FIND_STRING_GRAEHL_2015_06_24_HPP +#define FIND_STRING_GRAEHL_2015_06_24_HPP +#pragma once + +#include <utility> +#include <algorithm> +#include <cstddef> +#include <boost/functional/hash.hpp> + +namespace std { +/// we do not change standard semantics of any supported comparison e.g. pair vs +/// pair, but simply allow string to be compared against pair of char pointers. +inline bool operator==(std::string const& str, std::pair<char const*, char const*> slice) { + return str.size() == (slice.second - slice.first) && std::equal(slice.first, slice.second, str.begin()); +} +inline bool operator==(std::pair<char const*, char const*> slice, std::string const& str) { + return str.size() == (slice.second - slice.first) && std::equal(slice.first, slice.second, str.begin()); +} +inline bool operator==(std::string const& str, std::pair<char*, char*> slice) { + return str.size() == (slice.second - slice.first) && std::equal(slice.first, slice.second, str.begin()); +} +inline bool operator==(std::pair<char*, char*> slice, std::string const& str) { + return str.size() == (slice.second - slice.first) && std::equal(slice.first, slice.second, str.begin()); +} +/// techinically not allowed but easiest route to ADL. we could rename these instead. +inline std::size_t hash_value(std::pair<char const*, char const*> slice) { + return boost::hash_range(slice.first, slice.second); +} +inline std::size_t hash_value(std::pair<char*, char*> slice) { + return boost::hash_range(slice.first, slice.second); +} +inline std::size_t hash_value(std::string const& str) { + return boost::hash_range(str.begin(), str.end()); +} +} + +struct slice_or_string_eq { + typedef bool result_type; + template <class A, class B> + bool operator()(A const& a, B const& b) const { + return a == b; + } +}; + +struct slice_or_string_hash { + typedef std::size_t result_type; + template <class Slice> + std::size_t operator()(Slice const& slice) const { + return hash_value(slice); + } +}; + +/// \return map.find(std:string(key.first, key.second)) but faster +template <class UnorderedMap, class Slice> +typename UnorderedMap::const_iterator find_string(UnorderedMap const& map, Slice const& key) { + return map.find(key, slice_or_string_hash(), slice_or_string_eq()); +} + +/// \return map.find(std:string(key.first, key.second)) but faster +template <class UnorderedMap, class Slice> +typename UnorderedMap::iterator find_string(UnorderedMap& map, Slice const& key) { + return map.find(key, slice_or_string_hash(), slice_or_string_eq()); +} + + +template <class UnorderedMap, class Slice> +typename UnorderedMap::const_iterator find_string(UnorderedMap const& map, char const* key) { + return map.find(std::pair<char const*, char const*>(key, key+std::strlen(key)), slice_or_string_hash(), slice_or_string_eq()); +} + +/// \return map.find(std:string(key.first, key.second)) but faster +template <class UnorderedMap, class Slice> +typename UnorderedMap::iterator find_string(UnorderedMap& map, char const* key) { + return map.find(std::pair<char const*, char const*>(key, key+std::strlen(key)), slice_or_string_hash(), slice_or_string_eq()); +} + +#endif diff --git a/src/graphClasses.h b/src/graphClasses.h index d3c0c4a..cd80a4c 100644 --- a/src/graphClasses.h +++ 
b/src/graphClasses.h @@ -3,7 +3,6 @@ #include <cstdlib> #include "neuralClasses.h" -//#include <../3rdparty/Eigen/Dense> #include <Eigen/Dense> namespace nplm @@ -11,50 +10,50 @@ namespace nplm template <class X> class Node { - public: - X * param; //what parameter is this - //vector <void *> children; - //vector <void *> parents; - Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> fProp_matrix; - Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> bProp_matrix; - int minibatch_size; - - public: - Node() : param(NULL), minibatch_size(0) { } - - Node(X *input_param, int minibatch_size) - : param(input_param), - minibatch_size(minibatch_size) - { - resize(minibatch_size); - } - - void resize(int minibatch_size) - { - this->minibatch_size = minibatch_size; - if (param->n_outputs() != -1) - { - fProp_matrix.setZero(param->n_outputs(), minibatch_size); - } - if (param->n_inputs() != -1) - { - bProp_matrix.setZero(param->n_inputs(), minibatch_size); - } - } - - void resize() { resize(minibatch_size); } - - /* - void Fprop(Matrix<double,Dynamic,Dynamic> & input,int n_cols) - { - param->fProp(input,fProp_matrix,0,0,n_cols); - } - void Fprop(Matrix<double,1,Dynamic> & input,int n_cols) - { - param->fProp(input,fProp_matrix,0,0,n_cols); - } - */ - //for f prop, just call the fProp node of the particular parameter. + public: + X * param; //what parameter is this + //vector <void *> children; + //vector <void *> parents; + Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> fProp_matrix; + Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> bProp_matrix; + int minibatch_size; + + public: + Node() : param(NULL), minibatch_size(0) { } + + Node(X *input_param, int minibatch_size) + : param(input_param), + minibatch_size(minibatch_size) + { + resize(minibatch_size); + } + + void resize(int minibatch_size) + { + this->minibatch_size = minibatch_size; + if (param->n_outputs() != -1) + { + fProp_matrix.setZero(param->n_outputs(), minibatch_size); + } + if (param->n_inputs() != -1) + { + bProp_matrix.setZero(param->n_inputs(), minibatch_size); + } + } + + void resize() { resize(minibatch_size); } + + /* + void Fprop(Matrix<double,Dynamic,Dynamic> & input,int n_cols) + { + param->fProp(input,fProp_matrix,0,0,n_cols); + } + void Fprop(Matrix<double,1,Dynamic> & input,int n_cols) + { + param->fProp(input,fProp_matrix,0,0,n_cols); + } + */ + //for f prop, just call the fProp node of the particular parameter. 
}; diff --git a/src/model.cpp b/src/model.cpp index 3767f4b..db7f006 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -13,295 +13,295 @@ namespace nplm { void model::resize(int ngram_size, - int input_vocab_size, - int output_vocab_size, - int input_embedding_dimension, - int num_hidden, - int output_embedding_dimension) + int input_vocab_size, + int output_vocab_size, + int input_embedding_dimension, + int num_hidden, + int output_embedding_dimension) { - input_layer.resize(input_vocab_size, input_embedding_dimension, ngram_size-1); - if (num_hidden == 0) - { - first_hidden_linear.resize(output_embedding_dimension, input_embedding_dimension*(ngram_size-1)); - first_hidden_activation.resize(output_embedding_dimension); - second_hidden_linear.resize(1,1); - second_hidden_activation.resize(1); - } - else - { - first_hidden_linear.resize(num_hidden, input_embedding_dimension*(ngram_size-1)); - first_hidden_activation.resize(num_hidden); - second_hidden_linear.resize(output_embedding_dimension, num_hidden); - second_hidden_activation.resize(output_embedding_dimension); - } - output_layer.resize(output_vocab_size, output_embedding_dimension); - this->ngram_size = ngram_size; - this->input_vocab_size = input_vocab_size; - this->output_vocab_size = output_vocab_size; - this->input_embedding_dimension = input_embedding_dimension; - this->num_hidden = num_hidden; - this->output_embedding_dimension = output_embedding_dimension; - premultiplied = false; + input_layer.resize(input_vocab_size, input_embedding_dimension, ngram_size-1); + if (num_hidden == 0) + { + first_hidden_linear.resize(output_embedding_dimension, input_embedding_dimension*(ngram_size-1)); + first_hidden_activation.resize(output_embedding_dimension); + second_hidden_linear.resize(1,1); + second_hidden_activation.resize(1); + } + else + { + first_hidden_linear.resize(num_hidden, input_embedding_dimension*(ngram_size-1)); + first_hidden_activation.resize(num_hidden); + second_hidden_linear.resize(output_embedding_dimension, num_hidden); + second_hidden_activation.resize(output_embedding_dimension); + } + output_layer.resize(output_vocab_size, output_embedding_dimension); + this->ngram_size = ngram_size; + this->input_vocab_size = input_vocab_size; + this->output_vocab_size = output_vocab_size; + this->input_embedding_dimension = input_embedding_dimension; + this->num_hidden = num_hidden; + this->output_embedding_dimension = output_embedding_dimension; + premultiplied = false; } - -void model::initialize(mt19937 &init_engine, - bool init_normal, - double init_range, - double init_bias, - string ¶meter_update, - double adagrad_epsilon) + +void model::initialize(boost::random::mt19937 &init_engine, + bool init_normal, + double init_range, + double init_bias, + string ¶meter_update, + double adagrad_epsilon) { - input_layer.initialize(init_engine, - init_normal, - init_range, - parameter_update, - adagrad_epsilon); - output_layer.initialize(init_engine, - init_normal, - init_range, - init_bias, - parameter_update, - adagrad_epsilon); - first_hidden_linear.initialize(init_engine, - init_normal, - init_range, - parameter_update, - adagrad_epsilon); - second_hidden_linear.initialize(init_engine, - init_normal, - init_range, - parameter_update, - adagrad_epsilon); + input_layer.initialize(init_engine, + init_normal, + init_range, + parameter_update, + adagrad_epsilon); + output_layer.initialize(init_engine, + init_normal, + init_range, + init_bias, + parameter_update, + adagrad_epsilon); + first_hidden_linear.initialize(init_engine, + 
init_normal, + init_range, + parameter_update, + adagrad_epsilon); + second_hidden_linear.initialize(init_engine, + init_normal, + init_range, + parameter_update, + adagrad_epsilon); } void model::premultiply() { - // Since input and first_hidden_linear are both linear, - // we can multiply them into a single linear layer *if* we are not training - int context_size = ngram_size-1; - Matrix<double,Dynamic,Dynamic> U = first_hidden_linear.U; - if (num_hidden == 0) - { - first_hidden_linear.U.resize(output_embedding_dimension, input_vocab_size * context_size); - } - else - { - first_hidden_linear.U.resize(num_hidden, input_vocab_size * context_size); - } - for (int i=0; i<context_size; i++) - first_hidden_linear.U.middleCols(i*input_vocab_size, input_vocab_size) = U.middleCols(i*input_embedding_dimension, input_embedding_dimension) * input_layer.W->transpose(); - input_layer.W->resize(1,1); // try to save some memory - premultiplied = true; + // Since input and first_hidden_linear are both linear, + // we can multiply them into a single linear layer *if* we are not training + int context_size = ngram_size-1; + Matrix<double,Dynamic,Dynamic> U = first_hidden_linear.U; + if (num_hidden == 0) + { + first_hidden_linear.U.resize(output_embedding_dimension, input_vocab_size * context_size); + } + else + { + first_hidden_linear.U.resize(num_hidden, input_vocab_size * context_size); + } + for (int i=0; i<context_size; i++) + first_hidden_linear.U.middleCols(i*input_vocab_size, input_vocab_size) = U.middleCols(i*input_embedding_dimension, input_embedding_dimension) * input_layer.W->transpose(); + input_layer.W->resize(1,1); // try to save some memory + premultiplied = true; } void model::readConfig(ifstream &config_file) { - string line; - vector<string> fields; - int ngram_size, vocab_size, input_embedding_dimension, num_hidden, output_embedding_dimension; - activation_function_type activation_function = this->activation_function; - while (getline(config_file, line) && line != "") + string line; + vector<string> fields; + int ngram_size, vocab_size, input_embedding_dimension, num_hidden, output_embedding_dimension; + activation_function_type activation_function = this->activation_function; + while (getline(config_file, line) && line != "") + { + splitBySpace(line, fields); + if (fields[0] == "ngram_size") + ngram_size = lexical_cast<int>(fields[1]); + else if (fields[0] == "vocab_size") + input_vocab_size = output_vocab_size = lexical_cast<int>(fields[1]); + else if (fields[0] == "input_vocab_size") + input_vocab_size = lexical_cast<int>(fields[1]); + else if (fields[0] == "output_vocab_size") + output_vocab_size = lexical_cast<int>(fields[1]); + else if (fields[0] == "input_embedding_dimension") + input_embedding_dimension = lexical_cast<int>(fields[1]); + else if (fields[0] == "num_hidden") + num_hidden = lexical_cast<int>(fields[1]); + else if (fields[0] == "output_embedding_dimension") + output_embedding_dimension = lexical_cast<int>(fields[1]); + else if (fields[0] == "activation_function") + activation_function = string_to_activation_function(fields[1]); + else if (fields[0] == "version") { - splitBySpace(line, fields); - if (fields[0] == "ngram_size") - ngram_size = lexical_cast<int>(fields[1]); - else if (fields[0] == "vocab_size") - input_vocab_size = output_vocab_size = lexical_cast<int>(fields[1]); - else if (fields[0] == "input_vocab_size") - input_vocab_size = lexical_cast<int>(fields[1]); - else if (fields[0] == "output_vocab_size") - output_vocab_size = lexical_cast<int>(fields[1]); - 
else if (fields[0] == "input_embedding_dimension") - input_embedding_dimension = lexical_cast<int>(fields[1]); - else if (fields[0] == "num_hidden") - num_hidden = lexical_cast<int>(fields[1]); - else if (fields[0] == "output_embedding_dimension") - output_embedding_dimension = lexical_cast<int>(fields[1]); - else if (fields[0] == "activation_function") - activation_function = string_to_activation_function(fields[1]); - else if (fields[0] == "version") - { - int version = lexical_cast<int>(fields[1]); - if (version != 1) - { - cerr << "error: file format mismatch (expected 1, found " << version << ")" << endl; - exit(1); - } - } - else - cerr << "warning: unrecognized field in config: " << fields[0] << endl; + int version = lexical_cast<int>(fields[1]); + if (version != 1) + { + cerr << "error: file format mismatch (expected 1, found " << version << ")" << endl; + exit(1); + } } - resize(ngram_size, - input_vocab_size, - output_vocab_size, - input_embedding_dimension, - num_hidden, - output_embedding_dimension); - set_activation_function(activation_function); + else + cerr << "warning: unrecognized field in config: " << fields[0] << endl; + } + resize(ngram_size, + input_vocab_size, + output_vocab_size, + input_embedding_dimension, + num_hidden, + output_embedding_dimension); + set_activation_function(activation_function); } void model::readConfig(const string &filename) { - ifstream config_file(filename.c_str()); - if (!config_file) - { - cerr << "error: could not open config file " << filename << endl; - exit(1); - } - readConfig(config_file); - config_file.close(); + ifstream config_file(filename.c_str()); + if (!config_file) + { + cerr << "error: could not open config file " << filename << endl; + exit(1); + } + readConfig(config_file); + config_file.close(); } - + void model::read(const string &filename) { - vector<string> input_words; - vector<string> output_words; - read(filename, input_words, output_words); + vector<string> input_words; + vector<string> output_words; + read(filename, input_words, output_words); } void model::read(const string &filename, vector<string> &words) { - vector<string> output_words; - read(filename, words, output_words); + vector<string> output_words; + read(filename, words, output_words); } void model::read(const string &filename, vector<string> &input_words, vector<string> &output_words) { - ifstream file(filename.c_str()); - if (!file) throw runtime_error("Could not open file " + filename); - - param myParam; - string line; - - while (getline(file, line)) + ifstream file(filename.c_str()); + if (!file) throw runtime_error("Could not open file " + filename); + + param myParam; + string line; + + while (getline(file, line)) + { + if (line == "\\config") + { + readConfig(file); + } + + else if (line == "\\vocab") + { + input_words.clear(); + readWordsFile(file, input_words); + output_words = input_words; + } + + else if (line == "\\input_vocab") { - if (line == "\\config") - { - readConfig(file); - } - - else if (line == "\\vocab") - { - input_words.clear(); - readWordsFile(file, input_words); - output_words = input_words; - } - - else if (line == "\\input_vocab") - { - input_words.clear(); - readWordsFile(file, input_words); - } - - else if (line == "\\output_vocab") - { - output_words.clear(); - readWordsFile(file, output_words); - } - - else if (line == "\\input_embeddings") - input_layer.read(file); - else if (line == "\\hidden_weights 1") - first_hidden_linear.read_weights(file); - else if (line == "\\hidden_biases 1") - 
first_hidden_linear.read_biases (file); - else if (line == "\\hidden_weights 2") - second_hidden_linear.read_weights(file); - else if (line == "\\hidden_biases 2") - second_hidden_linear.read_biases (file); - else if (line == "\\output_weights") - output_layer.read_weights(file); - else if (line == "\\output_biases") - output_layer.read_biases(file); - else if (line == "\\end") - break; - else if (line == "") - continue; - else - { - cerr << "warning: unrecognized section: " << line << endl; - // skip over section - while (getline(file, line) && line != "") { } - } + input_words.clear(); + readWordsFile(file, input_words); } - file.close(); + + else if (line == "\\output_vocab") + { + output_words.clear(); + readWordsFile(file, output_words); + } + + else if (line == "\\input_embeddings") + input_layer.read(file); + else if (line == "\\hidden_weights 1") + first_hidden_linear.read_weights(file); + else if (line == "\\hidden_biases 1") + first_hidden_linear.read_biases (file); + else if (line == "\\hidden_weights 2") + second_hidden_linear.read_weights(file); + else if (line == "\\hidden_biases 2") + second_hidden_linear.read_biases (file); + else if (line == "\\output_weights") + output_layer.read_weights(file); + else if (line == "\\output_biases") + output_layer.read_biases(file); + else if (line == "\\end") + break; + else if (line == "") + continue; + else + { + cerr << "warning: unrecognized section: " << line << endl; + // skip over section + while (getline(file, line) && line != "") { } + } + } + file.close(); } void model::write(const string &filename, const vector<string> &input_words, const vector<string> &output_words) -{ - write(filename, &input_words, &output_words); +{ + write(filename, &input_words, &output_words); } void model::write(const string &filename, const vector<string> &words) -{ - write(filename, &words, NULL); +{ + write(filename, &words, NULL); } -void model::write(const string &filename) -{ - write(filename, NULL, NULL); +void model::write(const string &filename) +{ + write(filename, NULL, NULL); } void model::write(const string &filename, const vector<string> *input_pwords, const vector<string> *output_pwords) { - ofstream file(filename.c_str()); - if (!file) throw runtime_error("Could not open file " + filename); - - file << "\\config" << endl; - file << "version 1" << endl; - file << "ngram_size " << ngram_size << endl; - file << "input_vocab_size " << input_vocab_size << endl; - file << "output_vocab_size " << output_vocab_size << endl; - file << "input_embedding_dimension " << input_embedding_dimension << endl; - file << "num_hidden " << num_hidden << endl; - file << "output_embedding_dimension " << output_embedding_dimension << endl; - file << "activation_function " << activation_function_to_string(activation_function) << endl; - file << endl; - - if (input_pwords) - { - file << "\\input_vocab" << endl; - writeWordsFile(*input_pwords, file); - file << endl; - } + ofstream file(filename.c_str()); + if (!file) throw runtime_error("Could not open file " + filename); - if (output_pwords) - { - file << "\\output_vocab" << endl; - writeWordsFile(*output_pwords, file); - file << endl; - } + file << "\\config" << endl; + file << "version 1" << endl; + file << "ngram_size " << ngram_size << endl; + file << "input_vocab_size " << input_vocab_size << endl; + file << "output_vocab_size " << output_vocab_size << endl; + file << "input_embedding_dimension " << input_embedding_dimension << endl; + file << "num_hidden " << num_hidden << endl; + file << 
"output_embedding_dimension " << output_embedding_dimension << endl; + file << "activation_function " << activation_function_to_string(activation_function) << endl; + file << endl; - file << "\\input_embeddings" << endl; - input_layer.write(file); - file << endl; - - file << "\\hidden_weights 1" << endl; - first_hidden_linear.write_weights(file); + if (input_pwords) + { + file << "\\input_vocab" << endl; + writeWordsFile(*input_pwords, file); file << endl; + } - file << "\\hidden_biases 1" << endl; - first_hidden_linear.write_biases(file); - file <<endl; - - file << "\\hidden_weights 2" << endl; - second_hidden_linear.write_weights(file); + if (output_pwords) + { + file << "\\output_vocab" << endl; + writeWordsFile(*output_pwords, file); file << endl; + } - file << "\\hidden_biases 2" << endl; - second_hidden_linear.write_biases(file); - file << endl; - - file << "\\output_weights" << endl; - output_layer.write_weights(file); - file << endl; - - file << "\\output_biases" << endl; - output_layer.write_biases(file); - file << endl; - - file << "\\end" << endl; - file.close(); + file << "\\input_embeddings" << endl; + input_layer.write(file); + file << endl; + + file << "\\hidden_weights 1" << endl; + first_hidden_linear.write_weights(file); + file << endl; + + file << "\\hidden_biases 1" << endl; + first_hidden_linear.write_biases(file); + file <<endl; + + file << "\\hidden_weights 2" << endl; + second_hidden_linear.write_weights(file); + file << endl; + + file << "\\hidden_biases 2" << endl; + second_hidden_linear.write_biases(file); + file << endl; + + file << "\\output_weights" << endl; + output_layer.write_weights(file); + file << endl; + + file << "\\output_biases" << endl; + output_layer.write_biases(file); + file << endl; + + file << "\\end" << endl; + file.close(); } diff --git a/src/neuralClasses.h b/src/neuralClasses.h index 26dae06..458f80e 100644 --- a/src/neuralClasses.h +++ b/src/neuralClasses.h @@ -6,8 +6,7 @@ #include <cmath> #include <vector> -#include <boost/unordered_map.hpp> -//#include <../3rdparty/Eigen/Dense> +#include <boost/unordered_map.hpp> #include <Eigen/Dense> #include "maybe_omp.h" @@ -35,7 +34,7 @@ using Eigen::Dynamic; typedef boost::unordered_map<int,bool> int_map; struct Clipper{ - double operator() (double x) const { + double operator() (double x) const { return std::min(0.5, std::max(x,-0.5)); //return(x); } @@ -44,978 +43,997 @@ struct Clipper{ class Linear_layer { - private: - Matrix<double,Dynamic,Dynamic> U; - Matrix<double,Dynamic,Dynamic> U_gradient; - Matrix<double,Dynamic,Dynamic> U_velocity; - Matrix<double,Dynamic,Dynamic> U_running_gradient; - Matrix<double,Dynamic,Dynamic> U_running_parameter_update; - // Biases - Matrix<double,Dynamic,1> b; - Matrix<double,Dynamic,1> b_velocity; - Matrix<double,Dynamic,1> b_running_gradient; - Matrix<double,Dynamic,1> b_running_parameter_update; - Matrix<double,Dynamic,1> b_gradient; - - friend class model; - - public: - Linear_layer() { } - Linear_layer(int rows, int cols) { resize(rows, cols); } - - void resize(int rows, int cols) - { - U.setZero(rows, cols); - U_gradient.setZero(rows, cols); - //U_running_gradient.setZero(rows, cols); - //U_running_parameter_updates.setZero(rows, cols); - //U_velocity.setZero(rows, cols); - b.resize(rows); - b_gradient.setZero(rows); - //b_running_gradient.resize(rows); - //b_velocity.resize(rows); - } - - void read_weights(std::ifstream &U_file) { readMatrix(U_file, U); } - void write_weights(std::ofstream &U_file) { writeMatrix(U, U_file); } + private: + 
Matrix<double,Dynamic,Dynamic> U; + Matrix<double,Dynamic,Dynamic> U_gradient; + Matrix<double,Dynamic,Dynamic> U_velocity; + Matrix<double,Dynamic,Dynamic> U_running_gradient; + Matrix<double,Dynamic,Dynamic> U_running_parameter_update; + // Biases + Matrix<double,Dynamic,1> b; + Matrix<double,Dynamic,1> b_velocity; + Matrix<double,Dynamic,1> b_running_gradient; + Matrix<double,Dynamic,1> b_running_parameter_update; + Matrix<double,Dynamic,1> b_gradient; + + friend class model; + + public: + Linear_layer() { } + Linear_layer(int rows, int cols) { resize(rows, cols); } + + void resize(int rows, int cols) + { + U.setZero(rows, cols); + U_gradient.setZero(rows, cols); + //U_running_gradient.setZero(rows, cols); + //U_running_parameter_updates.setZero(rows, cols); + //U_velocity.setZero(rows, cols); + b.resize(rows); + b_gradient.setZero(rows); + //b_running_gradient.resize(rows); + //b_velocity.resize(rows); + } + + void read_weights(std::ifstream &U_file) { readMatrix(U_file, U); } + void write_weights(std::ofstream &U_file) { writeMatrix(U, U_file); } void read_biases(std::ifstream &b_file) { readMatrix(b_file, b); } void write_biases(std::ofstream &b_file) { writeMatrix(b, b_file); } - template <typename Engine> - void initialize(Engine &engine, - bool init_normal, - double init_range, - string ¶meter_update, - double adagrad_epsilon) - { - if (parameter_update == "ADA") { - U_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(U.rows(),U.cols())*adagrad_epsilon; - b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon; - } - if (parameter_update == "ADAD") { - U_running_gradient.setZero(U.rows(),U.cols()); - b_running_gradient.setZero(b.size()); - U_running_parameter_update.setZero(U.rows(),U.cols()); - b_running_parameter_update.setZero(b.size()); - } + template <typename Engine> + void initialize(Engine &engine, + bool init_normal, + double init_range, + string ¶meter_update, + double adagrad_epsilon) + { + if (parameter_update == "ADA") { + U_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(U.rows(),U.cols())*adagrad_epsilon; + b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon; + } + if (parameter_update == "ADAD") { + U_running_gradient.setZero(U.rows(),U.cols()); + b_running_gradient.setZero(b.size()); + U_running_parameter_update.setZero(U.rows(),U.cols()); + b_running_parameter_update.setZero(b.size()); + } - initMatrix(engine, U, init_normal, init_range); - initBias(engine, b, init_normal, init_range); - } + initMatrix(engine, U, init_normal, init_range); + initBias(engine, b, init_normal, init_range); + } - int n_inputs () const { return U.cols(); } - int n_outputs () const { return U.rows(); } + int n_inputs () const { return U.cols(); } + int n_outputs () const { return U.rows(); } template <typename DerivedIn, typename DerivedOut> - void fProp(const MatrixBase<DerivedIn> &input, - const MatrixBase<DerivedOut> &output) const + void fProp(const MatrixBase<DerivedIn> &input, + const MatrixBase<DerivedOut> &output) const { - UNCONST(DerivedOut, output, my_output); - my_output.leftCols(input.cols()).noalias() = U*input; - int num_examples = input.cols(); - for (int example = 0;example < num_examples;example++) - { - my_output.leftCols(input.cols()).col(example) += b; - } + UNCONST(DerivedOut, output, my_output); + my_output.leftCols(input.cols()).noalias() = U*input; + int num_examples = input.cols(); + for (int example = 0;example < num_examples;example++) + { + my_output.leftCols(input.cols()).col(example) += b; + } } 
- // Sparse input + // Sparse input template <typename ScalarIn, typename DerivedOut> - void fProp(const USCMatrix<ScalarIn> &input, - const MatrixBase<DerivedOut> &output_const) const - { - UNCONST(DerivedOut, output_const, output); - output.setZero(); - uscgemm(1.0, U, input, output.leftCols(input.cols())); - // Each column corresponds to a training example. We - // parallelize the adding of biases per dimension. - int num_examples = input.cols(); - for (int example = 0;example < num_examples;example++) - { - output.leftCols(input.cols()).col(example) += b; - } + void fProp(const USCMatrix<ScalarIn> &input, + const MatrixBase<DerivedOut> &output_const) const + { + UNCONST(DerivedOut, output_const, output); + output.setZero(); + uscgemm(1.0, U, input, output.leftCols(input.cols())); + // Each column corresponds to a training example. We + // parallelize the adding of biases per dimension. + int num_examples = input.cols(); + for (int example = 0;example < num_examples;example++) + { + output.leftCols(input.cols()).col(example) += b; + } } template <typename DerivedGOut, typename DerivedGIn> - void bProp(const MatrixBase<DerivedGOut> &input, - MatrixBase<DerivedGIn> &output) const + void bProp(const MatrixBase<DerivedGOut> &input, + MatrixBase<DerivedGIn> &output) const { - UNCONST(DerivedGIn, output, my_output); - my_output.noalias() = U.transpose()*input; - } + UNCONST(DerivedGIn, output, my_output); + my_output.noalias() = U.transpose()*input; + } template <typename DerivedGOut, typename DerivedIn> - void computeGradient( const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &fProp_input, - double learning_rate, double momentum, double L2_reg) - { - U_gradient.noalias() = bProp_input*fProp_input.transpose(); - - // get the bias gradient for all dimensions in parallel - int size = b.size(); - b_gradient = bProp_input.rowwise().sum(); - // This used to be multithreaded, but there was no measureable difference - if (L2_reg > 0.0) - { - U_gradient -= 2*L2_reg*U; - b_gradient -= 2*L2_reg*b; - } - if (momentum > 0.0) - { - U_velocity = momentum*U_velocity + U_gradient; - U += learning_rate * U_velocity; - b_velocity = momentum*b_velocity + b_gradient; - b += learning_rate * b_velocity; - } - else - { - U += learning_rate * U_gradient; - b += learning_rate * b_gradient; - /* - //UPDATE CLIPPING - U += (learning_rate*U_gradient).array().unaryExpr(Clipper()).matrix(); - b += (learning_rate*b_gradient).array().unaryExpr(Clipper()).matrix(); - //GRADIENT CLIPPING - //U += learning_rate*(U_gradient.array().unaryExpr(Clipper())).matrix(); - //b += learning_rate*(b_gradient.array().unaryExpr(Clipper())).matrix(); - */ - } - } + void computeGradient( const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &fProp_input, + double learning_rate, double momentum, double L2_reg) + { + U_gradient.noalias() = bProp_input*fProp_input.transpose(); + + // get the bias gradient for all dimensions in parallel + int size = b.size(); + b_gradient = bProp_input.rowwise().sum(); + // This used to be multithreaded, but there was no measureable difference + if (L2_reg > 0.0) + { + U_gradient -= 2*L2_reg*U; + b_gradient -= 2*L2_reg*b; + } + if (momentum > 0.0) + { + U_velocity = momentum*U_velocity + U_gradient; + U += learning_rate * U_velocity; + b_velocity = momentum*b_velocity + b_gradient; + b += learning_rate * b_velocity; + } + else + { + U += learning_rate * U_gradient; + b += learning_rate * b_gradient; + /* + //UPDATE CLIPPING + U += 
(learning_rate*U_gradient).array().unaryExpr(Clipper()).matrix(); + b += (learning_rate*b_gradient).array().unaryExpr(Clipper()).matrix(); + //GRADIENT CLIPPING + //U += learning_rate*(U_gradient.array().unaryExpr(Clipper())).matrix(); + //b += learning_rate*(b_gradient.array().unaryExpr(Clipper())).matrix(); + */ + } + } template <typename DerivedGOut, typename DerivedIn> - void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &fProp_input, - double learning_rate, - double L2_reg) + void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &fProp_input, + double learning_rate, + double L2_reg) { - U_gradient.noalias() = bProp_input*fProp_input.transpose(); + U_gradient.noalias() = bProp_input*fProp_input.transpose(); - - // get the bias gradient for all dimensions in parallel - int size = b.size(); - b_gradient.noalias() = bProp_input.rowwise().sum(); - if (L2_reg != 0) - { - U_gradient -= 2*L2_reg*U; - b_gradient -= 2*L2_reg*b; - } + // get the bias gradient for all dimensions in parallel + int size = b.size(); + b_gradient.noalias() = bProp_input.rowwise().sum(); - // ignore momentum? - #pragma omp parallel for - for (int col=0; col<U.cols(); col++) { - U_running_gradient.col(col) += U_gradient.col(col).array().square().matrix(); - U.col(col) += learning_rate * (U_gradient.col(col).array() / - U_running_gradient.col(col).array().sqrt()).matrix(); - /* - //UPDATE CLIPPING - U.col(col) += (learning_rate * (U_gradient.col(col).array() / U_running_gradient.col(col).array().sqrt())). - unaryExpr(Clipper()).matrix(); - */ - } - b_running_gradient += b_gradient.array().square().matrix(); - b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix(); + if (L2_reg != 0) + { + U_gradient -= 2*L2_reg*U; + b_gradient -= 2*L2_reg*b; + } + + // ignore momentum? +#pragma omp parallel for + for (int col=0; col<U.cols(); col++) { + U_running_gradient.col(col) += U_gradient.col(col).array().square().matrix(); + U.col(col) += learning_rate * (U_gradient.col(col).array() / + U_running_gradient.col(col).array().sqrt()).matrix(); /* //UPDATE CLIPPING - b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); + U.col(col) += (learning_rate * (U_gradient.col(col).array() / U_running_gradient.col(col).array().sqrt())). 
+ unaryExpr(Clipper()).matrix(); */ + } + b_running_gradient += b_gradient.array().square().matrix(); + b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix(); + /* + //UPDATE CLIPPING + b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); + */ } template <typename DerivedGOut, typename DerivedIn> - void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &fProp_input, - double learning_rate, - double L2_reg, - double conditioning_constant, - double decay) - { - //cerr<<"decay is "<<decay<<" and conditioning constant is "<<conditioning_constant<<endl; - U_gradient.noalias() = bProp_input*fProp_input.transpose(); - - Array<double,Dynamic,1> b_current_parameter_update; - - // get the bias gradient for all dimensions in parallel - int size = b.size(); - b_gradient.noalias() = bProp_input.rowwise().sum(); - - if (L2_reg != 0) - { - U_gradient -= 2*L2_reg*U; - b_gradient -= 2*L2_reg*b; - } + void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &fProp_input, + double learning_rate, + double L2_reg, + double conditioning_constant, + double decay) + { + //cerr<<"decay is "<<decay<<" and conditioning constant is "<<conditioning_constant<<endl; + U_gradient.noalias() = bProp_input*fProp_input.transpose(); - // ignore momentum? - #pragma omp parallel for - //cerr<<"U gradient is "<<U_gradient<<endl; - for (int col=0; col<U.cols(); col++) { - Array<double,Dynamic,1> U_current_parameter_update; - U_running_gradient.col(col) = decay*U_running_gradient.col(col) + - (1-decay)*U_gradient.col(col).array().square().matrix(); - //cerr<<"U running gradient is "<<U_running_gradient.col(col)<<endl; - //getchar(); - U_current_parameter_update = ((U_running_parameter_update.col(col).array()+conditioning_constant).sqrt()/ - (U_running_gradient.col(col).array()+conditioning_constant).sqrt()) * - U_gradient.col(col).array(); - //cerr<<"U current parameter update is "<<U_current_parameter_update<<endl; - //getchar(); - //update the running parameter update - U_running_parameter_update.col(col) = decay*U_running_parameter_update.col(col) + - (1.-decay)*U_current_parameter_update.square().matrix(); - U.col(col) += learning_rate*U_current_parameter_update.matrix(); - } - b_running_gradient = decay*b_running_gradient + - (1.-decay)*b_gradient.array().square().matrix(); - b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/ - (b_running_gradient.array()+conditioning_constant).sqrt()) * - b_gradient.array(); - b_running_parameter_update = decay*(b_running_parameter_update) + - (1.-decay)*b_current_parameter_update.square().matrix(); - b += learning_rate*b_current_parameter_update.matrix(); + Array<double,Dynamic,1> b_current_parameter_update; + + // get the bias gradient for all dimensions in parallel + int size = b.size(); + b_gradient.noalias() = bProp_input.rowwise().sum(); + + if (L2_reg != 0) + { + U_gradient -= 2*L2_reg*U; + b_gradient -= 2*L2_reg*b; + } + + // ignore momentum? 
+#pragma omp parallel for + //cerr<<"U gradient is "<<U_gradient<<endl; + for (int col=0; col<U.cols(); col++) { + Array<double,Dynamic,1> U_current_parameter_update; + U_running_gradient.col(col) = decay*U_running_gradient.col(col) + + (1-decay)*U_gradient.col(col).array().square().matrix(); + //cerr<<"U running gradient is "<<U_running_gradient.col(col)<<endl; + //getchar(); + U_current_parameter_update = ((U_running_parameter_update.col(col).array()+conditioning_constant).sqrt()/ + (U_running_gradient.col(col).array()+conditioning_constant).sqrt()) * + U_gradient.col(col).array(); + //cerr<<"U current parameter update is "<<U_current_parameter_update<<endl; + //getchar(); + //update the running parameter update + U_running_parameter_update.col(col) = decay*U_running_parameter_update.col(col) + + (1.-decay)*U_current_parameter_update.square().matrix(); + U.col(col) += learning_rate*U_current_parameter_update.matrix(); + } + b_running_gradient = decay*b_running_gradient + + (1.-decay)*b_gradient.array().square().matrix(); + b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/ + (b_running_gradient.array()+conditioning_constant).sqrt()) * + b_gradient.array(); + b_running_parameter_update = decay*(b_running_parameter_update) + + (1.-decay)*b_current_parameter_update.square().matrix(); + b += learning_rate*b_current_parameter_update.matrix(); } template <typename DerivedGOut, typename DerivedIn, typename DerivedGW> - void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &fProp_input, - const MatrixBase<DerivedGW> &gradient) const + void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &fProp_input, + const MatrixBase<DerivedGW> &gradient) const { - UNCONST(DerivedGW, gradient, my_gradient); - my_gradient.noalias() = bProp_input*fProp_input.transpose(); + UNCONST(DerivedGW, gradient, my_gradient); + my_gradient.noalias() = bProp_input*fProp_input.transpose(); } }; class Output_word_embeddings { - private: - // row-major is better for uscgemm - //Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W; - // Having W be a pointer to a matrix allows ease of sharing - // input and output word embeddings - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W; - std::vector<double> W_data; - Matrix<double,Dynamic,1> b; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update; - Matrix<double,Dynamic,1> b_running_gradient; - Matrix<double,Dynamic,1> b_gradient; - Matrix<double,Dynamic,1> b_running_parameter_update; - - public: - Output_word_embeddings() { } - Output_word_embeddings(int rows, int cols) { resize(rows, cols); } - - void resize(int rows, int cols) - { - W->setZero(rows, cols); - b.setZero(rows); - } - void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) { - W = input_W; - } - void read_weights(std::ifstream &W_file) { readMatrix(W_file, *W); } - void write_weights(std::ofstream &W_file) { writeMatrix(*W, W_file); } - void read_biases(std::ifstream &b_file) { readMatrix(b_file, b); } - void write_biases(std::ofstream &b_file) { writeMatrix(b, b_file); } - - template <typename Engine> - void initialize(Engine &engine, - bool init_normal, - double init_range, - double init_bias, - string ¶meter_update, - double adagrad_epsilon) - { + private: + // row-major is better for uscgemm + 
//Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W; + // Having W be a pointer to a matrix allows ease of sharing + // input and output word embeddings + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W; + std::vector<double> W_data; + Matrix<double,Dynamic,1> b; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update; + Matrix<double,Dynamic,1> b_running_gradient; + Matrix<double,Dynamic,1> b_gradient; + Matrix<double,Dynamic,1> b_running_parameter_update; + + public: + Output_word_embeddings() { } + Output_word_embeddings(int rows, int cols) { resize(rows, cols); } + + void resize(int rows, int cols) + { + W->setZero(rows, cols); + b.setZero(rows); + } + void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) { + W = input_W; + } + void read_weights(std::ifstream &W_file) { readMatrix(W_file, *W); } + void write_weights(std::ofstream &W_file) { writeMatrix(*W, W_file); } + void read_biases(std::ifstream &b_file) { readMatrix(b_file, b); } + void write_biases(std::ofstream &b_file) { writeMatrix(b, b_file); } - W_gradient.setZero(W->rows(),W->cols()); - b_gradient.setZero(b.size()); - if (parameter_update == "ADA") { - W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon; - b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon; - //W_gradient.setZero(W->rows(),W->cols()); - //b_gradient.setZero(b.size()); - } - if (parameter_update == "ADAD") { - W_running_gradient.setZero(W->rows(),W->cols()); - b_running_gradient.setZero(b.size()); - W_gradient.setZero(W->rows(),W->cols()); - //b_gradient.setZero(b.size()); - //W_running_parameter_update.setZero(W->rows(),W->cols()); - b_running_parameter_update.setZero(b.size()); - } - - initMatrix(engine, *W, init_normal, init_range); - b.fill(init_bias); - } - - int n_inputs () const { return W->cols(); } - int n_outputs () const { return W->rows(); } - - template <typename DerivedIn, typename DerivedOut> - void fProp(const MatrixBase<DerivedIn> &input, - const MatrixBase<DerivedOut> &output) const - { - UNCONST(DerivedOut, output, my_output); - my_output = ((*W) * input).colwise() + b; - } - - // Sparse output version - template <typename DerivedIn, typename DerivedOutI, typename DerivedOutV> - void fProp(const MatrixBase<DerivedIn> &input, - const MatrixBase<DerivedOutI> &samples, - const MatrixBase<DerivedOutV> &output) const - { - UNCONST(DerivedOutV, output, my_output); - #pragma omp parallel for - for (int instance_id = 0; instance_id < samples.cols(); instance_id++) - { - for (int sample_id = 0; sample_id < samples.rows(); sample_id++) - { - my_output(sample_id, instance_id) = b(samples(sample_id, instance_id)); - } - } - USCMatrix<double> sparse_output(W->rows(), samples, my_output); - uscgemm_masked(1.0, *W, input, sparse_output); - my_output = sparse_output.values; // too bad, so much copying - } - - // Return single element of output matrix - template <typename DerivedIn> - double fProp(const MatrixBase<DerivedIn> &input, - int word, - int instance) const - { - return W->row(word).dot(input.col(instance)) + b(word); + template <typename Engine> + void initialize(Engine &engine, + bool init_normal, + double init_range, + double init_bias, + string ¶meter_update, + double adagrad_epsilon) + { + + W_gradient.setZero(W->rows(),W->cols()); + b_gradient.setZero(b.size()); + if (parameter_update == "ADA") { + 
W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon; + b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon; + //W_gradient.setZero(W->rows(),W->cols()); + //b_gradient.setZero(b.size()); + } + if (parameter_update == "ADAD") { + W_running_gradient.setZero(W->rows(),W->cols()); + b_running_gradient.setZero(b.size()); + W_gradient.setZero(W->rows(),W->cols()); + //b_gradient.setZero(b.size()); + //W_running_parameter_update.setZero(W->rows(),W->cols()); + b_running_parameter_update.setZero(b.size()); } - // Dense versions (for log-likelihood loss) + initMatrix(engine, *W, init_normal, init_range); + b.fill(init_bias); + } + + int n_inputs () const { return W->cols(); } + int n_outputs () const { return W->rows(); } - template <typename DerivedGOut, typename DerivedGIn> - void bProp(const MatrixBase<DerivedGOut> &input_bProp_matrix, - const MatrixBase<DerivedGIn> &bProp_matrix) const + template <typename DerivedIn, typename DerivedOut> + void fProp(const MatrixBase<DerivedIn> &input, + const MatrixBase<DerivedOut> &output) const + { + UNCONST(DerivedOut, output, my_output); + my_output = ((*W) * input).colwise() + b; + /* TODO: without EIGEN_NO_DEBUG - is this a bug? + ProductBase.h:102: Eigen::ProductBase<Derived, Lhs, Rhs>::ProductBase(const Lhs& + , const Rhs&) [with Derived = Eigen::GeneralProduct<Eigen::Matrix<double, -1, -1 + , 1>, Eigen::Matrix<double, -1, -1>, 5>; Lhs = Eigen::Matrix<double, -1, -1, 1>; + Rhs = Eigen::Matrix<double, -1, -1>]: Assertion `a_lhs.cols() == a_rhs.rows() & + & "invalid matrix product" && "if you wanted a coeff-wise or a dot product use t + he respective explicit functions"' failed. + + (gdb) p a_lhs.cols() + $3 = 50 + (gdb) p a_rhs.rows() + $4 = 100 + + (gdb) p a_lhs.rows() + $5 = 2 + (gdb) p a_rhs.cols() + $6 = 1 + + from lookup_ngram normalization prop.skip_hidden in neuralNetwork.h:100 + */ + } + + // Sparse output version + template <typename DerivedIn, typename DerivedOutI, typename DerivedOutV> + void fProp(const MatrixBase<DerivedIn> &input, + const MatrixBase<DerivedOutI> &samples, + const MatrixBase<DerivedOutV> &output) const + { + UNCONST(DerivedOutV, output, my_output); +#pragma omp parallel for + for (int instance_id = 0; instance_id < samples.cols(); instance_id++) { - // W is vocab_size x output_embedding_dimension - // input_bProp_matrix is vocab_size x minibatch_size - // bProp_matrix is output_embedding_dimension x minibatch_size - UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix); - my_bProp_matrix.leftCols(input_bProp_matrix.cols()).noalias() = + for (int sample_id = 0; sample_id < samples.rows(); sample_id++) + { + my_output(sample_id, instance_id) = b(samples(sample_id, instance_id)); + } + } + USCMatrix<double> sparse_output(W->rows(), samples, my_output); + uscgemm_masked(1.0, *W, input, sparse_output); + my_output = sparse_output.values; // too bad, so much copying + } + + // Return single element of output matrix + template <typename DerivedIn> + double fProp(const MatrixBase<DerivedIn> &input, + int word, + int instance) const + { + return W->row(word).dot(input.col(instance)) + b(word); + } + + // Dense versions (for log-likelihood loss) + + template <typename DerivedGOut, typename DerivedGIn> + void bProp(const MatrixBase<DerivedGOut> &input_bProp_matrix, + const MatrixBase<DerivedGIn> &bProp_matrix) const + { + // W is vocab_size x output_embedding_dimension + // input_bProp_matrix is vocab_size x minibatch_size + // bProp_matrix is output_embedding_dimension 
x minibatch_size + UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix); + my_bProp_matrix.leftCols(input_bProp_matrix.cols()).noalias() = W->transpose() * input_bProp_matrix; - } + } - template <typename DerivedIn, typename DerivedGOut> - void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOut> &bProp_input, - double learning_rate, - double momentum) //not sure if we want to use momentum here - { - // W is vocab_size x output_embedding_dimension - // b is vocab_size x 1 - // predicted_embeddings is output_embedding_dimension x minibatch_size - // bProp_input is vocab_size x minibatch_size - W->noalias() += learning_rate * bProp_input * predicted_embeddings.transpose(); - b += learning_rate * bProp_input.rowwise().sum(); - - /* - //GRADIENT CLIPPING - W->noalias() += learning_rate * - ((bProp_input * predicted_embeddings.transpose()).array().unaryExpr(Clipper())).matrix(); - b += learning_rate * (bProp_input.rowwise().sum().array().unaryExpr(Clipper())).matrix(); - //UPDATE CLIPPING - W->noalias() += (learning_rate * - (bProp_input * predicted_embeddings.transpose())).array().unaryExpr(Clipper()).matrix(); - b += (learning_rate * (bProp_input.rowwise().sum())).array().unaryExpr(Clipper()).matrix(); - */ - } - - template <typename DerivedIn, typename DerivedGOut> - void computeGradientAdagrad( - const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOut> &bProp_input, - double learning_rate) //not sure if we want to use momentum here - { - // W is vocab_size x output_embedding_dimension - // b is vocab_size x 1 - // predicted_embeddings is output_embedding_dimension x minibatch_size - // bProp_input is vocab_size x minibatch_sizea - W_gradient.setZero(W->rows(), W->cols()); - b_gradient.setZero(b.size()); - W_gradient.noalias() = bProp_input * predicted_embeddings.transpose(); - b_gradient.noalias() = bProp_input.rowwise().sum(); - W_running_gradient += W_gradient.array().square().matrix(); - b_running_gradient += b_gradient.array().square().matrix(); - W->noalias() += learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt()).matrix(); - b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix(); - /* - //UPDATE CLIPPING - *W += (learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); - b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); - */ - } - - template <typename DerivedIn, typename DerivedGOut> - void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOut> &bProp_input, - double learning_rate, - double conditioning_constant, - double decay) //not sure if we want to use momentum here - { - // W is vocab_size x output_embedding_dimension - // b is vocab_size x 1 - // predicted_embeddings is output_embedding_dimension x minibatch_size - // bProp_input is vocab_size x minibatch_size - Array<double,Dynamic,Dynamic> W_current_parameter_update; - Array<double,Dynamic,1> b_current_parameter_update; - W_gradient.setZero(W->rows(), W->cols()); - b_gradient.setZero(b.size()); - W_gradient.noalias() = bProp_input * predicted_embeddings.transpose(); - b_gradient.noalias() = bProp_input.rowwise().sum(); - W_running_gradient = decay*W_running_gradient + - (1.-decay)*W_gradient.array().square().matrix(); - b_running_gradient = decay*b_running_gradient+ - (1.-decay)*b_gradient.array().square().matrix(); - W_current_parameter_update = 
((W_running_parameter_update.array()+conditioning_constant).sqrt()/ - (W_running_gradient.array()+conditioning_constant).sqrt())* - W_gradient.array(); - b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/ - (b_running_gradient.array()+conditioning_constant).sqrt())* - b_gradient.array(); - W_running_parameter_update = decay*W_running_parameter_update + - (1.-decay)*W_current_parameter_update.square().matrix(); - b_running_parameter_update = decay*b_running_parameter_update + - (1.-decay)*b_current_parameter_update.square().matrix(); - - *W += learning_rate*W_current_parameter_update.matrix(); - b += learning_rate*b_current_parameter_update.matrix(); - } - - // Sparse versions - - template <typename DerivedGOutI, typename DerivedGOutV, typename DerivedGIn> - void bProp(const MatrixBase<DerivedGOutI> &samples, - const MatrixBase<DerivedGOutV> &weights, - const MatrixBase<DerivedGIn> &bProp_matrix) const - { - UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix); - my_bProp_matrix.setZero(); - uscgemm(1.0, - W->transpose(), + template <typename DerivedIn, typename DerivedGOut> + void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOut> &bProp_input, + double learning_rate, + double momentum) //not sure if we want to use momentum here + { + // W is vocab_size x output_embedding_dimension + // b is vocab_size x 1 + // predicted_embeddings is output_embedding_dimension x minibatch_size + // bProp_input is vocab_size x minibatch_size + W->noalias() += learning_rate * bProp_input * predicted_embeddings.transpose(); + b += learning_rate * bProp_input.rowwise().sum(); + + /* + //GRADIENT CLIPPING + W->noalias() += learning_rate * + ((bProp_input * predicted_embeddings.transpose()).array().unaryExpr(Clipper())).matrix(); + b += learning_rate * (bProp_input.rowwise().sum().array().unaryExpr(Clipper())).matrix(); + //UPDATE CLIPPING + W->noalias() += (learning_rate * + (bProp_input * predicted_embeddings.transpose())).array().unaryExpr(Clipper()).matrix(); + b += (learning_rate * (bProp_input.rowwise().sum())).array().unaryExpr(Clipper()).matrix(); + */ + } + + template <typename DerivedIn, typename DerivedGOut> + void computeGradientAdagrad( + const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOut> &bProp_input, + double learning_rate) //not sure if we want to use momentum here + { + // W is vocab_size x output_embedding_dimension + // b is vocab_size x 1 + // predicted_embeddings is output_embedding_dimension x minibatch_size + // bProp_input is vocab_size x minibatch_sizea + W_gradient.setZero(W->rows(), W->cols()); + b_gradient.setZero(b.size()); + W_gradient.noalias() = bProp_input * predicted_embeddings.transpose(); + b_gradient.noalias() = bProp_input.rowwise().sum(); + W_running_gradient += W_gradient.array().square().matrix(); + b_running_gradient += b_gradient.array().square().matrix(); + W->noalias() += learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt()).matrix(); + b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix(); + /* + //UPDATE CLIPPING + *W += (learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); + b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); + */ + } + + template <typename DerivedIn, typename DerivedGOut> + void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings, + const 
MatrixBase<DerivedGOut> &bProp_input, + double learning_rate, + double conditioning_constant, + double decay) //not sure if we want to use momentum here + { + // W is vocab_size x output_embedding_dimension + // b is vocab_size x 1 + // predicted_embeddings is output_embedding_dimension x minibatch_size + // bProp_input is vocab_size x minibatch_size + Array<double,Dynamic,Dynamic> W_current_parameter_update; + Array<double,Dynamic,1> b_current_parameter_update; + W_gradient.setZero(W->rows(), W->cols()); + b_gradient.setZero(b.size()); + W_gradient.noalias() = bProp_input * predicted_embeddings.transpose(); + b_gradient.noalias() = bProp_input.rowwise().sum(); + W_running_gradient = decay*W_running_gradient + + (1.-decay)*W_gradient.array().square().matrix(); + b_running_gradient = decay*b_running_gradient+ + (1.-decay)*b_gradient.array().square().matrix(); + W_current_parameter_update = ((W_running_parameter_update.array()+conditioning_constant).sqrt()/ + (W_running_gradient.array()+conditioning_constant).sqrt())* + W_gradient.array(); + b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/ + (b_running_gradient.array()+conditioning_constant).sqrt())* + b_gradient.array(); + W_running_parameter_update = decay*W_running_parameter_update + + (1.-decay)*W_current_parameter_update.square().matrix(); + b_running_parameter_update = decay*b_running_parameter_update + + (1.-decay)*b_current_parameter_update.square().matrix(); + + *W += learning_rate*W_current_parameter_update.matrix(); + b += learning_rate*b_current_parameter_update.matrix(); + } + + // Sparse versions + + template <typename DerivedGOutI, typename DerivedGOutV, typename DerivedGIn> + void bProp(const MatrixBase<DerivedGOutI> &samples, + const MatrixBase<DerivedGOutV> &weights, + const MatrixBase<DerivedGIn> &bProp_matrix) const + { + UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix); + my_bProp_matrix.setZero(); + uscgemm(1.0, + W->transpose(), USCMatrix<double>(W->rows(), samples, weights), my_bProp_matrix.leftCols(samples.cols())); // narrow bProp_matrix for possible short minibatch + } + + template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV> + void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOutI> &samples, + const MatrixBase<DerivedGOutV> &weights, + double learning_rate, double momentum) //not sure if we want to use momentum here + { + //cerr<<"in gradient"<<endl; + USCMatrix<double> gradient_output(W->rows(), samples, weights); + uscgemm(learning_rate, + gradient_output, + predicted_embeddings.leftCols(gradient_output.cols()).transpose(), + *W); // narrow predicted_embeddings for possible short minibatch + uscgemv(learning_rate, + gradient_output, + Matrix<double,Dynamic,1>::Ones(gradient_output.cols()), + b); + /* + //IN ORDER TO IMPLEMENT CLIPPING, WE HAVE TO COMPUTE THE GRADIENT + //FIRST + USCMatrix<double> gradient_output(W->rows(), samples, weights); + uscgemm(1.0, + gradient_output, + predicted_embeddings.leftCols(samples.cols()).transpose(), + W_gradient); + uscgemv(1.0, + gradient_output, + Matrix<double,Dynamic,1>::Ones(weights.cols()), + b_gradient); + + int_map update_map; //stores all the parameters that have been updated + for (int sample_id=0; sample_id<samples.rows(); sample_id++) + for (int train_id=0; train_id<samples.cols(); train_id++) + update_map[samples(sample_id, train_id)] = 1; + + // Convert to std::vector for parallelization + std::vector<int> update_items; + for (int_map::iterator 
it = update_map.begin(); it != update_map.end(); ++it) + update_items.push_back(it->first); + int num_items = update_items.size(); + + //#pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) + { + int update_item = update_items[item_id]; + //W->row(update_item) += learning_rate * W_gradient.row(update_item); + //b(update_item) += learning_rate * b_gradient(update_item); + //UPDATE CLIPPING + W->row(update_item) += (learning_rate * W_gradient.row(update_item)).array().unaryExpr(Clipper()).matrix(); + double update = learning_rate * b_gradient(update_item); + b(update_item) += std::min(0.5, std::max(update,-0.5)); + //GRADIENT CLIPPING + W_gradient.row(update_item).setZero(); + b_gradient(update_item) = 0.; } + */ + //cerr<<"Finished gradient"<<endl; + } - template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV> - void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOutI> &samples, - const MatrixBase<DerivedGOutV> &weights, - double learning_rate, double momentum) //not sure if we want to use momentum here - { - //cerr<<"in gradient"<<endl; - USCMatrix<double> gradient_output(W->rows(), samples, weights); - uscgemm(learning_rate, - gradient_output, - predicted_embeddings.leftCols(gradient_output.cols()).transpose(), - *W); // narrow predicted_embeddings for possible short minibatch - uscgemv(learning_rate, - gradient_output, - Matrix<double,Dynamic,1>::Ones(gradient_output.cols()), - b); + template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV> + void computeGradientAdagrad(const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOutI> &samples, + const MatrixBase<DerivedGOutV> &weights, + double learning_rate) //not sure if we want to use momentum here + { + //W_gradient.setZero(W->rows(), W->cols()); + //b_gradient.setZero(b.size()); + //FOR CLIPPING, WE DO NOT MULTIPLY THE GRADIENT WITH THE LEARNING RATE + USCMatrix<double> gradient_output(W->rows(), samples, weights); + uscgemm(1.0, + gradient_output, + predicted_embeddings.leftCols(samples.cols()).transpose(), + W_gradient); + uscgemv(1.0, + gradient_output, + Matrix<double,Dynamic,1>::Ones(weights.cols()), + b_gradient); + + int_map update_map; //stores all the parameters that have been updated + for (int sample_id=0; sample_id<samples.rows(); sample_id++) + for (int train_id=0; train_id<samples.cols(); train_id++) + update_map[samples(sample_id, train_id)] = 1; + + // Convert to std::vector for parallelization + std::vector<int> update_items; + for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) + update_items.push_back(it->first); + int num_items = update_items.size(); + + //#pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) + { + int update_item = update_items[item_id]; + W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix(); + b_running_gradient(update_item) += b_gradient(update_item) * b_gradient(update_item); + W->row(update_item) += learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix(); + b(update_item) += learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item)); /* - //IN ORDER TO IMPLEMENT CLIPPING, WE HAVE TO COMPUTE THE GRADIENT - //FIRST - USCMatrix<double> gradient_output(W->rows(), samples, weights); - uscgemm(1.0, - gradient_output, - predicted_embeddings.leftCols(samples.cols()).transpose(), - W_gradient); - uscgemv(1.0, 
- gradient_output, - Matrix<double,Dynamic,1>::Ones(weights.cols()), - b_gradient); - - int_map update_map; //stores all the parameters that have been updated - for (int sample_id=0; sample_id<samples.rows(); sample_id++) - for (int train_id=0; train_id<samples.cols(); train_id++) - update_map[samples(sample_id, train_id)] = 1; - - // Convert to std::vector for parallelization - std::vector<int> update_items; - for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) - update_items.push_back(it->first); - int num_items = update_items.size(); - - //#pragma omp parallel for - for (int item_id=0; item_id<num_items; item_id++) - { - int update_item = update_items[item_id]; - //W->row(update_item) += learning_rate * W_gradient.row(update_item); - //b(update_item) += learning_rate * b_gradient(update_item); - //UPDATE CLIPPING - W->row(update_item) += (learning_rate * W_gradient.row(update_item)).array().unaryExpr(Clipper()).matrix(); - double update = learning_rate * b_gradient(update_item); - b(update_item) += std::min(0.5, std::max(update,-0.5)); - //GRADIENT CLIPPING - W_gradient.row(update_item).setZero(); - b_gradient(update_item) = 0.; - } - */ - //cerr<<"Finished gradient"<<endl; - } - - template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV> - void computeGradientAdagrad(const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOutI> &samples, - const MatrixBase<DerivedGOutV> &weights, - double learning_rate) //not sure if we want to use momentum here - { - //W_gradient.setZero(W->rows(), W->cols()); - //b_gradient.setZero(b.size()); - //FOR CLIPPING, WE DO NOT MULTIPLY THE GRADIENT WITH THE LEARNING RATE - USCMatrix<double> gradient_output(W->rows(), samples, weights); - uscgemm(1.0, - gradient_output, - predicted_embeddings.leftCols(samples.cols()).transpose(), - W_gradient); - uscgemv(1.0, - gradient_output, - Matrix<double,Dynamic,1>::Ones(weights.cols()), - b_gradient); - - int_map update_map; //stores all the parameters that have been updated - for (int sample_id=0; sample_id<samples.rows(); sample_id++) - for (int train_id=0; train_id<samples.cols(); train_id++) - update_map[samples(sample_id, train_id)] = 1; - - // Convert to std::vector for parallelization - std::vector<int> update_items; - for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) - update_items.push_back(it->first); - int num_items = update_items.size(); - - //#pragma omp parallel for - for (int item_id=0; item_id<num_items; item_id++) - { - int update_item = update_items[item_id]; - W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix(); - b_running_gradient(update_item) += b_gradient(update_item) * b_gradient(update_item); - W->row(update_item) += learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix(); - b(update_item) += learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item)); - /* - //UPDATE CLIPPING - W->row(update_item) += (learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt())).unaryExpr(Clipper()).matrix(); - double update = learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item)); - b(update_item) += Clipper(update);//std::min(0.5, std::max(update,-0.5)); - */ - W_gradient.row(update_item).setZero(); - b_gradient(update_item) = 0.; - } - } + //UPDATE CLIPPING + W->row(update_item) += (learning_rate * 
(W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt())).unaryExpr(Clipper()).matrix(); + double update = learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item)); + b(update_item) += Clipper(update);//std::min(0.5, std::max(update,-0.5)); + */ + W_gradient.row(update_item).setZero(); + b_gradient(update_item) = 0.; + } + } - template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV> - void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOutI> &samples, - const MatrixBase<DerivedGOutV> &weights, - double learning_rate, - double conditioning_constant, - double decay) //not sure if we want to use momentum here - { - //cerr<<"decay is "<<decay<<" and constant is "<<conditioning_constant<<endl; - //W_gradient.setZero(W->rows(), W->cols()); - //b_gradient.setZero(b.size()); - - USCMatrix<double> gradient_output(W->rows(), samples, weights); - uscgemm(1.0, - gradient_output, - predicted_embeddings.leftCols(samples.cols()).transpose(), - W_gradient); - uscgemv(1.0, - gradient_output, - Matrix<double,Dynamic,1>::Ones(weights.cols()), - b_gradient); - - int_map update_map; //stores all the parameters that have been updated - for (int sample_id=0; sample_id<samples.rows(); sample_id++) - for (int train_id=0; train_id<samples.cols(); train_id++) - update_map[samples(sample_id, train_id)] = 1; - - // Convert to std::vector for parallelization - std::vector<int> update_items; - for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) - update_items.push_back(it->first); - int num_items = update_items.size(); - - #pragma omp parallel for - for (int item_id=0; item_id<num_items; item_id++) - { - Array<double,1,Dynamic> W_current_parameter_update; - double b_current_parameter_update; - - int update_item = update_items[item_id]; - W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+ - (1.-decay)*W_gradient.row(update_item).array().square().matrix(); - b_running_gradient(update_item) = decay*b_running_gradient(update_item)+ - (1.-decay)*b_gradient(update_item)*b_gradient(update_item); - //cerr<<"Output: W gradient is "<<W_gradient.row(update_item)<<endl; - //getchar(); - - //cerr<<"Output: W running gradient is "<<W_running_gradient.row(update_item)<<endl; - //getchar(); - W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/ - (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())* - W_gradient.row(update_item).array(); - b_current_parameter_update = (sqrt(b_running_parameter_update(update_item)+conditioning_constant)/ - sqrt(b_running_gradient(update_item)+conditioning_constant))* - b_gradient(update_item); - //cerr<<"Output: W current parameter update is "<<W_current_parameter_update<<endl; - //getchar(); - //cerr<<"Output: W running parameter update before is "<<W_running_parameter_update.row(update_item)<<endl; - //getchar(); - //cerr<<"the second term is "<<(1.-decay)*W_current_parameter_update.square().matrix()<<endl; - W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+ - (1.-decay)*(W_current_parameter_update.square().matrix()); - b_running_parameter_update(update_item) = decay*b_running_parameter_update(update_item)+ - (1.-decay)*b_current_parameter_update*b_current_parameter_update; - //cerr<<"Output: W running parameter update is "<<W_running_parameter_update.row(update_item)<<endl; - //getchar(); 
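The AdaDelta paths in this file, both the dense computeGradientAdadelta above and this per-row sparse version, keep two decayed accumulators per parameter: one of squared gradients and one of squared parameter updates. The raw gradient is rescaled by the ratio of their square roots, with conditioning_constant added inside each square root, before being applied. Below is a minimal Eigen sketch of that rule with illustrative names; note that this code also retains an extra learning_rate factor on top of the standard AdaDelta step.

#include <Eigen/Dense>

// Per-vector AdaDelta step (sketch; names are illustrative).
void adadelta_step(Eigen::VectorXd &param,
                   const Eigen::VectorXd &grad,
                   Eigen::VectorXd &running_grad_sq,
                   Eigen::VectorXd &running_update_sq,
                   double learning_rate,  // extra factor retained by this code
                   double decay,
                   double epsilon)        // plays the role of conditioning_constant
{
  // Decayed average of squared gradients.
  running_grad_sq.array() = decay * running_grad_sq.array()
                          + (1.0 - decay) * grad.array().square();
  // Rescale the gradient by RMS(previous updates) / RMS(gradients).
  Eigen::ArrayXd update = ((running_update_sq.array() + epsilon).sqrt()
                         / (running_grad_sq.array() + epsilon).sqrt())
                        * grad.array();
  // Decayed average of squared updates, then apply the step.
  running_update_sq.array() = decay * running_update_sq.array()
                            + (1.0 - decay) * update.square();
  param.array() += learning_rate * update;
}

In the sparse paths the same rule is applied only to the vocabulary rows touched by the current minibatch, which is why W_gradient.row(update_item) and b_gradient(update_item) are zeroed again after each update.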
- W->row(update_item) += learning_rate*W_current_parameter_update.matrix(); - b(update_item) += learning_rate*b_current_parameter_update; - W_gradient.row(update_item).setZero(); - b_gradient(update_item) = 0.; - } - } + template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV> + void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOutI> &samples, + const MatrixBase<DerivedGOutV> &weights, + double learning_rate, + double conditioning_constant, + double decay) //not sure if we want to use momentum here + { + //cerr<<"decay is "<<decay<<" and constant is "<<conditioning_constant<<endl; + //W_gradient.setZero(W->rows(), W->cols()); + //b_gradient.setZero(b.size()); + + USCMatrix<double> gradient_output(W->rows(), samples, weights); + uscgemm(1.0, + gradient_output, + predicted_embeddings.leftCols(samples.cols()).transpose(), + W_gradient); + uscgemv(1.0, + gradient_output, + Matrix<double,Dynamic,1>::Ones(weights.cols()), + b_gradient); + + int_map update_map; //stores all the parameters that have been updated + for (int sample_id=0; sample_id<samples.rows(); sample_id++) + for (int train_id=0; train_id<samples.cols(); train_id++) + update_map[samples(sample_id, train_id)] = 1; + + // Convert to std::vector for parallelization + std::vector<int> update_items; + for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) + update_items.push_back(it->first); + int num_items = update_items.size(); + +#pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) + { + Array<double,1,Dynamic> W_current_parameter_update; + double b_current_parameter_update; + + int update_item = update_items[item_id]; + W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+ + (1.-decay)*W_gradient.row(update_item).array().square().matrix(); + b_running_gradient(update_item) = decay*b_running_gradient(update_item)+ + (1.-decay)*b_gradient(update_item)*b_gradient(update_item); + //cerr<<"Output: W gradient is "<<W_gradient.row(update_item)<<endl; + //getchar(); + + //cerr<<"Output: W running gradient is "<<W_running_gradient.row(update_item)<<endl; + //getchar(); + W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/ + (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())* + W_gradient.row(update_item).array(); + b_current_parameter_update = (sqrt(b_running_parameter_update(update_item)+conditioning_constant)/ + sqrt(b_running_gradient(update_item)+conditioning_constant))* + b_gradient(update_item); + //cerr<<"Output: W current parameter update is "<<W_current_parameter_update<<endl; + //getchar(); + //cerr<<"Output: W running parameter update before is "<<W_running_parameter_update.row(update_item)<<endl; + //getchar(); + //cerr<<"the second term is "<<(1.-decay)*W_current_parameter_update.square().matrix()<<endl; + W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+ + (1.-decay)*(W_current_parameter_update.square().matrix()); + b_running_parameter_update(update_item) = decay*b_running_parameter_update(update_item)+ + (1.-decay)*b_current_parameter_update*b_current_parameter_update; + //cerr<<"Output: W running parameter update is "<<W_running_parameter_update.row(update_item)<<endl; + //getchar(); + W->row(update_item) += learning_rate*W_current_parameter_update.matrix(); + b(update_item) += learning_rate*b_current_parameter_update; + 
W_gradient.row(update_item).setZero(); + b_gradient(update_item) = 0.; + } + } - template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV, typename DerivedGW, typename DerivedGb> - void computeGradientCheck(const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOutI> &samples, - const MatrixBase<DerivedGOutV> &weights, - const MatrixBase<DerivedGW> &gradient_W, - const MatrixBase<DerivedGb> &gradient_b) const - { - UNCONST(DerivedGW, gradient_W, my_gradient_W); - UNCONST(DerivedGb, gradient_b, my_gradient_b); - my_gradient_W.setZero(); - my_gradient_b.setZero(); - USCMatrix<double> gradient_output(W->rows(), samples, weights); - uscgemm(1.0, - gradient_output, - predicted_embeddings.leftCols(samples.cols()).transpose(), - my_gradient_W); - uscgemv(1.0, gradient_output, - Matrix<double,Dynamic,1>::Ones(weights.cols()), my_gradient_b); + template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV, typename DerivedGW, typename DerivedGb> + void computeGradientCheck(const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOutI> &samples, + const MatrixBase<DerivedGOutV> &weights, + const MatrixBase<DerivedGW> &gradient_W, + const MatrixBase<DerivedGb> &gradient_b) const + { + UNCONST(DerivedGW, gradient_W, my_gradient_W); + UNCONST(DerivedGb, gradient_b, my_gradient_b); + my_gradient_W.setZero(); + my_gradient_b.setZero(); + USCMatrix<double> gradient_output(W->rows(), samples, weights); + uscgemm(1.0, + gradient_output, + predicted_embeddings.leftCols(samples.cols()).transpose(), + my_gradient_W); + uscgemv(1.0, gradient_output, + Matrix<double,Dynamic,1>::Ones(weights.cols()), my_gradient_b); } }; class Input_word_embeddings { - private: - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W; - int context_size, vocab_size; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient; - - friend class model; - - public: - Input_word_embeddings() : context_size(0), vocab_size(0) { } - Input_word_embeddings(int rows, int cols, int context) { resize(rows, cols, context); } - - void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) { - W = input_W; - } + private: + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W; + int context_size, vocab_size; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient; - void resize(int rows, int cols, int context) - { - context_size = context; - vocab_size = rows; - W->setZero(rows, cols); - } + friend class model; - void read(std::ifstream &W_file) { readMatrix(W_file, *W); } - void write(std::ofstream &W_file) { writeMatrix(*W, W_file); } + public: + Input_word_embeddings() : context_size(0), vocab_size(0) { } + Input_word_embeddings(int rows, int cols, int context) { resize(rows, cols, context); } - template <typename Engine> - void initialize(Engine &engine, - bool init_normal, - double init_range, - string ¶meter_update, - double adagrad_epsilon) - { - W_gradient.setZero(W->rows(),W->cols()); - - if (parameter_update == "ADA") { - W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon; - //W_gradient.setZero(W->rows(),W->cols()); - } - if (parameter_update == "ADAD") { - W_running_gradient.setZero(W->rows(),W->cols()); - 
//W_gradient.setZero(W->rows(),W->cols()); - W_running_parameter_update.setZero(W->rows(),W->cols()); - } - initMatrix(engine, - *W, - init_normal, - init_range); - } + void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) { + W = input_W; + } - int n_inputs() const { return -1; } - int n_outputs() const { return W->cols() * context_size; } - - // set output_id's embedding to the weighted average of all embeddings - template <typename Dist> - void average(const Dist &dist, int output_id) - { - W->row(output_id).setZero(); - for (int i=0; i < W->rows(); i++) - if (i != output_id) - W->row(output_id) += dist.prob(i) * W->row(i); - } - - template <typename DerivedIn, typename DerivedOut> - void fProp(const MatrixBase<DerivedIn> &input, - const MatrixBase<DerivedOut> &output) const - { - int embedding_dimension = W->cols(); - - // W is vocab_size x embedding_dimension - // input is ngram_size*vocab_size x minibatch_size - // output is ngram_size*embedding_dimension x minibatch_size - - /* - // Dense version: - for (int ngram=0; ngram<context_size; ngram++) - output.middleRows(ngram*embedding_dimension, embedding_dimension) = W.transpose() * input.middleRows(ngram*vocab_size, vocab_size); - */ - - UNCONST(DerivedOut, output, my_output); - my_output.setZero(); - for (int ngram=0; ngram<context_size; ngram++) - { - // input might be narrower than expected due to a short minibatch, - // so narrow output to match - uscgemm(1.0, - W->transpose(), - USCMatrix<double>(W->rows(),input.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input.cols())), - my_output.block(ngram*embedding_dimension, 0, embedding_dimension, input.cols())); - } - } - - // When model is premultiplied, this layer doesn't get used, - // but this method is used to get the input into a sparse matrix. 
- // Hopefully this can get eliminated someday - template <typename DerivedIn, typename ScalarOut> - void munge(const MatrixBase<DerivedIn> &input, USCMatrix<ScalarOut> &output) const - { - output.resize(vocab_size*context_size, context_size, input.cols()); - for (int i=0; i < context_size; i++) - output.indexes.row(i).array() = input.row(i).array() + i*vocab_size; - output.values.fill(1.0); - } + void resize(int rows, int cols, int context) + { + context_size = context; + vocab_size = rows; + W->setZero(rows, cols); + } + + void read(std::ifstream &W_file) { readMatrix(W_file, *W); } + void write(std::ofstream &W_file) { writeMatrix(*W, W_file); } + + template <typename Engine> + void initialize(Engine &engine, + bool init_normal, + double init_range, + string ¶meter_update, + double adagrad_epsilon) + { + W_gradient.setZero(W->rows(),W->cols()); + + if (parameter_update == "ADA") { + W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon; + //W_gradient.setZero(W->rows(),W->cols()); + } + if (parameter_update == "ADAD") { + W_running_gradient.setZero(W->rows(),W->cols()); + //W_gradient.setZero(W->rows(),W->cols()); + W_running_parameter_update.setZero(W->rows(),W->cols()); + } + initMatrix(engine, + *W, + init_normal, + init_range); + } + + int n_inputs() const { return -1; } + int n_outputs() const { return W->cols() * context_size; } + + // set output_id's embedding to the weighted average of all embeddings + template <typename Dist> + void average(const Dist &dist, int output_id) + { + W->row(output_id).setZero(); + for (int i=0; i < W->rows(); i++) + if (i != output_id) + W->row(output_id) += dist.prob(i) * W->row(i); + } + + template <typename DerivedIn, typename DerivedOut> + void fProp(const MatrixBase<DerivedIn> &input, + const MatrixBase<DerivedOut> &output) const + { + int embedding_dimension = W->cols(); + + // W is vocab_size x embedding_dimension + // input is ngram_size*vocab_size x minibatch_size + // output is ngram_size*embedding_dimension x minibatch_size + + /* + // Dense version: + for (int ngram=0; ngram<context_size; ngram++) + output.middleRows(ngram*embedding_dimension, embedding_dimension) = W.transpose() * input.middleRows(ngram*vocab_size, vocab_size); + */ + + UNCONST(DerivedOut, output, my_output); + my_output.setZero(); + for (int ngram=0; ngram<context_size; ngram++) + { + // input might be narrower than expected due to a short minibatch, + // so narrow output to match + uscgemm(1.0, + W->transpose(), + USCMatrix<double>(W->rows(),input.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input.cols())), + my_output.block(ngram*embedding_dimension, 0, embedding_dimension, input.cols())); + } + } + + // When model is premultiplied, this layer doesn't get used, + // but this method is used to get the input into a sparse matrix. 
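The fProp method shown above builds the context representation by multiplying W transposed with an implicit one-hot matrix assembled from the word indices (via USCMatrix and uscgemm), which amounts to gathering one embedding row per context position. A dense Eigen sketch of the equivalent operation follows; the names are illustrative, and it omits the short-minibatch narrowing the real code performs.

#include <Eigen/Dense>

// Dense equivalent of the sparse input-embedding lookup (sketch).
// input(ngram, j) holds a word index; its embedding row is copied into
// the ngram-th block of column j of the pre-sized output matrix.
void embedding_fprop(const Eigen::MatrixXd &W,      // vocab_size x embedding_dim
                     const Eigen::MatrixXi &input,  // context_size x minibatch_size
                     Eigen::MatrixXd &output)       // context_size*embedding_dim x minibatch_size
{
  const int dim = W.cols();
  output.setZero();
  for (int ngram = 0; ngram < input.rows(); ++ngram)
    for (int j = 0; j < input.cols(); ++j)
      output.block(ngram * dim, j, dim, 1) = W.row(input(ngram, j)).transpose();
}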
+ // Hopefully this can get eliminated someday + template <typename DerivedIn, typename ScalarOut> + void munge(const MatrixBase<DerivedIn> &input, USCMatrix<ScalarOut> &output) const + { + output.resize(vocab_size*context_size, context_size, input.cols()); + for (int i=0; i < context_size; i++) + output.indexes.row(i).array() = input.row(i).array() + i*vocab_size; + output.values.fill(1.0); + } template <typename DerivedGOut, typename DerivedIn> void computeGradient(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &input_words, - double learning_rate, double momentum, double L2_reg) + const MatrixBase<DerivedIn> &input_words, + double learning_rate, double momentum, double L2_reg) { - int embedding_dimension = W->cols(); + int embedding_dimension = W->cols(); - // W is vocab_size x embedding_dimension - // input is ngram_size*vocab_size x minibatch_size - // bProp_input is ngram_size*embedding_dimension x minibatch_size + // W is vocab_size x embedding_dimension + // input is ngram_size*vocab_size x minibatch_size + // bProp_input is ngram_size*embedding_dimension x minibatch_size - /* - // Dense version: - for (int ngram=0; ngram<context_size; ngram++) - W += learning_rate * input_words.middleRows(ngram*vocab_size, vocab_size) * bProp_input.middleRows(ngram*embedding_dimension, embedding_dimension).transpose() - */ + /* + // Dense version: + for (int ngram=0; ngram<context_size; ngram++) + W += learning_rate * input_words.middleRows(ngram*vocab_size, vocab_size) * bProp_input.middleRows(ngram*embedding_dimension, embedding_dimension).transpose() + */ - for (int ngram=0; ngram<context_size; ngram++) - { - uscgemm(learning_rate, - USCMatrix<double>(W->rows(), input_words.middleRows(ngram, 1), Matrix<double,1,Dynamic>::Ones(input_words.cols())), - bProp_input.block(ngram*embedding_dimension,0,embedding_dimension,input_words.cols()).transpose(), - *W); - } + for (int ngram=0; ngram<context_size; ngram++) + { + uscgemm(learning_rate, + USCMatrix<double>(W->rows(), input_words.middleRows(ngram, 1), Matrix<double,1,Dynamic>::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension,0,embedding_dimension,input_words.cols()).transpose(), + *W); + } - /* - //IF WE WANT TO DO GRADIENT CLIPPING, THEN WE FIRST COMPUTE THE GRADIENT AND THEN - //PERFORM CLIPPING WHILE UPDATING - - for (int ngram=0; ngram<context_size; ngram++) - { - uscgemm(1.0, - USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), - bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), - W_gradient); - } - int_map update_map; //stores all the parameters that have been updated - for (int ngram=0; ngram<context_size; ngram++) - { - for (int train_id=0; train_id<input_words.cols(); train_id++) - { - update_map[input_words(ngram,train_id)] = 1; - } - } + /* + //IF WE WANT TO DO GRADIENT CLIPPING, THEN WE FIRST COMPUTE THE GRADIENT AND THEN + //PERFORM CLIPPING WHILE UPDATING - // Convert to std::vector for parallelization - std::vector<int> update_items; - for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) - { - update_items.push_back(it->first); - } - int num_items = update_items.size(); - - #pragma omp parallel for - for (int item_id=0; item_id<num_items; item_id++) - { - int update_item = update_items[item_id]; - //UPDATE CLIPPING - W->row(update_item) += (learning_rate* - W_gradient.row(update_item).array().unaryExpr(Clipper())).matrix(); - //GRADIENT CLIPPING 
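The commented-out alternatives throughout these update routines rely on a Clipper functor applied through Eigen's unaryExpr, either to the raw gradient ("GRADIENT CLIPPING") or to the already-scaled step ("UPDATE CLIPPING"). Clipper itself is defined elsewhere in this header; the explicit std::min(0.5, std::max(update, -0.5)) bias clamp in the sparse output-layer update earlier in this file suggests it clamps each value to a fixed interval. A sketch of such a functor under that assumption; the name ClipperSketch and the 0.5 bound are illustrative.

#include <algorithm>

// Element-wise clamping functor usable with Eigen's unaryExpr (sketch).
// The +/-0.5 default mirrors the explicit bias clamp seen in this file;
// the real Clipper may use a different bound.
struct ClipperSketch
{
  double bound;
  explicit ClipperSketch(double b = 0.5) : bound(b) {}
  double operator()(double x) const
  {
    return std::min(bound, std::max(x, -bound));
  }
};

With such a functor, gradient clipping clamps grad before it is scaled by the learning rate, for example grad.array().unaryExpr(ClipperSketch()).matrix(), while update clipping clamps the product learning_rate * grad instead, which is exactly the distinction the commented-out variants here draw.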
- //W->row(update_item) += learning_rate* - // W_gradient.row(update_item).array().unaryExpr(Clipper()).matrix(); - //SETTING THE GRADIENT TO ZERO - W_gradient.row(update_item).setZero(); - } - */ - } + for (int ngram=0; ngram<context_size; ngram++) + { + uscgemm(1.0, + USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), + W_gradient); + } + int_map update_map; //stores all the parameters that have been updated + for (int ngram=0; ngram<context_size; ngram++) + { + for (int train_id=0; train_id<input_words.cols(); train_id++) + { + update_map[input_words(ngram,train_id)] = 1; + } + } - template <typename DerivedGOut, typename DerivedIn> - void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &input_words, - double learning_rate, - double L2_reg) + // Convert to std::vector for parallelization + std::vector<int> update_items; + for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) { - int embedding_dimension = W->cols(); - //W_gradient.setZero(W->rows(), W->cols()); - /* + update_items.push_back(it->first); + } + int num_items = update_items.size(); + + #pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) + { + int update_item = update_items[item_id]; + //UPDATE CLIPPING + W->row(update_item) += (learning_rate* + W_gradient.row(update_item).array().unaryExpr(Clipper())).matrix(); + //GRADIENT CLIPPING + //W->row(update_item) += learning_rate* + // W_gradient.row(update_item).array().unaryExpr(Clipper()).matrix(); + //SETTING THE GRADIENT TO ZERO + W_gradient.row(update_item).setZero(); + } + */ + } + + template <typename DerivedGOut, typename DerivedIn> + void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &input_words, + double learning_rate, + double L2_reg) + { + int embedding_dimension = W->cols(); + //W_gradient.setZero(W->rows(), W->cols()); + /* if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols()) - W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon; - */ - for (int ngram=0; ngram<context_size; ngram++) - { - uscgemm(1.0, - USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), - bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), - W_gradient); - } - int_map update_map; //stores all the parameters that have been updated - for (int ngram=0; ngram<context_size; ngram++) - { - for (int train_id=0; train_id<input_words.cols(); train_id++) - { - update_map[input_words(ngram,train_id)] = 1; - } + W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon; + */ + for (int ngram=0; ngram<context_size; ngram++) + { + uscgemm(1.0, + USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), + W_gradient); + } + int_map update_map; //stores all the parameters that have been updated + for (int ngram=0; ngram<context_size; ngram++) + { + for (int train_id=0; train_id<input_words.cols(); train_id++) + { + update_map[input_words(ngram,train_id)] = 1; } + } - // Convert to std::vector for parallelization - std::vector<int> update_items; - for (int_map::iterator it = 
update_map.begin(); it != update_map.end(); ++it) - { - update_items.push_back(it->first); - } - int num_items = update_items.size(); - - #pragma omp parallel for - for (int item_id=0; item_id<num_items; item_id++) - { - int update_item = update_items[item_id]; - W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix(); - W->row(update_item) += learning_rate * - (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix(); - /* - //UPDATE CLIPPING - W->row(update_item) += (learning_rate * - (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt())) - .unaryExpr(Clipper()).matrix(); - */ - W_gradient.row(update_item).setZero(); - } - } - - template <typename DerivedGOut, typename DerivedIn> - void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &input_words, - double learning_rate, - double L2_reg, - double conditioning_constant, - double decay) + // Convert to std::vector for parallelization + std::vector<int> update_items; + for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) { - int embedding_dimension = W->cols(); + update_items.push_back(it->first); + } + int num_items = update_items.size(); - //W_gradient.setZero(W->rows(), W->cols()); +#pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) + { + int update_item = update_items[item_id]; + W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix(); + W->row(update_item) += learning_rate * + (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix(); /* - if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols()) - W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon; + //UPDATE CLIPPING + W->row(update_item) += (learning_rate * + (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt())) + .unaryExpr(Clipper()).matrix(); */ - for (int ngram=0; ngram<context_size; ngram++) - { - uscgemm(1.0, - USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), - bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), - W_gradient); - } - int_map update_map; //stores all the parameters that have been updated - for (int ngram=0; ngram<context_size; ngram++) - { - for (int train_id=0; train_id<input_words.cols(); train_id++) - { - update_map[input_words(ngram,train_id)] = 1; - } + W_gradient.row(update_item).setZero(); + } + } + + template <typename DerivedGOut, typename DerivedIn> + void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &input_words, + double learning_rate, + double L2_reg, + double conditioning_constant, + double decay) + { + int embedding_dimension = W->cols(); + + //W_gradient.setZero(W->rows(), W->cols()); + /* + if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols()) + W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon; + */ + for (int ngram=0; ngram<context_size; ngram++) + { + uscgemm(1.0, + USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), + W_gradient); + } + int_map update_map; //stores all the parameters that have been 
updated + for (int ngram=0; ngram<context_size; ngram++) + { + for (int train_id=0; train_id<input_words.cols(); train_id++) + { + update_map[input_words(ngram,train_id)] = 1; } + } - // Convert to std::vector for parallelization - std::vector<int> update_items; - for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) - { - update_items.push_back(it->first); - } - int num_items = update_items.size(); - - #pragma omp parallel for - for (int item_id=0; item_id<num_items; item_id++) - { - - Array<double,1,Dynamic> W_current_parameter_update; - int update_item = update_items[item_id]; - W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+ - (1.-decay)*W_gradient.row(update_item).array().square().matrix(); - - W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/ - (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())* - W_gradient.row(update_item).array(); - - //cerr<<"Input: W current parameter update is "<<W_current_parameter_update<<endl; - //getchar(); - W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+ - (1.-decay)*W_current_parameter_update.square().matrix(); - - W->row(update_item) += learning_rate*W_current_parameter_update.matrix(); - //cerr<<"Input: After update, W is "<<W->row(update_item)<<endl; - //getchar(); - W_gradient.row(update_item).setZero(); - } - - } - - template <typename DerivedGOut, typename DerivedIn, typename DerivedGW> - void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &input_words, - int x, int minibatch_size, - const MatrixBase<DerivedGW> &gradient) const //not sure if we want to use momentum here + // Convert to std::vector for parallelization + std::vector<int> update_items; + for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) { - UNCONST(DerivedGW, gradient, my_gradient); - int embedding_dimension = W->cols(); - my_gradient.setZero(); - for (int ngram=0; ngram<context_size; ngram++) - uscgemm(1.0, - USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), - bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), - my_gradient); + update_items.push_back(it->first); } + int num_items = update_items.size(); + +#pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) + { + + Array<double,1,Dynamic> W_current_parameter_update; + int update_item = update_items[item_id]; + W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+ + (1.-decay)*W_gradient.row(update_item).array().square().matrix(); + + W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/ + (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())* + W_gradient.row(update_item).array(); + + //cerr<<"Input: W current parameter update is "<<W_current_parameter_update<<endl; + //getchar(); + W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+ + (1.-decay)*W_current_parameter_update.square().matrix(); + + W->row(update_item) += learning_rate*W_current_parameter_update.matrix(); + //cerr<<"Input: After update, W is "<<W->row(update_item)<<endl; + //getchar(); + W_gradient.row(update_item).setZero(); + } + + } + + template <typename DerivedGOut, typename DerivedIn, typename DerivedGW> + void 
computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &input_words, + int x, int minibatch_size, + const MatrixBase<DerivedGW> &gradient) const //not sure if we want to use momentum here + { + UNCONST(DerivedGW, gradient, my_gradient); + int embedding_dimension = W->cols(); + my_gradient.setZero(); + for (int ngram=0; ngram<context_size; ngram++) + uscgemm(1.0, + USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), + my_gradient); + } }; } // namespace nplm - diff --git a/src/neuralLM.h b/src/neuralLM.h index 2004596..f0eebd8 100644 --- a/src/neuralLM.h +++ b/src/neuralLM.h @@ -6,119 +6,138 @@ #include <cstdlib> #include <boost/shared_ptr.hpp> -//#include <../3rdparty/Eigen/Dense> #include <Eigen/Dense> #include "util.h" #include "vocabulary.h" #include "neuralNetwork.h" +#include "replace_digits.hpp" /* To do: - move digit mapping into vocabulary.h - */ +*/ namespace nplm { -class neuralLM : public neuralNetwork +class neuralLM : public neuralNetwork, graehl::replace_digits { - char map_digits; - boost::shared_ptr<vocabulary> vocab; - int start, null; + boost::shared_ptr<vocabulary> vocab; + int start, null; -public: - neuralLM() + public: + neuralLM() : neuralNetwork(), - vocab(new vocabulary()), - map_digits(0) - { - } + graehl::replace_digits(0), + vocab(new vocabulary()) + { + } - void set_map_digits(char value) { map_digits = value; } + void set_map_digits(char value) { map_digits = value; } - void set_vocabulary(const vocabulary &vocab) - { - *(this->vocab) = vocab; - start = vocab.lookup_word("<s>"); - null = vocab.lookup_word("<null>"); - } + void set_vocabulary(const vocabulary &vocab) + { + *(this->vocab) = vocab; + start = vocab.lookup_word("<s>"); + null = vocab.lookup_word("<null>"); + } - const vocabulary &get_vocabulary() const { return *(this->vocab); } + const vocabulary &get_vocabulary() const { return *(this->vocab); } - int lookup_word(const std::string &word) const - { - if (map_digits) - for (int i=0; i<word.length(); i++) - if (isdigit(word[i])) - { - std::string mapped_word(word); - for (; i<word.length(); i++) - if (isdigit(word[i])) - mapped_word[i] = map_digits; - return vocab->lookup_word(mapped_word); - } - return vocab->lookup_word(word); - } + int lookup_input_word(const std::string &word) const + { + return lookup_word(word); + } - double lookup_ngram(const int *ngram_a, int n) - { - Eigen::Matrix<int,Eigen::Dynamic,1> ngram(m->ngram_size); - for (int i=0; i<m->ngram_size; i++) - { - if (i-m->ngram_size+n < 0) - { - if (ngram_a[0] == start) - ngram(i) = start; - else - ngram(i) = null; - } - else - { - ngram(i) = ngram_a[i-m->ngram_size+n]; - } - } - return neuralNetwork::lookup_ngram(ngram); - } + int lookup_input_word(std::pair<char const*, char const*> word) const + { + return lookup_word(word); + } - double lookup_ngram(const std::vector<int> &ngram_v) - { - return lookup_ngram(ngram_v.data(), ngram_v.size()); - } - template <typename Derived> - double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram) - { - return neuralNetwork::lookup_ngram(ngram); - } - - template <typename DerivedA, typename DerivedB> - void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const) - { - return neuralNetwork::lookup_ngram(ngram, log_probs_const); - } + int lookup_word(const std::string &word) const + { + if 
(map_digits) + for (int i=0, n=word.size(); i<n; ++i) + if (graehl::ascii_digit(word[i])) { + std::string mapped_word(word); + replace(mapped_word, i); + return vocab->lookup_word(mapped_word); + } + return vocab->lookup_word(word); + } + + int lookup_word(std::pair<char const*, char const*> slice) const + { + if (map_digits) + for (char const* i = slice.first; i != slice.second; ++i) + if (graehl::ascii_digit(*i)) { + std::string mapped_word(slice.first, slice.second); + replace(mapped_word, i - slice.first); + return vocab->lookup_word(mapped_word); + } + return vocab->lookup_word(slice); + } - void read(const std::string &filename) + double lookup_ngram(const int *ngram_a, int n) + { + Eigen::Matrix<int,Eigen::Dynamic,1> ngram(m->ngram_size); + for (int i=0; i<m->ngram_size; ++i) { - std::vector<std::string> words; - m->read(filename, words); - set_vocabulary(vocabulary(words)); - resize(); - // this is faster but takes more memory - //m->premultiply(); + if (i-m->ngram_size+n < 0) + { + if (ngram_a[0] == start) + ngram(i) = start; + else + ngram(i) = null; + } + else + { + ngram(i) = ngram_a[i-m->ngram_size+n]; + } } + return neuralNetwork::lookup_ngram(ngram); + } + + double lookup_ngram(const std::vector<int> &ngram_v) + { + return lookup_ngram(ngram_v.data(), ngram_v.size()); + } + + template <typename Derived> + double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram) + { + return neuralNetwork::lookup_ngram(ngram); + } + + template <typename DerivedA, typename DerivedB> + void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const) + { + return neuralNetwork::lookup_ngram(ngram, log_probs_const); + } + + void read(const std::string &filename) + { + std::vector<std::string> words; + m->read(filename, words); + set_vocabulary(vocabulary(words)); + resize(); + // this is faster but takes more memory + //m->premultiply(); + } }; template <typename T> void addStartStop(std::vector<T> &input, std::vector<T> &output, int ngram_size, const T &start, const T &stop) { - output.clear(); - output.resize(input.size()+ngram_size); - for (int i=0; i<ngram_size-1; i++) - output[i] = start; - std::copy(input.begin(), input.end(), output.begin()+ngram_size-1); - output[output.size()-1] = stop; + output.clear(); + output.resize(input.size()+ngram_size); + for (int i=0; i<ngram_size-1; ++i) + output[i] = start; + std::copy(input.begin(), input.end(), output.begin()+ngram_size-1); + output[output.size()-1] = stop; } template <typename T> @@ -127,21 +146,21 @@ void makeNgrams(const std::vector<T> &input, std::vector<std::vector<T> > &outpu output.clear(); for (int j=ngram_size-1; j<input.size(); j++) { - std::vector<T> ngram(input.begin() + (j-ngram_size+1), input.begin() + j+1); - output.push_back(ngram); + std::vector<T> ngram(input.begin() + (j-ngram_size+1), input.begin() + j+1); + output.push_back(ngram); } } -inline void preprocessWords(const std::vector<std::string> &words, - std::vector< std::vector<int> > &ngrams, - int ngram_size, - const vocabulary &vocab, - bool numberize, - bool add_start_stop, - bool ngramize) { +inline void preprocessWords(const std::vector<std::string> &words, + std::vector< std::vector<int> > &ngrams, + int ngram_size, + const vocabulary &vocab, + bool numberize, + bool add_start_stop, + bool ngramize) { int start = vocab.lookup_word("<s>"); int stop = vocab.lookup_word("</s>"); - + // convert words to ints std::vector<int> nums; if (numberize) { @@ -152,9 +171,9 @@ inline void preprocessWords(const 
std::vector<std::string> &words, else { for (int j=0; j<words.size(); j++) { nums.push_back(boost::lexical_cast<int>(words[j])); - } + } } - + // convert sequence to n-grams ngrams.clear(); if (ngramize) { @@ -168,10 +187,10 @@ inline void preprocessWords(const std::vector<std::string> &words, } else { if (nums.size() != ngram_size) - { - std::cerr << "error: wrong number of fields in line" << std::endl; - std::exit(1); - } + { + std::cerr << "error: wrong number of fields in line\n"; + std::exit(1); + } ngrams.push_back(nums); } } diff --git a/src/neuralNetwork.h b/src/neuralNetwork.h index ef96488..6386a0f 100644 --- a/src/neuralNetwork.h +++ b/src/neuralNetwork.h @@ -3,7 +3,6 @@ #include <vector> #include <boost/shared_ptr.hpp> -//#include <../3rdparty/Eigen/Dense> #include <Eigen/Dense> #include "util.h" @@ -16,191 +15,191 @@ namespace nplm class neuralNetwork { -protected: - boost::shared_ptr<model> m; + protected: + boost::shared_ptr<model> m; -private: - bool normalization; - double weight; + private: + bool normalization; + double weight; - propagator prop; + propagator prop; - std::size_t cache_size; - Eigen::Matrix<int,Dynamic,Dynamic> cache_keys; - std::vector<double> cache_values; - int cache_lookups, cache_hits; + std::size_t cache_size; + Eigen::Matrix<int,Dynamic,Dynamic> cache_keys; + std::vector<double> cache_values; + int cache_lookups, cache_hits; -public: - neuralNetwork() + public: + neuralNetwork() : m(new model()), normalization(false), - weight(1.), - prop(*m, 1), + weight(1.), + prop(*m, 1), cache_size(0) - { - } + { + } - void set_normalization(bool value) { normalization = value; } - void set_log_base(double value) { weight = 1./std::log(value); } - - // This must be called if the underlying model is resized. - void resize() { - if (cache_size) - { - cache_keys.resize(m->ngram_size, cache_size); - cache_keys.fill(-1); - } - prop.resize(); - } + void set_normalization(bool value) { normalization = value; } + void set_log_base(double value) { weight = 1./std::log(value); } - void set_width(int width) + // This must be called if the underlying model is resized. + void resize() { + if (cache_size) { - prop.resize(width); + cache_keys.resize(m->ngram_size, cache_size); + cache_keys.fill(-1); } - - template <typename Derived> - double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram) + prop.resize(); + } + + void set_width(int width) + { + prop.resize(width); + } + + template <typename Derived> + double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram) + { + assert (ngram.rows() == m->ngram_size); + assert (ngram.cols() == 1); + + std::size_t hash; + if (cache_size) { - assert (ngram.rows() == m->ngram_size); - assert (ngram.cols() == 1); - - std::size_t hash; - if (cache_size) - { - // First look in cache - hash = Eigen::hash_value(ngram) % cache_size; // defined in util.h - cache_lookups++; - if (cache_keys.col(hash) == ngram) - { - cache_hits++; - return cache_values[hash]; - } - } - - // Make sure that we're single threaded. 
Multithreading doesn't help, - // and in some cases can hurt quite a lot - int save_threads = omp_get_max_threads(); - omp_set_num_threads(1); - int save_eigen_threads = Eigen::nbThreads(); - Eigen::setNbThreads(1); - #ifdef __INTEL_MKL__ - int save_mkl_threads = mkl_get_max_threads(); - mkl_set_num_threads(1); - #endif - - prop.fProp(ngram.col(0)); - - int output = ngram(m->ngram_size-1, 0); - double log_prob; - - start_timer(3); - if (normalization) - { - Eigen::Matrix<double,Eigen::Dynamic,1> scores(m->output_vocab_size); - if (prop.skip_hidden) - prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores); - else - prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); - double logz = logsum(scores.col(0)); - log_prob = weight * (scores(output, 0) - logz); - } - else - { - if (prop.skip_hidden) - log_prob = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, 0); - else - log_prob = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, 0); - } - stop_timer(3); - - if (cache_size) - { - // Update cache - cache_keys.col(hash) = ngram; - cache_values[hash] = log_prob; - } - - #ifdef __INTEL_MKL__ - mkl_set_num_threads(save_mkl_threads); - #endif - Eigen::setNbThreads(save_eigen_threads); - omp_set_num_threads(save_threads); - - return log_prob; + // First look in cache + hash = Eigen::hash_value(ngram) % cache_size; // defined in util.h + cache_lookups++; + if (cache_keys.col(hash) == ngram) + { + cache_hits++; + return cache_values[hash]; + } } - // Look up many n-grams in parallel. - template <typename DerivedA, typename DerivedB> - void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const) - { - UNCONST(DerivedB, log_probs_const, log_probs); - assert (ngram.rows() == m->ngram_size); - //assert (ngram.cols() <= prop.get_minibatch_size()); - - prop.fProp(ngram); - - if (normalization) - { - Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> scores(m->output_vocab_size, ngram.cols()); - if (prop.skip_hidden) - prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores); - else - prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); - - // And softmax and loss - Matrix<double,Dynamic,Dynamic> output_probs(m->output_vocab_size, ngram.cols()); - double minibatch_log_likelihood; - SoftmaxLogLoss().fProp(scores.leftCols(ngram.cols()), ngram.row(m->ngram_size-1), output_probs, minibatch_log_likelihood); - for (int j=0; j<ngram.cols(); j++) - { - int output = ngram(m->ngram_size-1, j); - log_probs(0, j) = weight * output_probs(output, j); - } - } - else - { - for (int j=0; j<ngram.cols(); j++) - { - int output = ngram(m->ngram_size-1, j); - if (prop.skip_hidden) - log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, j); - else - log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, j); - } - } - } + // Make sure that we're single threaded. 
Multithreading doesn't help, + // and in some cases can hurt quite a lot + int save_threads = omp_get_max_threads(); + omp_set_num_threads(1); + int save_eigen_threads = Eigen::nbThreads(); + Eigen::setNbThreads(1); +#ifdef __INTEL_MKL__ + int save_mkl_threads = mkl_get_max_threads(); + mkl_set_num_threads(1); +#endif + + prop.fProp(ngram.col(0)); - int get_order() const { return m->ngram_size; } + int output = ngram(m->ngram_size-1, 0); + double log_prob; - void read(const std::string &filename) + start_timer(3); + if (normalization) { - m->read(filename); - resize(); - // this is faster but takes more memory - //m->premultiply(); + Eigen::Matrix<double,Eigen::Dynamic,1> scores(m->output_vocab_size); + if (prop.skip_hidden) + prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores); + else + prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); + double logz = logsum(scores.col(0)); + log_prob = weight * (scores(output, 0) - logz); } - - void set_cache(std::size_t cache_size) + else { - this->cache_size = cache_size; - cache_keys.resize(m->ngram_size, cache_size); - cache_keys.fill(-1); // clears cache - cache_values.resize(cache_size); - cache_lookups = cache_hits = 0; + if (prop.skip_hidden) + log_prob = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, 0); + else + log_prob = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, 0); } + stop_timer(3); - double cache_hit_rate() + if (cache_size) { - return static_cast<double>(cache_hits)/cache_lookups; + // Update cache + cache_keys.col(hash) = ngram; + cache_values[hash] = log_prob; } - void premultiply() +#ifdef __INTEL_MKL__ + mkl_set_num_threads(save_mkl_threads); +#endif + Eigen::setNbThreads(save_eigen_threads); + omp_set_num_threads(save_threads); + + return log_prob; + } + + // Look up many n-grams in parallel. 
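A minimal caller-side sketch of this batched interface, assuming a trained model file and already-numberized word ids; the file name, batch width, and zero-filled id matrix are illustrative only, not part of this code. Each column of the id matrix holds one n-gram, and one log-probability is written back per column:

    #include <Eigen/Dense>
    #include "neuralLM.h"

    void score_batch_example()
    {
        nplm::neuralLM lm;
        lm.read("model.nplm");               // hypothetical model file
        const int order = lm.get_order();
        const int batch = 64;                // illustrative minibatch width
        lm.set_width(batch);                 // widen the propagator to the batch size

        // One n-gram per column; real ids would come from lm.lookup_word() on the tokens.
        Eigen::MatrixXi ngrams = Eigen::MatrixXi::Zero(order, batch);
        Eigen::RowVectorXd log_probs(batch);

        lm.lookup_ngram(ngrams, log_probs);  // log_probs(0, j) = log P(w_j | context_j)
    }
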
+ template <typename DerivedA, typename DerivedB> + void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const) + { + UNCONST(DerivedB, log_probs_const, log_probs); + assert (ngram.rows() == m->ngram_size); + //assert (ngram.cols() <= prop.get_minibatch_size()); + + prop.fProp(ngram); + + if (normalization) + { + Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> scores(m->output_vocab_size, ngram.cols()); + if (prop.skip_hidden) + prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores); + else + prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); + + // And softmax and loss + Matrix<double,Dynamic,Dynamic> output_probs(m->output_vocab_size, ngram.cols()); + double minibatch_log_likelihood; + SoftmaxLogLoss().fProp(scores.leftCols(ngram.cols()), ngram.row(m->ngram_size-1), output_probs, minibatch_log_likelihood); + for (int j=0; j<ngram.cols(); j++) + { + int output = ngram(m->ngram_size-1, j); + log_probs(0, j) = weight * output_probs(output, j); + } + } + else + { + for (int j=0; j<ngram.cols(); j++) + { + int output = ngram(m->ngram_size-1, j); + if (prop.skip_hidden) + log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, j); + else + log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, j); + } + } + } + + int get_order() const { return m->ngram_size; } + + void read(const std::string &filename) + { + m->read(filename); + resize(); + // this is faster but takes more memory + //m->premultiply(); + } + + void set_cache(std::size_t cache_size) + { + this->cache_size = cache_size; + cache_keys.resize(m->ngram_size, cache_size); + cache_keys.fill(-1); // clears cache + cache_values.resize(cache_size); + cache_lookups = cache_hits = 0; + } + + double cache_hit_rate() + { + return static_cast<double>(cache_hits)/cache_lookups; + } + + void premultiply() + { + if (!m->premultiplied) { - if (!m->premultiplied) - { - m->premultiply(); - } + m->premultiply(); } + } }; diff --git a/src/neuralTM.h b/src/neuralTM.h index 14bc7bf..9bb6d16 100644 --- a/src/neuralTM.h +++ b/src/neuralTM.h @@ -6,125 +6,139 @@ #include <cstdlib> #include <boost/shared_ptr.hpp> -#include <../3rdparty/Eigen/Dense> +#include <Eigen/Dense> #include "util.h" #include "vocabulary.h" #include "neuralNetwork.h" +#include "replace_digits.hpp" namespace nplm { -class neuralTM : public neuralNetwork +class neuralTM : public neuralNetwork, graehl::replace_digits { - char map_digits; - boost::shared_ptr<vocabulary> input_vocab, output_vocab; - int start, null; + boost::shared_ptr<vocabulary> input_vocab, output_vocab; + int start, null; -public: - neuralTM() + public: + neuralTM() : neuralNetwork(), - map_digits(0), + graehl::replace_digits(0), input_vocab(new vocabulary()), output_vocab(new vocabulary()) - { - } - - void set_map_digits(char value) { map_digits = value; } - - void set_input_vocabulary(const vocabulary &vocab) - { - *(this->input_vocab) = vocab; - start = vocab.lookup_word("<s>"); - null = vocab.lookup_word("<null>"); - } - - void set_output_vocabulary(const vocabulary &vocab) - { - *(this->output_vocab) = vocab; - } - - const vocabulary &get_input_vocabulary() const { return *(this->input_vocab); } - const vocabulary &get_output_vocabulary() const { return *(this->output_vocab); } - - int lookup_input_word(const std::string &word) const - { - if (map_digits) 
- for (int i=0; i<word.length(); i++) - if (isdigit(word[i])) - { - std::string mapped_word(word); - for (; i<word.length(); i++) - if (isdigit(word[i])) - mapped_word[i] = map_digits; - return input_vocab->lookup_word(mapped_word); - } - return input_vocab->lookup_word(word); - } - - int lookup_output_word(const std::string &word) const - { - if (map_digits) - for (int i=0; i<word.length(); i++) - if (isdigit(word[i])) - { - std::string mapped_word(word); - for (; i<word.length(); i++) - if (isdigit(word[i])) - mapped_word[i] = map_digits; - return output_vocab->lookup_word(mapped_word); - } - return output_vocab->lookup_word(word); - } - - double lookup_ngram(const int *ngram_a, int n) - { - Eigen::Matrix<int,Eigen::Dynamic,1> ngram(m->ngram_size); - for (int i=0; i<m->ngram_size; i++) - { - if (i-m->ngram_size+n < 0) - { - if (ngram_a[0] == start) - ngram(i) = start; - else - ngram(i) = null; - } - else - { - ngram(i) = ngram_a[i-m->ngram_size+n]; - } - } - return neuralNetwork::lookup_ngram(ngram); - } - - double lookup_ngram(const std::vector<int> &ngram_v) - { - return lookup_ngram(ngram_v.data(), ngram_v.size()); - } - - template <typename Derived> - double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram) - { - return neuralNetwork::lookup_ngram(ngram); - } - - template <typename DerivedA, typename DerivedB> - void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const) - { - return neuralNetwork::lookup_ngram(ngram, log_probs_const); - } - - void read(const std::string &filename) + { + } + + void set_map_digits(char value) { map_digits = value; } + + void set_input_vocabulary(const vocabulary &vocab) + { + *(this->input_vocab) = vocab; + start = vocab.lookup_word("<s>"); + null = vocab.lookup_word("<null>"); + } + + void set_output_vocabulary(const vocabulary &vocab) + { + *(this->output_vocab) = vocab; + } + + const vocabulary &get_input_vocabulary() const { return *(this->input_vocab); } + const vocabulary &get_output_vocabulary() const { return *(this->output_vocab); } + + int lookup_word(const std::string &word, vocabulary const& vocab) const + { + if (map_digits) + for (int i=0, n=word.size(); i<n; ++i) + if (graehl::ascii_digit(word[i])) { + std::string mapped_word(word); + replace(mapped_word, i); + return vocab.lookup_word(mapped_word); + } + return vocab.lookup_word(word); + } + + int lookup_word(std::pair<char const*, char const*> slice, vocabulary const& vocab) const + { + if (map_digits) + for (char const* i = slice.first; i != slice.second; ++i) + if (graehl::ascii_digit(*i)) { + std::string mapped_word(slice.first, slice.second); + replace(mapped_word, i - slice.first); + return vocab.lookup_word(mapped_word); + } + return vocab.lookup_word(slice); + } + + int lookup_input_word(const std::string &word) const + { + return lookup_word(word, *input_vocab); + } + + int lookup_output_word(const std::string &word) const + { + return lookup_word(word, *output_vocab); + } + + int lookup_input_word(std::pair<char const*, char const*> word) const + { + return lookup_word(word, *input_vocab); + } + + int lookup_output_word(std::pair<char const*, char const*> word) const + { + return lookup_word(word, *output_vocab); + } + + double lookup_ngram(const int *ngram_a, int n) + { + Eigen::Matrix<int,Eigen::Dynamic,1> ngram(m->ngram_size); + for (int i=0; i<m->ngram_size; i++) { - std::vector<std::string> input_words; - std::vector<std::string> output_words; - m->read(filename, input_words, output_words); - 
set_input_vocabulary(vocabulary(input_words)); - set_output_vocabulary(vocabulary(output_words)); - resize(); - // this is faster but takes more memory - //m->premultiply(); + if (i-m->ngram_size+n < 0) + { + if (ngram_a[0] == start) + ngram(i) = start; + else + ngram(i) = null; + } + else + { + ngram(i) = ngram_a[i-m->ngram_size+n]; + } } + return neuralNetwork::lookup_ngram(ngram); + } + + double lookup_ngram(const std::vector<int> &ngram_v) + { + return lookup_ngram(ngram_v.data(), ngram_v.size()); + } + + template <typename Derived> + double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram) + { + return neuralNetwork::lookup_ngram(ngram); + } + + template <typename DerivedA, typename DerivedB> + void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const) + { + return neuralNetwork::lookup_ngram(ngram, log_probs_const); + } + + void read(const std::string &filename) + { + std::vector<std::string> input_words; + std::vector<std::string> output_words; + m->read(filename, input_words, output_words); + set_input_vocabulary(vocabulary(input_words)); + set_output_vocabulary(vocabulary(output_words)); + resize(); + // this is faster but takes more memory + //m->premultiply(); + } }; diff --git a/src/prepareNeuralLM.cpp b/src/prepareNeuralLM.cpp index adedc72..d5fc16b 100644 --- a/src/prepareNeuralLM.cpp +++ b/src/prepareNeuralLM.cpp @@ -2,19 +2,19 @@ #include <vector> #include <queue> #include <deque> -# include <fstream> -# include <iterator> - -# include <boost/unordered_map.hpp> -# include <boost/algorithm/string/join.hpp> -# include <boost/interprocess/managed_shared_memory.hpp> -# include <boost/interprocess/allocators/allocator.hpp> -# include <boost/interprocess/managed_mapped_file.hpp> +#include <fstream> +#include <iterator> + +#include <boost/unordered_map.hpp> +#include <boost/algorithm/string/join.hpp> +#include <boost/interprocess/managed_shared_memory.hpp> +#include <boost/interprocess/allocators/allocator.hpp> +#include <boost/interprocess/managed_mapped_file.hpp> #include <boost/interprocess/containers/vector.hpp> #include <boost/random/mersenne_twister.hpp> #include <boost/random/uniform_int_distribution.hpp> -# include <tclap/CmdLine.h> +#include <tclap/CmdLine.h> #include "neuralLM.h" #include "util.h" @@ -36,314 +36,313 @@ typedef std::vector<vec,vecAllocator> vecvec; typedef long long int data_size_t; // training data can easily exceed 2G instances template<typename T> -void writeNgrams(const T &data, - int ngram_size, - const vocabulary &vocab, - bool numberize, - bool add_start_stop, - bool ngramize, - const string &filename) - { - ofstream file(filename.c_str()); - if (!file) +void writeNgrams(const T &data, + int ngram_size, + const vocabulary &vocab, + bool numberize, + bool add_start_stop, + bool ngramize, + const string &filename) +{ + ofstream file(filename.c_str()); + if (!file) + { + cerr << "error: could not open " << filename << endl; + exit(1); + } + + vector<vector<int> > ngrams; + + for (int i=0; i<data.size(); i++) { + preprocessWords(data[i], ngrams, ngram_size, vocab, numberize, add_start_stop, ngramize); + // write out n-grams + for (int j=0; j<ngrams.size(); j++) { - cerr << "error: could not open " << filename << endl; - exit(1); - } - - vector<vector<int> > ngrams; - - for (int i=0; i<data.size(); i++) { - preprocessWords(data[i], ngrams, ngram_size, vocab, numberize, add_start_stop, ngramize); - // write out n-grams - for (int j=0; j<ngrams.size(); j++) - { - for (int k=0; k<ngram_size; k++) - { 
- file << ngrams[j][k] << " "; - } - file << endl; - } + for (int k=0; k<ngram_size; k++) + { + file << ngrams[j][k] << " "; + } + file << endl; } - file.close(); + } + file.close(); } // Space efficient version for writing the n-grams. // They are not read into memory. -void writeNgrams(const string &input_filename, - int ngram_size, - const vocabulary &vocab, - bool numberize, - bool add_start_stop, - bool ngramize, - const string &filename, - int train_data_size, - vector<float> &sent_weights, - const string &sent_weights_filename) +void writeNgrams(const string &input_filename, + int ngram_size, + const vocabulary &vocab, + bool numberize, + bool add_start_stop, + bool ngramize, + const string &filename, + int train_data_size, + vector<float> &sent_weights, + const string &sent_weights_filename) { - ofstream file(filename.c_str()); - ofstream output_sent_weights_file(sent_weights_filename.c_str()); - if (!file) - { - cerr << "error: could not open " << filename << endl; - exit(1); + ofstream file(filename.c_str()); + ofstream output_sent_weights_file(sent_weights_filename.c_str()); + if (!file) + { + cerr << "error: could not open " << filename << endl; + exit(1); + } + + ifstream input_file(input_filename.c_str()); + vector<vector<int> > ngrams; + //for (int i=0; i<train_data.size(); i++) { + string line; + int counter = 0; + cerr<<"Processed ... "; + while (getline(input_file,line) && train_data_size-- > 0) { + counter++; + if ((counter % 100000) == 0) { + cerr<<counter<<" training lines ... "; } - - ifstream input_file(input_filename.c_str()); - vector<vector<int> > ngrams; - //for (int i=0; i<train_data.size(); i++) { - string line; - int counter = 0; - cerr<<"Processed ... "; - while (getline(input_file,line) && train_data_size-- > 0) { - counter++; - if ((counter % 100000) == 0) { - cerr<<counter<<" training lines ... "; - } - //stringstream lstr(line); - vector<string> lstr_items; - splitBySpace(line,lstr_items); + //stringstream lstr(line); + vector<string> lstr_items; + splitBySpace(line,lstr_items); //for (int i=0; i<data.size(); i++) { - preprocessWords(lstr_items, - ngrams, - ngram_size, - vocab, - numberize, - add_start_stop, - ngramize); - - // write out n-grams - for (int j=0; j<ngrams.size(); j++) - { - if (sent_weights.size() != 0) { - output_sent_weights_file <<sent_weights[counter-1]<<endl; - } - for (int k=0; k<ngram_size; k++) - { - file << ngrams[j][k] << " "; - } - file << endl; - } + preprocessWords(lstr_items, + ngrams, + ngram_size, + vocab, + numberize, + add_start_stop, + ngramize); + + // write out n-grams + for (int j=0; j<ngrams.size(); j++) + { + if (sent_weights.size() != 0) { + output_sent_weights_file <<sent_weights[counter-1]<<endl; + } + for (int k=0; k<ngram_size; k++) + { + file << ngrams[j][k] << " "; + } + file << endl; } - cerr<<endl; - input_file.close(); - file.close(); - output_sent_weights_file.close(); + } + cerr<<endl; + input_file.close(); + file.close(); + output_sent_weights_file.close(); } // Space efficient version for writing the n-grams. // They are not read into memory. 
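The memory-mapped writer below lays the numberized n-grams out as a flat vector of ints, ngram_size entries per n-gram, inside a Boost.Interprocess managed mapped file and, when --randomize is set, shuffles them in blocks of roughly five million n-grams (the final block absorbs the remainder). A self-contained sketch of that row shuffle on an ordinary std::vector, assuming the same fixed seed; as in the loop below, the swap partner j is drawn from [0, i-1] rather than [0, i]:

    #include <utility>
    #include <vector>
    #include <boost/random/mersenne_twister.hpp>
    #include <boost/random/uniform_int_distribution.hpp>

    // Shuffle fixed-width rows stored back to back in a flat vector of word ids.
    void shuffle_rows(std::vector<int> &flat, int ngram_size)
    {
        boost::random::mt19937 rng(1234);    // fixed seed, mirroring the writer below
        long long num_rows = flat.size() / ngram_size;
        for (long long i = num_rows - 1; i > 0; i--)
        {
            long long j = boost::random::uniform_int_distribution<long long>(0, i - 1)(rng);
            for (int k = 0; k < ngram_size; k++)
                std::swap(flat[i * ngram_size + k], flat[j * ngram_size + k]);
        }
    }
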
-void writeMmapNgrams(const string &input_filename, - int ngram_size, - const vocabulary &vocab, - bool numberize, - bool add_start_stop, - bool ngramize, - const string &filename, - unsigned long train_data_size, - data_size_t num_tokens, - bool randomize) +void writeMmapNgrams(const string &input_filename, + int ngram_size, + const vocabulary &vocab, + bool numberize, + bool add_start_stop, + bool ngramize, + const string &filename, + unsigned long train_data_size, + data_size_t num_tokens, + bool randomize) { - cerr<<"Num tokens is "<<num_tokens<<endl; - cerr<<"Training data size is "<<train_data_size<<endl; - // Open the memory mapped file and create the allocators - ip::managed_mapped_file mfile(ip::create_only, - filename.c_str(), - num_tokens*ngram_size*sizeof(int)+1024UL*1024UL); - intAllocator ialloc(mfile.get_segment_manager()); - vecAllocator valloc (mfile.get_segment_manager()); - //vecvec *mMapVecVec= mfile.construct<vecvec>("data")(num_tokens,vec(ialloc),valloc); - - vec *mMapVec= mfile.construct<vec>("vector")(num_tokens*ngram_size,0,ialloc); - - cerr<<"The size of mmaped vec is "<<mMapVec->size()<<endl; - // Going over every line in the input file and - // printing the memory mapped ngrams into the - // output file - ifstream input_file(input_filename.c_str()); - //for (int i=0; i<train_data.size(); i++) { - string line; - int counter = 0; - cerr<<"Processed ... "; - long int train_ngram_counter = 0; - vector<vector<int> > ngrams; - while (getline(input_file,line) && train_data_size-- > 0) { - counter++; - if ((counter % 100000) ==0) { - //cerr<<"counter is "<<counter<<endl; - cerr<<counter<<" training lines ... "; - } - //stringstream lstr(line); - vector<string> lstr_items; - splitBySpace(line,lstr_items); + cerr<<"Num tokens is "<<num_tokens<<endl; + cerr<<"Training data size is "<<train_data_size<<endl; + // Open the memory mapped file and create the allocators + ip::managed_mapped_file mfile(ip::create_only, + filename.c_str(), + num_tokens*ngram_size*sizeof(int)+1024UL*1024UL); + intAllocator ialloc(mfile.get_segment_manager()); + vecAllocator valloc (mfile.get_segment_manager()); + //vecvec *mMapVecVec= mfile.construct<vecvec>("data")(num_tokens,vec(ialloc),valloc); + + vec *mMapVec= mfile.construct<vec>("vector")(num_tokens*ngram_size,0,ialloc); + + cerr<<"The size of mmaped vec is "<<mMapVec->size()<<endl; + // Going over every line in the input file and + // printing the memory mapped ngrams into the + // output file + ifstream input_file(input_filename.c_str()); + //for (int i=0; i<train_data.size(); i++) { + string line; + int counter = 0; + cerr<<"Processed ... "; + long int train_ngram_counter = 0; + vector<vector<int> > ngrams; + while (getline(input_file,line) && train_data_size-- > 0) { + counter++; + if ((counter % 100000) ==0) { + //cerr<<"counter is "<<counter<<endl; + cerr<<counter<<" training lines ... 
"; + } + //stringstream lstr(line); + vector<string> lstr_items; + splitBySpace(line,lstr_items); //for (int i=0; i<data.size(); i++) { - preprocessWords(lstr_items, ngrams, - ngram_size, - vocab, - numberize, - add_start_stop, - ngramize); - /* + preprocessWords(lstr_items, ngrams, + ngram_size, + vocab, + numberize, + add_start_stop, + ngramize); + /* cerr<<"line is "<<endl; cerr<<line<<endl; cerr<<"Number of ngrams is "<<ngrams.size()<<endl; - if (ngrams.size() ==1 ){ - cerr<<"The line number was "<<counter<<endl; - cerr<<line<<endl; + if (ngrams.size() ==1 ){ + cerr<<"The line number was "<<counter<<endl; + cerr<<line<<endl; + } + */ + // write out n-grams in mmapped file + for (int j=0; j<ngrams.size(); j++) + { + /* + for (int k=0; k<ngram_size; k++) + { + cerr << ngrams[j][k] << " "; } + cerr<< endl; */ - // write out n-grams in mmapped file - for (int j=0; j<ngrams.size(); j++) - { - /* - for (int k=0; k<ngram_size; k++) - { - cerr << ngrams[j][k] << " "; - } - cerr<< endl; - */ - for (int k=0; k<ngram_size; k++) { - mMapVec->at(train_ngram_counter*ngram_size+k) = ngrams[j][k]; - } - train_ngram_counter++; - //cerr<<"Train ngram counter is "<<train_ngram_counter<<endl; - } + for (int k=0; k<ngram_size; k++) { + mMapVec->at(train_ngram_counter*ngram_size+k) = ngrams[j][k]; + } + train_ngram_counter++; + //cerr<<"Train ngram counter is "<<train_ngram_counter<<endl; } - cerr<<endl; - input_file.close(); - - // Shrink the file if it was overused - ip::managed_mapped_file::shrink_to_fit(filename.c_str()); - //now to randomize the items if the randomize flag was set - if (randomize == true) { - unsigned seed = 1234; //for testing only - mt19937 rng(seed); - cerr<<"Randomly shuffling data..."; - data_size_t counter =0; - while (counter < num_tokens) { - data_size_t upper_limit = counter+5000000; - long int vector_size = 5000000; - if (counter + 10000000 >= num_tokens) { - upper_limit = num_tokens; - vector_size = num_tokens - counter; - } - vector<int> temp(vector_size*ngram_size,0); - for (int i=0;i<vector_size;i++){ - for (int k=0;k<ngram_size;k++) { - temp[i*ngram_size+k] = mMapVec->at((i+counter)*ngram_size+k); - } - } - for (data_size_t i=vector_size-1; i>0; i--) - { - if (i %500000 == 0) { - cerr<<"Shuffled "<<num_tokens-1<<" instances..."; - } - data_size_t j = boost::random::uniform_int_distribution<data_size_t>(0, i-1)(rng); - for (int k=0;k<ngram_size;k++) { - int temp_val = temp.at(i*ngram_size+k); - temp.at(i*ngram_size+k) = - temp.at(j*ngram_size+k); - temp.at(j*ngram_size+k) = temp_val; - } - } - //Putting it back - for (int i=0;i<vector_size;i++){ - for (int k=0;k<ngram_size;k++) { - mMapVec->at((i+counter)*ngram_size+k) = temp[i*ngram_size+k]; - } - } - counter = upper_limit; + } + cerr<<endl; + input_file.close(); + + // Shrink the file if it was overused + ip::managed_mapped_file::shrink_to_fit(filename.c_str()); + //now to randomize the items if the randomize flag was set + if (randomize == true) { + unsigned seed = 1234; //for testing only + boost::random::mt19937 rng(seed); + cerr<<"Randomly shuffling data..."; + data_size_t counter =0; + while (counter < num_tokens) { + data_size_t upper_limit = counter+5000000; + long int vector_size = 5000000; + if (counter + 10000000 >= num_tokens) { + upper_limit = num_tokens; + vector_size = num_tokens - counter; + } + vector<int> temp(vector_size*ngram_size,0); + for (int i=0;i<vector_size;i++){ + for (int k=0;k<ngram_size;k++) { + temp[i*ngram_size+k] = mMapVec->at((i+counter)*ngram_size+k); } - - /* - for (data_size_t 
i=num_tokens-1; i>0; i--) + } + for (data_size_t i=vector_size-1; i>0; i--) { if (i %500000 == 0) { cerr<<"Shuffled "<<num_tokens-1<<" instances..."; } data_size_t j = boost::random::uniform_int_distribution<data_size_t>(0, i-1)(rng); for (int k=0;k<ngram_size;k++) { - int temp_val = mMapVec->at(i*ngram_size+k); - mMapVec->at(i*ngram_size+k) = - mMapVec->at(j*ngram_size+k); - mMapVec->at(j*ngram_size+k) = temp_val; + int temp_val = temp.at(i*ngram_size+k); + temp.at(i*ngram_size+k) = + temp.at(j*ngram_size+k); + temp.at(j*ngram_size+k) = temp_val; } } - */ - cerr<<endl; + //Putting it back + for (int i=0;i<vector_size;i++){ + for (int k=0;k<ngram_size;k++) { + mMapVec->at((i+counter)*ngram_size+k) = temp[i*ngram_size+k]; + } + } + counter = upper_limit; } + + /* + for (data_size_t i=num_tokens-1; i>0; i--) + { + if (i %500000 == 0) { + cerr<<"Shuffled "<<num_tokens-1<<" instances..."; + } + data_size_t j = boost::random::uniform_int_distribution<data_size_t>(0, i-1)(rng); + for (int k=0;k<ngram_size;k++) { + int temp_val = mMapVec->at(i*ngram_size+k); + mMapVec->at(i*ngram_size+k) = + mMapVec->at(j*ngram_size+k); + mMapVec->at(j*ngram_size+k) = temp_val; + } + } + */ + cerr<<endl; + } } int main(int argc, char *argv[]) { - ios::sync_with_stdio(false); - int ngram_size, vocab_size, validation_size; - bool numberize, - ngramize, - add_start_stop, - mmap_file, - randomize; - - string train_text, - train_file, - validation_text, - validation_file, - words_file, - write_words_file, - sent_weights_text, - output_sent_weights_text; - - try - { - CmdLine cmd("Prepares training data for training a language model.", ' ', "0.1"); - - // The options are printed in reverse order + ios::sync_with_stdio(false); + int ngram_size, vocab_size, validation_size; + bool numberize, + ngramize, + add_start_stop, + mmap_file, + randomize; + + string train_text, + train_file, + validation_text, + validation_file, + words_file, + write_words_file, + sent_weights_text, + output_sent_weights_text; + + try + { + CmdLine cmd("Prepares training data for training a language model.", ' ', "0.1"); + + // The options are printed in reverse order ValueArg<bool> arg_ngramize("", "ngramize", "If true, convert lines to ngrams. Default: true.", false, true, "bool", cmd); ValueArg<bool> arg_numberize("", "numberize", "If true, convert words to numbers. Default: true.", false, true, "bool", cmd); ValueArg<bool> arg_add_start_stop("", "add_start_stop", "If true, prepend <s> and append </s>. Default: true.", false, true, "bool", cmd); ValueArg<bool> arg_mmap_file("", "mmap_file", "If true, the training file will be a memory mapped file. \n This is " - "needed if the entire training data cannot fit in memory. Default: false.", false, false, "bool", cmd); + "needed if the entire training data cannot fit in memory. Default: false.", false, false, "bool", cmd); ValueArg<bool> arg_randomize("", "randomize", "If true, Randomly shuffle the training ngrams. It can only be used with mmap_file =1 . 
Default: false.", false, false, "bool", cmd); ValueArg<int> arg_vocab_size("", "vocab_size", "Vocabulary size.", false, -1, "int", cmd); ValueArg<string> arg_words_file("", "words_file", "File specifying words that should be included in vocabulary; all other words will be replaced by <unk>.", false, "", "string", cmd); ValueArg<int> arg_ngram_size("", "ngram_size", "Size of n-grams.", true, -1, "int", cmd); - ValueArg<string> arg_write_words_file("", "write_words_file", "Output vocabulary.", false, "", "string", cmd); + ValueArg<string> arg_write_words_file("", "write_words_file", "Output vocabulary.", false, "", "string", cmd); ValueArg<int> arg_validation_size("", "validation_size", "How many lines from training data to hold out for validation. Default: 0.", false, 0, "int", cmd); - ValueArg<string> arg_validation_file("", "validation_file", "Output validation data (numberized n-grams).", false, "", "string", cmd); - ValueArg<string> arg_validation_text("", "validation_text", "Input validation data (tokenized). Overrides --validation_size. Default: none.", false, "", "string", cmd); - ValueArg<string> arg_train_file("", "train_file", "Output training data (numberized n-grams).", false, "", "string", cmd); - ValueArg<string> arg_train_text("", "train_text", "Input training data (tokenized).", true, "", "string", cmd); - //ValueArg<string> arg_sent_weights_text("", "sent_weights_text", "The sentence weights text", false, "", "string", cmd); - //ValueArg<string> arg_sent_weights_file("", "sent_weights_file", "The file to write the per ngram weights", false, "", "string", cmd); - - - - cmd.parse(argc, argv); - - train_text = arg_train_text.getValue(); - train_file = arg_train_file.getValue(); - validation_text = arg_validation_text.getValue(); - validation_file = arg_validation_file.getValue(); - validation_size = arg_validation_size.getValue(); - write_words_file = arg_write_words_file.getValue(); - ngram_size = arg_ngram_size.getValue(); - vocab_size = arg_vocab_size.getValue(); - words_file = arg_words_file.getValue(); - numberize = arg_numberize.getValue(); - ngramize = arg_ngramize.getValue(); - add_start_stop = arg_add_start_stop.getValue(); - mmap_file = arg_mmap_file.getValue(); - randomize = arg_randomize.getValue(); - //sent_weights_text = arg_sent_weights_text.getValue(); - //output_sent_weights_text = arg_sent_weights_file.getValue(); - sent_weights_text = ""; - output_sent_weights_text = ""; + ValueArg<string> arg_validation_file("", "validation_file", "Output validation data (numberized n-grams).", false, "", "string", cmd); + ValueArg<string> arg_validation_text("", "validation_text", "Input validation data (tokenized). Overrides --validation_size. 
Default: none.", false, "", "string", cmd); + ValueArg<string> arg_train_file("", "train_file", "Output training data (numberized n-grams).", false, "", "string", cmd); + ValueArg<string> arg_train_text("", "train_text", "Input training data (tokenized).", true, "", "string", cmd); + //ValueArg<string> arg_sent_weights_text("", "sent_weights_text", "The sentence weights text", false, "", "string", cmd); + //ValueArg<string> arg_sent_weights_file("", "sent_weights_file", "The file to write the per ngram weights", false, "", "string", cmd); + + + cmd.parse(argc, argv); + + train_text = arg_train_text.getValue(); + train_file = arg_train_file.getValue(); + validation_text = arg_validation_text.getValue(); + validation_file = arg_validation_file.getValue(); + validation_size = arg_validation_size.getValue(); + write_words_file = arg_write_words_file.getValue(); + ngram_size = arg_ngram_size.getValue(); + vocab_size = arg_vocab_size.getValue(); + words_file = arg_words_file.getValue(); + numberize = arg_numberize.getValue(); + ngramize = arg_ngramize.getValue(); + add_start_stop = arg_add_start_stop.getValue(); + mmap_file = arg_mmap_file.getValue(); + randomize = arg_randomize.getValue(); + //sent_weights_text = arg_sent_weights_text.getValue(); + //output_sent_weights_text = arg_sent_weights_file.getValue(); + sent_weights_text = ""; + output_sent_weights_text = ""; // check command line arguments @@ -364,292 +363,292 @@ int main(int argc, char *argv[]) cerr << "Command line: " << endl; cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl; - - const string sep(" Value: "); - cerr << arg_train_text.getDescription() << sep << arg_train_text.getValue() << endl; - cerr << arg_train_file.getDescription() << sep << arg_train_file.getValue() << endl; - cerr << arg_validation_text.getDescription() << sep << arg_validation_text.getValue() << endl; - cerr << arg_validation_file.getDescription() << sep << arg_validation_file.getValue() << endl; - cerr << arg_validation_size.getDescription() << sep << arg_validation_size.getValue() << endl; - cerr << arg_write_words_file.getDescription() << sep << arg_write_words_file.getValue() << endl; - cerr << arg_ngram_size.getDescription() << sep << arg_ngram_size.getValue() << endl; - cerr << arg_vocab_size.getDescription() << sep << arg_vocab_size.getValue() << endl; - cerr << arg_words_file.getDescription() << sep << arg_words_file.getValue() << endl; - cerr << arg_numberize.getDescription() << sep << arg_numberize.getValue() << endl; - cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl; - cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl; - cerr << arg_mmap_file.getDescription() << sep << arg_mmap_file.getValue() << endl; - //cerr << arg_sent_weights_text.getDescription() << sep << arg_sent_weights_text.getValue() << endl; - } - catch (TCLAP::ArgException &e) - { - cerr << "error: " << e.error() << " for arg " << e.argId() << endl; - exit(1); - } - // VLF: why is this true? - // DC: it's because the vocabulary has to be constructed from the training data only. - // If the vocabulary is preset, we can't create the validation data. - // - if --numberize 0 is set, then --validation_size cannot be used. - // if (!numberize && (validation_size > 0)) { - // cerr << "Warning: without setting --numberize to 1, --validation_size cannot be used." 
<< endl; - // } - - // Read in training data and validation data - // vector<vector<string> > train_data; - // readSentFile(train_text, train_data); - // @vaswani: No more reading the entire training file into memory - // Reading it per line with file io - - //for (int i=0; i<train_data.size(); i++) { - // Go over every line in the file and - // 1. if the !ngramize then you should check if - // we have the correct number of items per line - // 2. build the vocabulary if the words file has not - // been specified. - // Construct vocabulary - vocabulary vocab; - int start, stop; - // Add start stop if the vocabulary has not been supplied - if (words_file == "") { - vocab.insert_word("<s>"); - vocab.insert_word("</s>"); - vocab.insert_word("<null>"); - // warn user that if --numberize is not set, there will be no vocabulary! - if (!numberize) { - cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" << endl; - } - } - if (mmap_file == false && randomize == true) { - cerr<<"Randomize option can only be used with mmap_file = 1"<<endl; - exit(1); + const string sep(" Value: "); + cerr << arg_train_text.getDescription() << sep << arg_train_text.getValue() << endl; + cerr << arg_train_file.getDescription() << sep << arg_train_file.getValue() << endl; + cerr << arg_validation_text.getDescription() << sep << arg_validation_text.getValue() << endl; + cerr << arg_validation_file.getDescription() << sep << arg_validation_file.getValue() << endl; + cerr << arg_validation_size.getDescription() << sep << arg_validation_size.getValue() << endl; + cerr << arg_write_words_file.getDescription() << sep << arg_write_words_file.getValue() << endl; + cerr << arg_ngram_size.getDescription() << sep << arg_ngram_size.getValue() << endl; + cerr << arg_vocab_size.getDescription() << sep << arg_vocab_size.getValue() << endl; + cerr << arg_words_file.getDescription() << sep << arg_words_file.getValue() << endl; + cerr << arg_numberize.getDescription() << sep << arg_numberize.getValue() << endl; + cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl; + cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl; + cerr << arg_mmap_file.getDescription() << sep << arg_mmap_file.getValue() << endl; + //cerr << arg_sent_weights_text.getDescription() << sep << arg_sent_weights_text.getValue() << endl; + } + catch (TCLAP::ArgException &e) + { + cerr << "error: " << e.error() << " for arg " << e.argId() << endl; + exit(1); + } + + // VLF: why is this true? + // DC: it's because the vocabulary has to be constructed from the training data only. + // If the vocabulary is preset, we can't create the validation data. + // - if --numberize 0 is set, then --validation_size cannot be used. + // if (!numberize && (validation_size > 0)) { + // cerr << "Warning: without setting --numberize to 1, --validation_size cannot be used." << endl; + // } + + // Read in training data and validation data + // vector<vector<string> > train_data; + // readSentFile(train_text, train_data); + // @vaswani: No more reading the entire training file into memory + // Reading it per line with file io + + //for (int i=0; i<train_data.size(); i++) { + // Go over every line in the file and + // 1. if the !ngramize then you should check if + // we have the correct number of items per line + // 2. build the vocabulary if the words file has not + // been specified. 
+ // Construct vocabulary + vocabulary vocab; + int start, stop; + // Add start stop if the vocabulary has not been supplied + if (words_file == "") { + vocab.insert_word("<s>"); + vocab.insert_word("</s>"); + vocab.insert_word("<null>"); + // warn user that if --numberize is not set, there will be no vocabulary! + if (!numberize) { + cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" << endl; } - unordered_map<string,int> count; // For keeping word counts if no supplied vocab - - deque<vector<string> > validation_data; - int train_data_size=0; - cerr<<"Processed ... "; - data_size_t num_tokens=0; - - ifstream training(train_text.c_str()); - - string line; - while (getline(training,line)) { - train_data_size++; - //stringstream lstr(line); - vector<string> lstr_items; - splitBySpace(line,lstr_items); - // if data is already ngramized, set/check ngram_size - if (!ngramize) { - if (ngram_size > 0) { - if (ngram_size != lstr_items.size()) { - cerr << "Error: size of training ngrams does not match specified value of --ngram_size!" << endl; - } - } - // else if --ngram_size has not been specified, set it now - else { - ngram_size=lstr_items.size(); - } + } + if (mmap_file == false && randomize == true) { + cerr<<"Randomize option can only be used with mmap_file = 1"<<endl; + exit(1); + } + unordered_map<string,int> count; // For keeping word counts if no supplied vocab + + deque<vector<string> > validation_data; + int train_data_size=0; + cerr<<"Processed ... "; + data_size_t num_tokens=0; + + ifstream training(train_text.c_str()); + + string line; + while (getline(training,line)) { + train_data_size++; + //stringstream lstr(line); + vector<string> lstr_items; + splitBySpace(line,lstr_items); + // if data is already ngramized, set/check ngram_size + if (!ngramize) { + if (ngram_size > 0) { + if (ngram_size != lstr_items.size()) { + cerr << "Error: size of training ngrams does not match specified value of --ngram_size!" << endl; + } } - if ((train_data_size%100000)==0){ - cerr<<train_data_size<<" lines ... "; + // else if --ngram_size has not been specified, set it now + else { + ngram_size=lstr_items.size(); } - //break; - /* + } + if ((train_data_size%100000)==0){ + cerr<<train_data_size<<" lines ... 
"; + } + //break; + /* if (lstr_items.size() ==1) { - cerr<<"line :"<<endl; - cerr<<line<<endl; - cerr<<"The number of items was 1"<<endl; - getchar(); - } - */ - num_tokens += lstr_items.size()+1; - if (words_file == "") { - for (int j=0; j<lstr_items.size(); j++) { - count[lstr_items[j]] += 1; - } + cerr<<"line :"<<endl; + cerr<<line<<endl; + cerr<<"The number of items was 1"<<endl; + getchar(); } - // Add to validation set if the validation size - // has not been specified - if (validation_text == "" && validation_size > 0) { - //cerr<<"validation size is "<<validation_data.size()<<endl; - if (validation_data.size() == validation_size) { - //validation_data.erase(validation_data.begin()); - validation_data.pop_front(); - } - validation_data.push_back(lstr_items); + */ + num_tokens += lstr_items.size()+1; + if (words_file == "") { + for (int j=0; j<lstr_items.size(); j++) { + count[lstr_items[j]] += 1; } } - cerr<<endl; - training.close(); - //cerr<<"validation size is "<<validation_data.size()<<endl; - //getchar(); - if (validation_data.size() < validation_size) { - cerr<<"validation size is "<<validation_data.size()<<endl; - cerr << "error: requested validation size is greater than training data size" << endl; - exit(1); + // Add to validation set if the validation size + // has not been specified + if (validation_text == "" && validation_size > 0) { + //cerr<<"validation size is "<<validation_data.size()<<endl; + if (validation_data.size() == validation_size) { + //validation_data.erase(validation_data.begin()); + validation_data.pop_front(); + } + validation_data.push_back(lstr_items); } - - train_data_size -= validation_size; - cerr<<"Training data size is "<<train_data_size<<endl; - - // The items in the validation data have already been counted - // Decrementing the counts of those words before building the vocabulary - for(int i=0; i<validation_data.size(); i++){ - num_tokens -= (validation_data[i].size() +1); - for (int j=0; j<validation_data[i].size();j++){ - count[validation_data[i][j]] -= 1; - if (count[validation_data[i][j]] == 0) { - count.erase(validation_data[i][j]); - } + } + cerr<<endl; + training.close(); + //cerr<<"validation size is "<<validation_data.size()<<endl; + //getchar(); + if (validation_data.size() < validation_size) { + cerr<<"validation size is "<<validation_data.size()<<endl; + cerr << "error: requested validation size is greater than training data size" << endl; + exit(1); + } + + train_data_size -= validation_size; + cerr<<"Training data size is "<<train_data_size<<endl; + + // The items in the validation data have already been counted + // Decrementing the counts of those words before building the vocabulary + for(int i=0; i<validation_data.size(); i++){ + num_tokens -= (validation_data[i].size() +1); + for (int j=0; j<validation_data[i].size();j++){ + count[validation_data[i][j]] -= 1; + if (count[validation_data[i][j]] == 0) { + count.erase(validation_data[i][j]); } } + } - // Getting the top n frequent words for the vocabulary - if (words_file == "") { - vocab.insert_most_frequent(count, vocab_size); - if (vocab.size() < vocab_size) { - cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl; - } + // Getting the top n frequent words for the vocabulary + if (words_file == "") { + vocab.insert_most_frequent(count, vocab_size); + if (vocab.size() < vocab_size) { + cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl; } - 
//vector<vector<string> > validation_data; - if (validation_text != "") { - readSentFile(validation_text, validation_data); - for (int i=0; i<validation_data.size(); i++) { - // if data is already ngramized, set/check ngram_size - if (!ngramize) { - // if --ngram_size has been specified, check that it does not conflict with --ngram_size - if (ngram_size > 0) { - if (ngram_size != validation_data[i].size()) { - cerr << "Error: size of validation ngrams does not match specified value of --ngram_size!" << endl; - } - } - // else if --ngram_size has not been specified, set it now - else { - ngram_size=validation_data[i].size(); - } - } + } + //vector<vector<string> > validation_data; + if (validation_text != "") { + readSentFile(validation_text, validation_data); + for (int i=0; i<validation_data.size(); i++) { + // if data is already ngramized, set/check ngram_size + if (!ngramize) { + // if --ngram_size has been specified, check that it does not conflict with --ngram_size + if (ngram_size > 0) { + if (ngram_size != validation_data[i].size()) { + cerr << "Error: size of validation ngrams does not match specified value of --ngram_size!" << endl; + } } + // else if --ngram_size has not been specified, set it now + else { + ngram_size=validation_data[i].size(); + } + } } - //READING SENTENCE WEIGHTS IF THERE ARE ANY - vector<float> sent_weights; - if (sent_weights_text != "") { - cerr<<"Reading sentence weights from "<<sent_weights_text<<endl; - ifstream sent_weights_file(sent_weights_text.c_str()); - string line; - readWeightsFile(sent_weights_file,sent_weights); - sent_weights_file.close(); - if (sent_weights_text.size() != train_data_size) { - cerr<<"The number of sentence weights does not match the number of training sentences"<<endl; - } + } + //READING SENTENCE WEIGHTS IF THERE ARE ANY + vector<float> sent_weights; + if (sent_weights_text != "") { + cerr<<"Reading sentence weights from "<<sent_weights_text<<endl; + ifstream sent_weights_file(sent_weights_text.c_str()); + string line; + readWeightsFile(sent_weights_file,sent_weights); + sent_weights_file.close(); + if (sent_weights_text.size() != train_data_size) { + cerr<<"The number of sentence weights does not match the number of training sentences"<<endl; } - - /* + } + + /* else if (validation_size > 0) { - // Create validation data - if (validation_size > train_data.size()) - { - cerr << "error: requested validation size is greater than training data size" << endl; - exit(1); - } - validation_data.insert(validation_data.end(), train_data.end()-validation_size, train_data.end()); - train_data.resize(train_data.size() - validation_size); + // Create validation data + if (validation_size > train_data.size()) + { + cerr << "error: requested validation size is greater than training data size" << endl; + exit(1); } - */ - - // Construct vocabulary - //vocabulary vocab; - //int start, stop; - - // read vocabulary from file - if (words_file != "") { - vector<string> words; - readWordsFile(words_file,words); - for(vector<string>::iterator it = words.begin(); it != words.end(); ++it) { - vocab.insert_word(*it); - } - - // was vocab_size set? 
if so, verify that it does not conflict with size of vocabulary read from file - if (vocab_size > 0) { - if (vocab.size() != vocab_size) { - cerr << "Error: size of vocabulary file " << vocab.size() << " != --vocab_size " << vocab_size << endl; - } - } - // else, set it to the size of vocabulary read from file - else { - vocab_size = vocab.size(); - } - + validation_data.insert(validation_data.end(), train_data.end()-validation_size, train_data.end()); + train_data.resize(train_data.size() - validation_size); } - /* - // construct vocabulary to contain top <vocab_size> most frequent words; all other words replaced by <unk> - else { - vocab.insert_word("<s>"); - vocab.insert_word("</s>"); - vocab.insert_word("<null>"); - - // warn user that if --numberize is not set, there will be no vocabulary! - if (!numberize) { - cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" << endl; - } - unordered_map<string,int> count; - for (int i=0; i<train_data.size(); i++) { - for (int j=0; j<train_data[i].size(); j++) { - count[train_data[i][j]] += 1; - } - } - - vocab.insert_most_frequent(count, vocab_size); - if (vocab.size() < vocab_size) { - cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl; - } + */ + + // Construct vocabulary + //vocabulary vocab; + //int start, stop; + + // read vocabulary from file + if (words_file != "") { + vector<string> words; + readWordsFile(words_file,words); + for(vector<string>::iterator it = words.begin(); it != words.end(); ++it) { + vocab.insert_word(*it); } - */ - // write vocabulary to file - if (write_words_file != "") { - cerr << "Writing vocabulary to " << write_words_file << endl; - writeWordsFile(vocab.words(), write_words_file); + // was vocab_size set? if so, verify that it does not conflict with size of vocabulary read from file + if (vocab_size > 0) { + if (vocab.size() != vocab_size) { + cerr << "Error: size of vocabulary file " << vocab.size() << " != --vocab_size " << vocab_size << endl; + } } - - // Write out numberized n-grams - if (train_file != "") - { - cerr << "Writing training data to " << train_file << endl; - if (mmap_file == true) { - writeMmapNgrams(train_text, - ngram_size, - vocab, - numberize, - add_start_stop, - ngramize, - train_file, - train_data_size, - num_tokens, - randomize); - } else { - writeNgrams(train_text, - ngram_size, - vocab, - numberize, - add_start_stop, - ngramize, - train_file, - train_data_size, - sent_weights, - output_sent_weights_text); - } + // else, set it to the size of vocabulary read from file + else { + vocab_size = vocab.size(); } - if (validation_file != "") - { - cerr << "Writing validation data to " << validation_file << endl; - writeNgrams(validation_data, - ngram_size, - vocab, - numberize, - add_start_stop, - ngramize, - validation_file); + + } + /* + // construct vocabulary to contain top <vocab_size> most frequent words; all other words replaced by <unk> + else { + vocab.insert_word("<s>"); + vocab.insert_word("</s>"); + vocab.insert_word("<null>"); + + // warn user that if --numberize is not set, there will be no vocabulary! + if (!numberize) { + cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" 
<< endl; + } + unordered_map<string,int> count; + for (int i=0; i<train_data.size(); i++) { + for (int j=0; j<train_data[i].size(); j++) { + count[train_data[i][j]] += 1; + } + } + + vocab.insert_most_frequent(count, vocab_size); + if (vocab.size() < vocab_size) { + cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl; + } + } + */ + + // write vocabulary to file + if (write_words_file != "") { + cerr << "Writing vocabulary to " << write_words_file << endl; + writeWordsFile(vocab.words(), write_words_file); + } + + // Write out numberized n-grams + if (train_file != "") + { + cerr << "Writing training data to " << train_file << endl; + if (mmap_file == true) { + writeMmapNgrams(train_text, + ngram_size, + vocab, + numberize, + add_start_stop, + ngramize, + train_file, + train_data_size, + num_tokens, + randomize); + } else { + writeNgrams(train_text, + ngram_size, + vocab, + numberize, + add_start_stop, + ngramize, + train_file, + train_data_size, + sent_weights, + output_sent_weights_text); } + } + if (validation_file != "") + { + cerr << "Writing validation data to " << validation_file << endl; + writeNgrams(validation_data, + ngram_size, + vocab, + numberize, + add_start_stop, + ngramize, + validation_file); + } } diff --git a/src/propagator.h b/src/propagator.h index 9f214de..6344f2f 100644 --- a/src/propagator.h +++ b/src/propagator.h @@ -13,360 +13,359 @@ using Eigen::MatrixBase; using Eigen::Dynamic; class propagator { - int minibatch_size; - model *pnn; - -public: - Node<Input_word_embeddings> input_layer_node; - Node<Linear_layer> first_hidden_linear_node; - Node<Activation_function> first_hidden_activation_node; - Node<Linear_layer> second_hidden_linear_node; - Node<Activation_function> second_hidden_activation_node; - Node<Output_word_embeddings> output_layer_node; - bool skip_hidden; - -public: - propagator () : minibatch_size(0), pnn(0) { } - - propagator (model &nn, int minibatch_size) + int minibatch_size; + model *pnn; + + public: + Node<Input_word_embeddings> input_layer_node; + Node<Linear_layer> first_hidden_linear_node; + Node<Activation_function> first_hidden_activation_node; + Node<Linear_layer> second_hidden_linear_node; + Node<Activation_function> second_hidden_activation_node; + Node<Output_word_embeddings> output_layer_node; + bool skip_hidden; + + public: + propagator () : minibatch_size(0), pnn(0) { } + + propagator (model &nn, int minibatch_size) : - pnn(&nn), - input_layer_node(&nn.input_layer, minibatch_size), - first_hidden_linear_node(&nn.first_hidden_linear, minibatch_size), - first_hidden_activation_node(&nn.first_hidden_activation, minibatch_size), - second_hidden_linear_node(&nn.second_hidden_linear, minibatch_size), - second_hidden_activation_node(&nn.second_hidden_activation, minibatch_size), - output_layer_node(&nn.output_layer, minibatch_size), - minibatch_size(minibatch_size) - { - skip_hidden = (nn.num_hidden == 0); - } + pnn(&nn), + input_layer_node(&nn.input_layer, minibatch_size), + first_hidden_linear_node(&nn.first_hidden_linear, minibatch_size), + first_hidden_activation_node(&nn.first_hidden_activation, minibatch_size), + second_hidden_linear_node(&nn.second_hidden_linear, minibatch_size), + second_hidden_activation_node(&nn.second_hidden_activation, minibatch_size), + output_layer_node(&nn.output_layer, minibatch_size), + minibatch_size(minibatch_size) + { + skip_hidden = (nn.num_hidden == 0); + } - // This must be called if the underlying model is resized. 
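// For orientation, a rough sketch of how this class is driven, mirroring calls that appear
// later in trainNeuralNetwork.cpp; the setup and the literal hyperparameter values here are
// assumptions, not part of this patch:
void example_training_step(nplm::model &nn,
                           const Eigen::Matrix<int,Eigen::Dynamic,Eigen::Dynamic> &minibatch,
                           const Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> &minibatch_weights,
                           int ngram_size, int minibatch_size, double learning_rate)
{
    nplm::propagator prop(nn, minibatch_size);       // one Node per layer of nn
    prop.fProp(minibatch.topRows(ngram_size-1));     // forward pass up to the last hidden layer
    // ... score the output layer and run the loss to fill minibatch_weights ...
    std::string update("SGD");                       // bProp takes the update type by reference
    prop.bProp(minibatch.topRows(ngram_size-1),
               minibatch_weights,
               learning_rate, /*momentum=*/0.0, /*L2_reg=*/0.0,
               update, /*conditioning_constant=*/0.0, /*decay=*/0.0);
}
// As the comment above says, propagator::resize() must be called again whenever the
// underlying model is resized.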
- void resize(int minibatch_size) { - this->minibatch_size = minibatch_size; - input_layer_node.resize(minibatch_size); - first_hidden_linear_node.resize(minibatch_size); - first_hidden_activation_node.resize(minibatch_size); - second_hidden_linear_node.resize(minibatch_size); - second_hidden_activation_node.resize(minibatch_size); - output_layer_node.resize(minibatch_size); - } + // This must be called if the underlying model is resized. + void resize(int minibatch_size) { + this->minibatch_size = minibatch_size; + input_layer_node.resize(minibatch_size); + first_hidden_linear_node.resize(minibatch_size); + first_hidden_activation_node.resize(minibatch_size); + second_hidden_linear_node.resize(minibatch_size); + second_hidden_activation_node.resize(minibatch_size); + output_layer_node.resize(minibatch_size); + } - void resize() { resize(minibatch_size); } + void resize() { resize(minibatch_size); } - template <typename Derived> - void fProp(const MatrixBase<Derived> &data) + template <typename Derived> + void fProp(const MatrixBase<Derived> &data) + { + if (!pnn->premultiplied) { - if (!pnn->premultiplied) - { - start_timer(0); - input_layer_node.param->fProp(data, input_layer_node.fProp_matrix); - stop_timer(0); - - start_timer(1); - first_hidden_linear_node.param->fProp(input_layer_node.fProp_matrix, - first_hidden_linear_node.fProp_matrix); - } - else - { - int n_inputs = first_hidden_linear_node.param->n_inputs(); - USCMatrix<double> sparse_data; - input_layer_node.param->munge(data, sparse_data); - - start_timer(1); - first_hidden_linear_node.param->fProp(sparse_data, - first_hidden_linear_node.fProp_matrix); - } - first_hidden_activation_node.param->fProp(first_hidden_linear_node.fProp_matrix, - first_hidden_activation_node.fProp_matrix); - //std::cerr<<"in fprop first hidden activation node fprop is "<<first_hidden_activation_node.fProp_matrix<<std::endl; - //std::getchar(); - stop_timer(1); - - - if (!skip_hidden) { - start_timer(2); - second_hidden_linear_node.param->fProp(first_hidden_activation_node.fProp_matrix, - second_hidden_linear_node.fProp_matrix); - second_hidden_activation_node.param->fProp(second_hidden_linear_node.fProp_matrix, - second_hidden_activation_node.fProp_matrix); - stop_timer(2); - } - - // The propagation stops here because the last layer is very expensive. - } + start_timer(0); + input_layer_node.param->fProp(data, input_layer_node.fProp_matrix); + stop_timer(0); - // Dense version (for standard log-likelihood) - template <typename DerivedIn, typename DerivedOut> - void bProp(const MatrixBase<DerivedIn> &data, - const MatrixBase<DerivedOut> &output, - double learning_rate, - double momentum, - double L2_reg, - std::string ¶meter_update, - double conditioning_constant, - double decay) + start_timer(1); + first_hidden_linear_node.param->fProp(input_layer_node.fProp_matrix, + first_hidden_linear_node.fProp_matrix); + } + else { - // Output embedding layer - - start_timer(7); - output_layer_node.param->bProp(output, - output_layer_node.bProp_matrix); - stop_timer(7); - - start_timer(8); - Node<Activation_function> & final_hidden_activation_node = skip_hidden ? 
first_hidden_activation_node : second_hidden_activation_node; - if (parameter_update == "SGD") { - output_layer_node.param->computeGradient(final_hidden_activation_node.fProp_matrix, - output, - learning_rate, - momentum); - } else if (parameter_update == "ADA") { - output_layer_node.param->computeGradientAdagrad(final_hidden_activation_node.fProp_matrix, - output, - learning_rate); - } else if (parameter_update == "ADAD") { - //std::cerr<<"Adadelta gradient"<<endl; - int current_minibatch_size = final_hidden_activation_node.fProp_matrix.cols(); - output_layer_node.param->computeGradientAdadelta(final_hidden_activation_node.fProp_matrix, - output, - 1.0/current_minibatch_size, - conditioning_constant, - decay); - } else { - std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl; - } - stop_timer(8); - - bPropRest(data, - learning_rate, - momentum, - L2_reg, - parameter_update, - conditioning_constant, - decay); + int n_inputs = first_hidden_linear_node.param->n_inputs(); + USCMatrix<double> sparse_data; + input_layer_node.param->munge(data, sparse_data); + + start_timer(1); + first_hidden_linear_node.param->fProp(sparse_data, + first_hidden_linear_node.fProp_matrix); + } + first_hidden_activation_node.param->fProp(first_hidden_linear_node.fProp_matrix, + first_hidden_activation_node.fProp_matrix); + //std::cerr<<"in fprop first hidden activation node fprop is "<<first_hidden_activation_node.fProp_matrix<<std::endl; + //std::getchar(); + stop_timer(1); + + + if (!skip_hidden) { + start_timer(2); + second_hidden_linear_node.param->fProp(first_hidden_activation_node.fProp_matrix, + second_hidden_linear_node.fProp_matrix); + second_hidden_activation_node.param->fProp(second_hidden_linear_node.fProp_matrix, + second_hidden_activation_node.fProp_matrix); + stop_timer(2); } - // Sparse version (for NCE log-likelihood) - template <typename DerivedIn, typename DerivedOutI, typename DerivedOutV> - void bProp(const MatrixBase<DerivedIn> &data, - const MatrixBase<DerivedOutI> &samples, - const MatrixBase<DerivedOutV> &weights, - double learning_rate, - double momentum, - double L2_reg, - std::string ¶meter_update, - double conditioning_constant, - double decay) - { + // The propagation stops here because the last layer is very expensive. + } + + // Dense version (for standard log-likelihood) + template <typename DerivedIn, typename DerivedOut> + void bProp(const MatrixBase<DerivedIn> &data, + const MatrixBase<DerivedOut> &output, + double learning_rate, + double momentum, + double L2_reg, + std::string ¶meter_update, + double conditioning_constant, + double decay) + { + // Output embedding layer + + start_timer(7); + output_layer_node.param->bProp(output, + output_layer_node.bProp_matrix); + stop_timer(7); + + start_timer(8); + Node<Activation_function> & final_hidden_activation_node = skip_hidden ? 
first_hidden_activation_node : second_hidden_activation_node; + if (parameter_update == "SGD") { + output_layer_node.param->computeGradient(final_hidden_activation_node.fProp_matrix, + output, + learning_rate, + momentum); + } else if (parameter_update == "ADA") { + output_layer_node.param->computeGradientAdagrad(final_hidden_activation_node.fProp_matrix, + output, + learning_rate); + } else if (parameter_update == "ADAD") { + //std::cerr<<"Adadelta gradient"<<endl; + int current_minibatch_size = final_hidden_activation_node.fProp_matrix.cols(); + output_layer_node.param->computeGradientAdadelta(final_hidden_activation_node.fProp_matrix, + output, + 1.0/current_minibatch_size, + conditioning_constant, + decay); + } else { + std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl; + } + stop_timer(8); - // Output embedding layer - - start_timer(7); - output_layer_node.param->bProp(samples, - weights, - output_layer_node.bProp_matrix); - stop_timer(7); - - - start_timer(8); - Node<Activation_function> & final_hidden_activation_node = skip_hidden ? first_hidden_activation_node : second_hidden_activation_node; - if (parameter_update == "SGD") { - output_layer_node.param->computeGradient(final_hidden_activation_node.fProp_matrix, - samples, - weights, - learning_rate, - momentum); - } else if (parameter_update == "ADA") { - output_layer_node.param->computeGradientAdagrad(final_hidden_activation_node.fProp_matrix, - samples, - weights, - learning_rate); - } else if (parameter_update == "ADAD") { - int current_minibatch_size = final_hidden_activation_node.fProp_matrix.cols(); - //std::cerr<<"Adadelta gradient"<<endl; - output_layer_node.param->computeGradientAdadelta(final_hidden_activation_node.fProp_matrix, - samples, - weights, - 1.0/current_minibatch_size, - conditioning_constant, - decay); - } else { - std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl; + bPropRest(data, + learning_rate, + momentum, + L2_reg, + parameter_update, + conditioning_constant, + decay); } - stop_timer(8); + // Sparse version (for NCE log-likelihood) + template <typename DerivedIn, typename DerivedOutI, typename DerivedOutV> + void bProp(const MatrixBase<DerivedIn> &data, + const MatrixBase<DerivedOutI> &samples, + const MatrixBase<DerivedOutV> &weights, + double learning_rate, + double momentum, + double L2_reg, + std::string ¶meter_update, + double conditioning_constant, + double decay) + { - bPropRest(data, - learning_rate, - momentum, - L2_reg, - parameter_update, - conditioning_constant, - decay); + // Output embedding layer + + start_timer(7); + output_layer_node.param->bProp(samples, + weights, + output_layer_node.bProp_matrix); + stop_timer(7); + + + start_timer(8); + Node<Activation_function> & final_hidden_activation_node = skip_hidden ? 
first_hidden_activation_node : second_hidden_activation_node; + if (parameter_update == "SGD") { + output_layer_node.param->computeGradient(final_hidden_activation_node.fProp_matrix, + samples, + weights, + learning_rate, + momentum); + } else if (parameter_update == "ADA") { + output_layer_node.param->computeGradientAdagrad(final_hidden_activation_node.fProp_matrix, + samples, + weights, + learning_rate); + } else if (parameter_update == "ADAD") { + int current_minibatch_size = final_hidden_activation_node.fProp_matrix.cols(); + //std::cerr<<"Adadelta gradient"<<endl; + output_layer_node.param->computeGradientAdadelta(final_hidden_activation_node.fProp_matrix, + samples, + weights, + 1.0/current_minibatch_size, + conditioning_constant, + decay); + } else { + std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl; } -private: - template <typename DerivedIn> - void bPropRest(const MatrixBase<DerivedIn> &data, - double learning_rate, double momentum, double L2_reg, - std::string ¶meter_update, - double conditioning_constant, - double decay) - { - // Second hidden layer + stop_timer(8); + bPropRest(data, + learning_rate, + momentum, + L2_reg, + parameter_update, + conditioning_constant, + decay); + } - - // All the compute gradient functions are together and the backprop - // functions are together - ////////BACKPROP//////////// - start_timer(9); - if (skip_hidden) + private: + template <typename DerivedIn> + void bPropRest(const MatrixBase<DerivedIn> &data, + double learning_rate, double momentum, double L2_reg, + std::string ¶meter_update, + double conditioning_constant, + double decay) { - start_timer(9); - first_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix, + // Second hidden layer + + + + // All the compute gradient functions are together and the backprop + // functions are together + ////////BACKPROP//////////// + start_timer(9); + if (skip_hidden) + { + start_timer(9); + first_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix, first_hidden_activation_node.bProp_matrix, first_hidden_linear_node.fProp_matrix, first_hidden_activation_node.fProp_matrix); - first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix, - first_hidden_linear_node.bProp_matrix); - stop_timer(9); + first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix, + first_hidden_linear_node.bProp_matrix); + stop_timer(9); - } - else - { - second_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix, - second_hidden_activation_node.bProp_matrix, - second_hidden_linear_node.fProp_matrix, - second_hidden_activation_node.fProp_matrix); + } + else + { + second_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix, + second_hidden_activation_node.bProp_matrix, + second_hidden_linear_node.fProp_matrix, + second_hidden_activation_node.fProp_matrix); - second_hidden_linear_node.param->bProp(second_hidden_activation_node.bProp_matrix, - second_hidden_linear_node.bProp_matrix); - stop_timer(9); + second_hidden_linear_node.param->bProp(second_hidden_activation_node.bProp_matrix, + second_hidden_linear_node.bProp_matrix); + stop_timer(9); - start_timer(11); - first_hidden_activation_node.param->bProp(second_hidden_linear_node.bProp_matrix, - first_hidden_activation_node.bProp_matrix, - first_hidden_linear_node.fProp_matrix, - first_hidden_activation_node.fProp_matrix); + start_timer(11); + first_hidden_activation_node.param->bProp(second_hidden_linear_node.bProp_matrix, + 
first_hidden_activation_node.bProp_matrix, + first_hidden_linear_node.fProp_matrix, + first_hidden_activation_node.fProp_matrix); - first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix, - first_hidden_linear_node.bProp_matrix); - stop_timer(11); - } - //std::cerr<<"First hidden layer node backprop matrix is"<<first_hidden_linear_node.bProp_matrix<<std::endl; - //std::getchar(); - ////COMPUTE GRADIENT///////// - if (parameter_update == "SGD") { - if (!skip_hidden) - { - start_timer(10); - second_hidden_linear_node.param->computeGradient(second_hidden_activation_node.bProp_matrix, - first_hidden_activation_node.fProp_matrix, - learning_rate, - momentum, - L2_reg); - stop_timer(10); + first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix, + first_hidden_linear_node.bProp_matrix); + stop_timer(11); } - - // First hidden layer - - - start_timer(12); - first_hidden_linear_node.param->computeGradient(first_hidden_activation_node.bProp_matrix, - input_layer_node.fProp_matrix, - learning_rate, momentum, L2_reg); - stop_timer(12); - - // Input word embeddings - - start_timer(13); - input_layer_node.param->computeGradient(first_hidden_linear_node.bProp_matrix, - data, - learning_rate, momentum, L2_reg); - stop_timer(13); - } else if (parameter_update == "ADA") { - if (!skip_hidden) - { - start_timer(10); - second_hidden_linear_node.param->computeGradientAdagrad(second_hidden_activation_node.bProp_matrix, - first_hidden_activation_node.fProp_matrix, - learning_rate, - L2_reg); - stop_timer(10); + //std::cerr<<"First hidden layer node backprop matrix is"<<first_hidden_linear_node.bProp_matrix<<std::endl; + //std::getchar(); + ////COMPUTE GRADIENT///////// + if (parameter_update == "SGD") { + if (!skip_hidden) + { + start_timer(10); + second_hidden_linear_node.param->computeGradient(second_hidden_activation_node.bProp_matrix, + first_hidden_activation_node.fProp_matrix, + learning_rate, + momentum, + L2_reg); + stop_timer(10); + } + + // First hidden layer + + + start_timer(12); + first_hidden_linear_node.param->computeGradient(first_hidden_activation_node.bProp_matrix, + input_layer_node.fProp_matrix, + learning_rate, momentum, L2_reg); + stop_timer(12); + + // Input word embeddings + + start_timer(13); + input_layer_node.param->computeGradient(first_hidden_linear_node.bProp_matrix, + data, + learning_rate, momentum, L2_reg); + stop_timer(13); + } else if (parameter_update == "ADA") { + if (!skip_hidden) + { + start_timer(10); + second_hidden_linear_node.param->computeGradientAdagrad(second_hidden_activation_node.bProp_matrix, + first_hidden_activation_node.fProp_matrix, + learning_rate, + L2_reg); + stop_timer(10); + } + + // First hidden layer + + + start_timer(12); + first_hidden_linear_node.param->computeGradientAdagrad(first_hidden_activation_node.bProp_matrix, + input_layer_node.fProp_matrix, + learning_rate, + L2_reg); + stop_timer(12); + + // Input word embeddings + + start_timer(13); + input_layer_node.param->computeGradientAdagrad(first_hidden_linear_node.bProp_matrix, + data, + learning_rate, + L2_reg); + stop_timer(13); + } else if (parameter_update == "ADAD") { + int current_minibatch_size = first_hidden_activation_node.fProp_matrix.cols(); + //std::cerr<<"Adadelta gradient"<<endl; + if (!skip_hidden) + { + start_timer(10); + second_hidden_linear_node.param->computeGradientAdadelta(second_hidden_activation_node.bProp_matrix, + first_hidden_activation_node.fProp_matrix, + 1.0/current_minibatch_size, + L2_reg, + conditioning_constant, + 
decay); + stop_timer(10); + } + //std::cerr<<"Finished gradient for second hidden linear layer"<<std::endl; + + // First hidden layer + + + start_timer(12); + first_hidden_linear_node.param->computeGradientAdadelta(first_hidden_activation_node.bProp_matrix, + input_layer_node.fProp_matrix, + 1.0/current_minibatch_size, + L2_reg, + conditioning_constant, + decay); + stop_timer(12); + + //std::cerr<<"Finished gradient for first hidden linear layer"<<std::endl; + // Input word embeddings + + start_timer(13); + input_layer_node.param->computeGradientAdadelta(first_hidden_linear_node.bProp_matrix, + data, + 1.0/current_minibatch_size, + L2_reg, + conditioning_constant, + decay); + stop_timer(13); + + //std::cerr<<"Finished gradient for first input layer"<<std::endl; + } else { + std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl; } - // First hidden layer - - - start_timer(12); - first_hidden_linear_node.param->computeGradientAdagrad(first_hidden_activation_node.bProp_matrix, - input_layer_node.fProp_matrix, - learning_rate, - L2_reg); - stop_timer(12); - - // Input word embeddings - - start_timer(13); - input_layer_node.param->computeGradientAdagrad(first_hidden_linear_node.bProp_matrix, - data, - learning_rate, - L2_reg); - stop_timer(13); - } else if (parameter_update == "ADAD") { - int current_minibatch_size = first_hidden_activation_node.fProp_matrix.cols(); - //std::cerr<<"Adadelta gradient"<<endl; - if (!skip_hidden) - { - start_timer(10); - second_hidden_linear_node.param->computeGradientAdadelta(second_hidden_activation_node.bProp_matrix, - first_hidden_activation_node.fProp_matrix, - 1.0/current_minibatch_size, - L2_reg, - conditioning_constant, - decay); - stop_timer(10); - } - //std::cerr<<"Finished gradient for second hidden linear layer"<<std::endl; - - // First hidden layer - - - start_timer(12); - first_hidden_linear_node.param->computeGradientAdadelta(first_hidden_activation_node.bProp_matrix, - input_layer_node.fProp_matrix, - 1.0/current_minibatch_size, - L2_reg, - conditioning_constant, - decay); - stop_timer(12); - - //std::cerr<<"Finished gradient for first hidden linear layer"<<std::endl; - // Input word embeddings - - start_timer(13); - input_layer_node.param->computeGradientAdadelta(first_hidden_linear_node.bProp_matrix, - data, - 1.0/current_minibatch_size, - L2_reg, - conditioning_constant, - decay); - stop_timer(13); - - //std::cerr<<"Finished gradient for first input layer"<<std::endl; - } else { - std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl; } - - } }; } // namespace nplm #endif - diff --git a/src/replace_digits.hpp b/src/replace_digits.hpp new file mode 100644 index 0000000..e8ac957 --- /dev/null +++ b/src/replace_digits.hpp @@ -0,0 +1,62 @@ +/** \file \author Jonathan Graehl <graehl@gmail.com> + + replace 0-9 ascii chars with another ascii replacement + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. 
+*/ + +#ifndef REPLACEDIGITS_GRAEHL_2015_06_25_H +#define REPLACEDIGITS_GRAEHL_2015_06_25_H +#pragma once + +#include <string> +#include <utility> + +namespace graehl { + +inline bool ascii_digit(char c) { + return c >= '0' && c <= '9'; +} + +struct replace_digits { + char map_digits; + replace_digits(char map_digits = '@') : map_digits(map_digits) {} + + /// \return whether anything was replaced + bool replaced(char* i, char* end) const { + for (; i != end; ++i) + if (ascii_digit(*i)) { + *i = map_digits; + while (++i != end) + if (ascii_digit(*i)) *i = map_digits; + return true; + } + return false; + } + /// maybe: only if non-0 map_digits, do the thing + bool maybe_replaced(char* i, char* end) const { return map_digits && replaced(i, end); } + + void replace(char* i, char* end) const { + for (; i != end; ++i) + if (ascii_digit(*i)) *i = map_digits; + } + void maybe_replace(char* i, char* end) const { + if (map_digits) replace(i, end); + } + + void replace(std::string& str, std::string::size_type i = 0) const { + std::string::size_type n = str.size(); + char* d = (char *)str.data(); // although only C++11 officially allows this, in reality everyone does + replace(d + i, d + n); + } + void maybe_replace(std::string& str, std::string::size_type i = 0) const { + if (map_digits) replace(str, i); + } +}; + + +} + +#endif diff --git a/src/testNeuralLM.cpp b/src/testNeuralLM.cpp index 4f3713d..abaab34 100644 --- a/src/testNeuralLM.cpp +++ b/src/testNeuralLM.cpp @@ -6,7 +6,6 @@ #include <tclap/CmdLine.h> #include <Eigen/Core> -//#include <../3rdparty/Eigen/Dense> #include <Eigen/Dense> #include "param.h" @@ -21,174 +20,174 @@ using namespace Eigen; using namespace nplm; void score(neuralLM &lm, int minibatch_size, vector<int>& start, vector< vector<int> > &ngrams, - vector<double> &out) { - if (ngrams.size() == 0) return; - int ngram_size = ngrams[0].size(); - - if (minibatch_size == 0) + vector<double> &out) { + if (ngrams.size() == 0) return; + int ngram_size = ngrams[0].size(); + + if (minibatch_size == 0) + { + // Score one n-gram at a time. This is how the LM would be queried from a decoder. + for (int sent_id=0; sent_id<start.size()-1; sent_id++) { - // Score one n-gram at a time. This is how the LM would be queried from a decoder. - for (int sent_id=0; sent_id<start.size()-1; sent_id++) - { - double sent_log_prob = 0.0; - for (int j=start[sent_id]; j<start[sent_id+1]; j++) - sent_log_prob += lm.lookup_ngram(ngrams[j]); - out.push_back(sent_log_prob); - } + double sent_log_prob = 0.0; + for (int j=start[sent_id]; j<start[sent_id+1]; j++) + sent_log_prob += lm.lookup_ngram(ngrams[j]); + out.push_back(sent_log_prob); } - else + } + else + { + // Score a whole minibatch at a time. + Matrix<double,1,Dynamic> log_probs(ngrams.size()); + + Matrix<int,Dynamic,Dynamic> minibatch(ngram_size, minibatch_size); + minibatch.setZero(); + for (int test_id = 0; test_id < ngrams.size(); test_id += minibatch_size) { - // Score a whole minibatch at a time. - Matrix<double,1,Dynamic> log_probs(ngrams.size()); - - Matrix<int,Dynamic,Dynamic> minibatch(ngram_size, minibatch_size); - minibatch.setZero(); - for (int test_id = 0; test_id < ngrams.size(); test_id += minibatch_size) - { - int current_minibatch_size = minibatch_size<ngrams.size()-test_id ? 
minibatch_size : ngrams.size()-test_id; - for (int j=0; j<current_minibatch_size; j++) - minibatch.col(j) = Map< Matrix<int,Dynamic,1> > (ngrams[test_id+j].data(), ngram_size); - lm.lookup_ngram(minibatch.leftCols(current_minibatch_size), log_probs.middleCols(test_id, current_minibatch_size)); - } - - for (int sent_id=0; sent_id<start.size()-1; sent_id++) - { - double sent_log_prob = 0.0; - for (int j=start[sent_id]; j<start[sent_id+1]; j++) - sent_log_prob += log_probs[j]; - out.push_back(sent_log_prob); - } + int current_minibatch_size = minibatch_size<ngrams.size()-test_id ? minibatch_size : ngrams.size()-test_id; + for (int j=0; j<current_minibatch_size; j++) + minibatch.col(j) = Map< Matrix<int,Dynamic,1> > (ngrams[test_id+j].data(), ngram_size); + lm.lookup_ngram(minibatch.leftCols(current_minibatch_size), log_probs.middleCols(test_id, current_minibatch_size)); } + + for (int sent_id=0; sent_id<start.size()-1; sent_id++) + { + double sent_log_prob = 0.0; + for (int j=start[sent_id]; j<start[sent_id+1]; j++) + sent_log_prob += log_probs[j]; + out.push_back(sent_log_prob); + } + } } -int main (int argc, char *argv[]) +int main (int argc, char *argv[]) { - param myParam; - bool normalization; - bool numberize, ngramize, add_start_stop; + param myParam; + bool normalization; + bool numberize, ngramize, add_start_stop; - try { - // program options // - CmdLine cmd("Tests a two-layer neural probabilistic language model.", ' ' , "0.1"); + try { + // program options // + CmdLine cmd("Tests a two-layer neural probabilistic language model.", ' ' , "0.1"); - ValueArg<int> num_threads("", "num_threads", "Number of threads. Default: maximum.", false, 0, "int", cmd); - ValueArg<int> minibatch_size("", "minibatch_size", "Minibatch size. Default: none.", false, 0, "int", cmd); + ValueArg<int> num_threads("", "num_threads", "Number of threads. Default: maximum.", false, 0, "int", cmd); + ValueArg<int> minibatch_size("", "minibatch_size", "Minibatch size. Default: none.", false, 0, "int", cmd); - ValueArg<bool> arg_ngramize("", "ngramize", "If true, convert lines to ngrams. Default: true.", false, true, "bool", cmd); - ValueArg<bool> arg_numberize("", "numberize", "If true, convert words to numbers. Default: true.", false, true, "bool", cmd); - ValueArg<bool> arg_add_start_stop("", "add_start_stop", "If true, prepend <s> and append </s>. Default: true.", false, true, "bool", cmd); + ValueArg<bool> arg_ngramize("", "ngramize", "If true, convert lines to ngrams. Default: true.", false, true, "bool", cmd); + ValueArg<bool> arg_numberize("", "numberize", "If true, convert words to numbers. Default: true.", false, true, "bool", cmd); + ValueArg<bool> arg_add_start_stop("", "add_start_stop", "If true, prepend <s> and append </s>. Default: true.", false, true, "bool", cmd); - ValueArg<bool> arg_normalization("", "normalization", "Normalize probabilities. 1 = yes, 0 = no. Default: 0.", false, 0, "bool", cmd); + ValueArg<bool> arg_normalization("", "normalization", "Normalize probabilities. 1 = yes, 0 = no. 
Default: 0.", false, 0, "bool", cmd); - ValueArg<string> arg_test_file("", "test_file", "Test file (one tokenized sentence per line).", true, "", "string", cmd); + ValueArg<string> arg_test_file("", "test_file", "Test file (one tokenized sentence per line).", true, "", "string", cmd); - ValueArg<string> arg_model_file("", "model_file", "Language model file.", true, "", "string", cmd); + ValueArg<string> arg_model_file("", "model_file", "Language model file.", true, "", "string", cmd); - cmd.parse(argc, argv); + cmd.parse(argc, argv); - myParam.model_file = arg_model_file.getValue(); - myParam.test_file = arg_test_file.getValue(); + myParam.model_file = arg_model_file.getValue(); + myParam.test_file = arg_test_file.getValue(); - normalization = arg_normalization.getValue(); - numberize = arg_numberize.getValue(); - ngramize = arg_ngramize.getValue(); - add_start_stop = arg_add_start_stop.getValue(); + normalization = arg_normalization.getValue(); + numberize = arg_numberize.getValue(); + ngramize = arg_ngramize.getValue(); + add_start_stop = arg_add_start_stop.getValue(); - myParam.minibatch_size = minibatch_size.getValue(); - myParam.num_threads = num_threads.getValue(); + myParam.minibatch_size = minibatch_size.getValue(); + myParam.num_threads = num_threads.getValue(); - cerr << "Command line: " << endl; - cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl; - - const string sep(" Value: "); - cerr << arg_test_file.getDescription() << sep << arg_test_file.getValue() << endl; - cerr << arg_model_file.getDescription() << sep << arg_model_file.getValue() << endl; + cerr << "Command line: " << endl; + cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl; - cerr << arg_normalization.getDescription() << sep << arg_normalization.getValue() << endl; - cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl; - cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl; + const string sep(" Value: "); + cerr << arg_test_file.getDescription() << sep << arg_test_file.getValue() << endl; + cerr << arg_model_file.getDescription() << sep << arg_model_file.getValue() << endl; - cerr << minibatch_size.getDescription() << sep << minibatch_size.getValue() << endl; - cerr << num_threads.getDescription() << sep << num_threads.getValue() << endl; - } - catch (TCLAP::ArgException &e) - { - cerr << "error: " << e.error() << " for arg " << e.argId() << endl; - exit(1); - } + cerr << arg_normalization.getDescription() << sep << arg_normalization.getValue() << endl; + cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl; + cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl; - myParam.num_threads = setup_threads(myParam.num_threads); + cerr << minibatch_size.getDescription() << sep << minibatch_size.getValue() << endl; + cerr << num_threads.getDescription() << sep << num_threads.getValue() << endl; + } + catch (TCLAP::ArgException &e) + { + cerr << "error: " << e.error() << " for arg " << e.argId() << endl; + exit(1); + } - ///// Create language model + myParam.num_threads = setup_threads(myParam.num_threads); - neuralLM lm; - lm.read(myParam.model_file); - lm.set_normalization(normalization); - lm.set_log_base(10); - lm.set_cache(1048576); - int ngram_size = lm.get_order(); - int minibatch_size = myParam.minibatch_size; - if (minibatch_size) - lm.set_width(minibatch_size); + ///// Create language model - ///// Read test data - - ifstream 
test_file(myParam.test_file.c_str()); - if (!test_file) - { - cerr << "error: could not open " << myParam.test_file << endl; - exit(1); - } - string line; + neuralLM lm; + lm.read(myParam.model_file); + lm.set_normalization(normalization); + lm.set_log_base(10); + lm.set_cache(1048576); + int ngram_size = lm.get_order(); + int minibatch_size = myParam.minibatch_size; + if (minibatch_size) + lm.set_width(minibatch_size); - vector<int> start; - vector<vector<int> > ngrams; + ///// Read test data - while (getline(test_file, line)) - { - vector<string> words; - splitBySpace(line, words); + ifstream test_file(myParam.test_file.c_str()); + if (!test_file) + { + cerr << "error: could not open " << myParam.test_file << endl; + exit(1); + } + string line; - vector<vector<int> > sent_ngrams; - preprocessWords(words, sent_ngrams, ngram_size, lm.get_vocabulary(), numberize, add_start_stop, ngramize); + vector<int> start; + vector<vector<int> > ngrams; - start.push_back(ngrams.size()); - copy(sent_ngrams.begin(), sent_ngrams.end(), back_inserter(ngrams)); - } - start.push_back(ngrams.size()); + while (getline(test_file, line)) + { + vector<string> words; + splitBySpace(line, words); - int num_threads = 1; - vector< vector<double> > sent_log_probs(num_threads); + vector<vector<int> > sent_ngrams; + preprocessWords(words, sent_ngrams, ngram_size, lm.get_vocabulary(), numberize, add_start_stop, ngramize); - /* - // Test thread safety - boost::thread_group tg; - for (int t=0; t < num_threads; t++) { - tg.create_thread(boost::bind(score, lm, minibatch_size, boost::ref(start), boost::ref(ngrams), boost::ref(sent_log_probs[t]))); // copy lm - } - tg.join_all(); - */ - score(lm, minibatch_size, start, ngrams, sent_log_probs[0]); - - vector<double> log_likelihood(num_threads); - std::fill(log_likelihood.begin(), log_likelihood.end(), 0.0); - for (int i=0; i<sent_log_probs[0].size(); i++) { - for (int t=0; t<num_threads; t++) - cout << sent_log_probs[t][i] << "\t"; - cout << endl; - for (int t=0; t<num_threads; t++) - log_likelihood[t] += sent_log_probs[t][i]; - } - - cerr << "Test log10-likelihood: "; + start.push_back(ngrams.size()); + copy(sent_ngrams.begin(), sent_ngrams.end(), back_inserter(ngrams)); + } + start.push_back(ngrams.size()); + + int num_threads = 1; + vector< vector<double> > sent_log_probs(num_threads); + + /* + // Test thread safety + boost::thread_group tg; + for (int t=0; t < num_threads; t++) { + tg.create_thread(boost::bind(score, lm, minibatch_size, boost::ref(start), boost::ref(ngrams), boost::ref(sent_log_probs[t]))); // copy lm + } + tg.join_all(); + */ + score(lm, minibatch_size, start, ngrams, sent_log_probs[0]); + + vector<double> log_likelihood(num_threads); + std::fill(log_likelihood.begin(), log_likelihood.end(), 0.0); + for (int i=0; i<sent_log_probs[0].size(); i++) { for (int t=0; t<num_threads; t++) - cerr << log_likelihood[t] << " "; - cerr << endl; - #ifdef USE_CHRONO - cerr << "Propagation times:"; - for (int i=0; i<timer.size(); i++) - cerr << " " << timer.get(i); - cerr << endl; - #endif - + cout << sent_log_probs[t][i] << "\t"; + cout << endl; + for (int t=0; t<num_threads; t++) + log_likelihood[t] += sent_log_probs[t][i]; + } + + cerr << "Test log10-likelihood: "; + for (int t=0; t<num_threads; t++) + cerr << log_likelihood[t] << " "; + cerr << endl; +#ifdef USE_CHRONO + cerr << "Propagation times:"; + for (int i=0; i<timer.size(); i++) + cerr << " " << timer.get(i); + cerr << endl; +#endif + } diff --git a/src/trainNeuralNetwork.cpp b/src/trainNeuralNetwork.cpp index 
97af03b..d4720ef 100644 --- a/src/trainNeuralNetwork.cpp +++ b/src/trainNeuralNetwork.cpp @@ -6,17 +6,16 @@ #include <vector> #include <algorithm> -#include <boost/unordered_map.hpp> +#include <boost/unordered_map.hpp> #include <boost/functional.hpp> #include <boost/lexical_cast.hpp> #include <boost/random/mersenne_twister.hpp> #include <boost/algorithm/string/join.hpp> -# include <boost/interprocess/managed_shared_memory.hpp> -# include <boost/interprocess/allocators/allocator.hpp> -# include <boost/interprocess/managed_mapped_file.hpp> +#include <boost/interprocess/managed_shared_memory.hpp> +#include <boost/interprocess/allocators/allocator.hpp> +#include <boost/interprocess/managed_mapped_file.hpp> #include <boost/interprocess/containers/vector.hpp> -//#include <../3rdparty/Eigen/Dense> #include <Eigen/Dense> #include <Eigen/Sparse> #include "maybe_omp.h" @@ -29,7 +28,6 @@ #include "graphClasses.h" #include "util.h" #include "multinomial.h" -//#include "gradientCheck.h" //#define EIGEN_DONT_PARALLELIZE @@ -65,7 +63,7 @@ void compute_validation_perplexity(int ngram_size, int output_vocab_size, int va int validation_minibatch_start_index = validation_minibatch_size * validation_batch; int current_minibatch_size = min(validation_minibatch_size, validation_data_size - validation_minibatch_start_index); - minibatch.leftCols(current_minibatch_size) = validation_data.middleCols(validation_minibatch_start_index, + minibatch.leftCols(current_minibatch_size) = validation_data.middleCols(validation_minibatch_start_index, current_minibatch_size); prop_validation.fProp(minibatch.topRows(ngram_size-1)); @@ -80,7 +78,7 @@ void compute_validation_perplexity(int ngram_size, int output_vocab_size, int va // And softmax and loss. Be careful of short minibatch double minibatch_log_likelihood; start_timer(5); - SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size), + SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size), minibatch.row(ngram_size-1), output_probs, minibatch_log_likelihood); @@ -93,7 +91,7 @@ void compute_validation_perplexity(int ngram_size, int output_vocab_size, int va // If the validation perplexity decreases, halve the learning rate. 
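// For reference, a compact sketch of the schedule implemented around here (hypothetical helper
// names; needs <cmath>): perplexity is recovered from the summed log-likelihood as exp(-LL/N),
// and the learning rate is halved whenever the validation likelihood stops improving. Note that
// the guard below fires when the new log-likelihood is lower than the previous one, i.e. when
// perplexity goes up, and it is skipped for the "ADA" update, which adapts its own step sizes.
inline double perplexity_from_ll(double log_likelihood, double num_predictions)
{
    return std::exp(-log_likelihood / num_predictions);
}
inline void decay_on_no_improvement(double new_ll, double &previous_ll, double &learning_rate)
{
    if (previous_ll != 0.0 && new_ll < previous_ll)
        learning_rate /= 2;     // validation got worse: take smaller steps from now on
    previous_ll = new_ll;       // always remember the most recent validation score
}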
if (current_validation_ll != 0.0 && log_likelihood < current_validation_ll && myParam.parameter_update != "ADA") - { + { current_learning_rate /= 2; } current_validation_ll = log_likelihood; @@ -101,7 +99,7 @@ void compute_validation_perplexity(int ngram_size, int output_vocab_size, int va int main(int argc, char** argv) -{ +{ ios::sync_with_stdio(false); bool use_mmap_file, randomize; param myParam; @@ -183,7 +181,7 @@ int main(int argc, char** argv) myParam.input_words_file = input_words_file.getValue(); myParam.output_words_file = output_words_file.getValue(); if (words_file.getValue() != "") - myParam.input_words_file = myParam.output_words_file = words_file.getValue(); + myParam.input_words_file = myParam.output_words_file = words_file.getValue(); myParam.model_prefix = model_prefix.getValue(); @@ -192,7 +190,7 @@ int main(int argc, char** argv) myParam.input_vocab_size = input_vocab_size.getValue(); myParam.output_vocab_size = output_vocab_size.getValue(); if (vocab_size.getValue() > 0) { - myParam.input_vocab_size = myParam.output_vocab_size = vocab_size.getValue(); + myParam.input_vocab_size = myParam.output_vocab_size = vocab_size.getValue(); } myParam.num_hidden = num_hidden.getValue(); myParam.activation_function = activation_function.getValue(); @@ -205,7 +203,7 @@ int main(int argc, char** argv) myParam.input_embedding_dimension = input_embedding_dimension.getValue(); myParam.output_embedding_dimension = output_embedding_dimension.getValue(); if (embedding_dimension.getValue() >= 0) { - myParam.input_embedding_dimension = myParam.output_embedding_dimension = embedding_dimension.getValue(); + myParam.input_embedding_dimension = myParam.output_embedding_dimension = embedding_dimension.getValue(); } myParam.minibatch_size = minibatch_size.getValue(); @@ -243,33 +241,33 @@ int main(int argc, char** argv) if (embedding_dimension.getValue() >= 0) { - cerr << embedding_dimension.getDescription() << sep << embedding_dimension.getValue() << endl; + cerr << embedding_dimension.getDescription() << sep << embedding_dimension.getValue() << endl; } else { - cerr << input_embedding_dimension.getDescription() << sep << input_embedding_dimension.getValue() << endl; - cerr << output_embedding_dimension.getDescription() << sep << output_embedding_dimension.getValue() << endl; + cerr << input_embedding_dimension.getDescription() << sep << input_embedding_dimension.getValue() << endl; + cerr << output_embedding_dimension.getDescription() << sep << output_embedding_dimension.getValue() << endl; } cerr << share_embeddings.getDescription() << sep << share_embeddings.getValue() << endl; if (share_embeddings.getValue() && input_embedding_dimension.getValue() != output_embedding_dimension.getValue()) { - cerr << "error: sharing input and output embeddings requires that input and output embeddings have same dimension" << endl; - exit(1); + cerr << "error: sharing input and output embeddings requires that input and output embeddings have same dimension" << endl; + exit(1); } cerr << num_hidden.getDescription() << sep << num_hidden.getValue() << endl; if (string_to_activation_function(activation_function.getValue()) == InvalidFunction) { - cerr << "error: invalid activation function: " << activation_function.getValue() << endl; - exit(1); + cerr << "error: invalid activation function: " << activation_function.getValue() << endl; + exit(1); } cerr << activation_function.getDescription() << sep << activation_function.getValue() << endl; if (string_to_loss_function(loss_function.getValue()) == InvalidLoss) { 
- cerr << "error: invalid loss function: " << loss_function.getValue() << endl; - exit(1); + cerr << "error: invalid loss function: " << loss_function.getValue() << endl; + exit(1); } cerr << loss_function.getDescription() << sep << loss_function.getValue() << endl; @@ -279,7 +277,7 @@ int main(int argc, char** argv) cerr << num_epochs.getDescription() << sep << num_epochs.getValue() << endl; cerr << minibatch_size.getDescription() << sep << minibatch_size.getValue() << endl; if (myParam.validation_file != "") { - cerr << validation_minibatch_size.getDescription() << sep << validation_minibatch_size.getValue() << endl; + cerr << validation_minibatch_size.getDescription() << sep << validation_minibatch_size.getValue() << endl; } cerr << learning_rate.getDescription() << sep << learning_rate.getValue() << endl; cerr << L2_reg.getDescription() << sep << L2_reg.getValue() << endl; @@ -288,7 +286,7 @@ int main(int argc, char** argv) cerr << normalization.getDescription() << sep << normalization.getValue() << endl; if (myParam.normalization){ - cerr << normalization_init.getDescription() << sep << normalization_init.getValue() << endl; + cerr << normalization_init.getDescription() << sep << normalization_init.getValue() << endl; } cerr << use_momentum.getDescription() << sep << use_momentum.getValue() << endl; @@ -302,7 +300,7 @@ int main(int argc, char** argv) if (unigram_probs_file.getValue() != "") { - cerr << "Note: --unigram_probs_file is deprecated and ignored." << endl; + cerr << "Note: --unigram_probs_file is deprecated and ignored." << endl; } } catch (TCLAP::ArgException &e) @@ -316,7 +314,7 @@ int main(int argc, char** argv) //unsigned seed = std::time(0); unsigned seed = 1234; //for testing only - mt19937 rng(seed); + boost::random::mt19937 rng(seed); /////////////////////////READING IN THE TRAINING AND VALIDATION DATA/////////////////// ///////////////////////////////////////////////////////////////////////////////////// @@ -337,7 +335,7 @@ int main(int argc, char** argv) training_data_flat_mmap = mmap_file.find<vec>("vector").first; cerr<<"Size of mmaped vector is "<<training_data_flat_mmap->size()<<endl; training_data_size = training_data_flat_mmap->size()/myParam.ngram_size; - //randomly shuffle the data for better learning. The shuffling will + //randomly shuffle the data for better learning. 
The shuffling will //be different for a standard stl vector // Randomly shuffle training data to improve learning if (randomize == true) { @@ -413,10 +411,10 @@ int main(int argc, char** argv) //cerr<<"Num tokens "<<num_tokens<<endl; //data_size_t training_data_size = num_tokens / myParam.ngram_size; cerr << "Number of training instances: "<< training_data_size << endl; - + Matrix<int,Dynamic,Dynamic> training_data; //(training_data_flat.data(), myParam.ngram_size, training_data_size); - + #ifdef MAP cerr<<"Setting up eigen map"<<endl; if (use_mmap_file == false) { @@ -425,11 +423,11 @@ int main(int argc, char** argv) training_data = Map< Matrix<int,Dynamic,Dynamic> >(training_data_flat_mmap->data().get(), myParam.ngram_size, training_data_size); } cerr<<"Created eigen map"<<endl; - #else + #else if (use_mmap_file == false) { training_data = Map< Matrix<int,Dynamic,Dynamic> >(training_data_flat.data(), myParam.ngram_size, training_data_size); } - #endif + #endif // If neither --input_vocab_size nor --input_words_file is given, set input_vocab_size to the maximum word index if (myParam.input_vocab_size == 0 and myParam.input_words_file == "") { @@ -454,7 +452,7 @@ int main(int argc, char** argv) // Read validation data vector<int> validation_data_flat; int validation_data_size = 0; - + if (myParam.validation_file != "") { readDataFile(myParam.validation_file, myParam.ngram_size, validation_data_flat); @@ -470,16 +468,16 @@ int main(int argc, char** argv) if (myParam.input_words_file != "") { readWordsFile(myParam.input_words_file, input_words); - if (myParam.input_vocab_size == 0) - myParam.input_vocab_size = input_words.size(); + if (myParam.input_vocab_size == 0) + myParam.input_vocab_size = input_words.size(); } vector<string> output_words; if (myParam.output_words_file != "") { readWordsFile(myParam.output_words_file, output_words); - if (myParam.output_vocab_size == 0) - myParam.output_vocab_size = output_words.size(); + if (myParam.output_vocab_size == 0) + myParam.output_vocab_size = output_words.size(); } ///// Construct unigram model and sampler that will be used for NCE @@ -491,17 +489,17 @@ int main(int argc, char** argv) if (use_mmap_file == false) { output_word = training_data(myParam.ngram_size-1, train_id); } else { - //cerr<<"mmap word is "<<training_data_flat_mmap->at((train_id+1)*myParam.ngram_size - 1)<<endl; + //cerr<<"mmap word is "<<training_data_flat_mmap->at((train_id+1)*myParam.ngram_size - 1)<<endl; output_word = training_data_flat_mmap->at((train_id+1)*myParam.ngram_size - 1); } - //cerr<<"output word is "<<output_word<<endl; - unigram_counts[output_word] += 1; + //cerr<<"output word is "<<output_word<<endl; + unigram_counts[output_word] += 1; } multinomial<data_size_t> unigram (unigram_counts); ///// Create and initialize the neural network and associated propagators. 
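// The unigram_counts accumulated just above define the NCE noise distribution: noise words are
// drawn with probability proportional to their frequency among the training targets. A rough
// sketch of such a sampler (the real one is multinomial<data_size_t> from multinomial.h and may
// be implemented quite differently; needs <vector>, <algorithm> and the boost.random headers):
struct simple_unigram_sampler
{
    std::vector<double> cdf;   // running totals of the raw counts
    double total;
    explicit simple_unigram_sampler(const std::vector<double> &counts) : total(0.0)
    {
        for (std::size_t i = 0; i < counts.size(); ++i) {
            total += counts[i];
            cdf.push_back(total);
        }
    }
    int sample(boost::random::mt19937 &rng) const
    {
        boost::random::uniform_real_distribution<> unif(0.0, total);
        std::size_t i = std::lower_bound(cdf.begin(), cdf.end(), unif(rng)) - cdf.begin();
        if (i >= cdf.size()) i = cdf.size() - 1;   // guard against floating-point edge cases
        return static_cast<int>(i);
    }
};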
model nn; - // IF THE MODEL FILE HAS BEEN DEFINED, THEN + // IF THE MODEL FILE HAS BEEN DEFINED, THEN // LOAD THE NEURAL NETWORK MODEL if (myParam.model_file != ""){ nn.read(myParam.model_file); @@ -529,7 +527,7 @@ int main(int argc, char** argv) SoftmaxNCELoss<multinomial<data_size_t> > softmax_loss(unigram); // normalization parameters vector_map c_h, c_h_running_gradient; - + ///////////////////////TRAINING THE NEURAL NETWORK//////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////// @@ -540,8 +538,8 @@ int main(int argc, char** argv) if (validation_data_size > 0) { num_validation_batches = (validation_data_size-1)/myParam.validation_minibatch_size+1; - cerr<<"Number of validation minibatches: "<<num_validation_batches<<endl; - } + cerr<<"Number of validation minibatches: "<<num_validation_batches<<endl; + } double current_momentum = myParam.initial_momentum; double momentum_delta = (myParam.final_momentum - myParam.initial_momentum)/(myParam.num_epochs-1); @@ -568,36 +566,36 @@ int main(int argc, char** argv) } for (int epoch=0; epoch<myParam.num_epochs; epoch++) - { + { cerr << "Epoch " << epoch+1 << endl; cerr << "Current learning rate: " << current_learning_rate << endl; - if (myParam.use_momentum) - cerr << "Current momentum: " << current_momentum << endl; - else + if (myParam.use_momentum) + cerr << "Current momentum: " << current_momentum << endl; + else current_momentum = -1; - cerr << "Training minibatches: "; + cerr << "Training minibatches: "; - double log_likelihood = 0.0; + double log_likelihood = 0.0; - int num_samples = 0; - if (loss_function == LogLoss) - num_samples = output_vocab_size; - else if (loss_function == NCELoss) - num_samples = 1+num_noise_samples; + int num_samples = 0; + if (loss_function == LogLoss) + num_samples = output_vocab_size; + else if (loss_function == NCELoss) + num_samples = 1+num_noise_samples; - Matrix<double,Dynamic,Dynamic> minibatch_weights(num_samples, minibatch_size); - Matrix<int,Dynamic,Dynamic> minibatch_samples(num_samples, minibatch_size); - Matrix<double,Dynamic,Dynamic> scores(num_samples, minibatch_size); - Matrix<double,Dynamic,Dynamic> probs(num_samples, minibatch_size); + Matrix<double,Dynamic,Dynamic> minibatch_weights(num_samples, minibatch_size); + Matrix<int,Dynamic,Dynamic> minibatch_samples(num_samples, minibatch_size); + Matrix<double,Dynamic,Dynamic> scores(num_samples, minibatch_size); + Matrix<double,Dynamic,Dynamic> probs(num_samples, minibatch_size); for(data_size_t batch=0;batch<num_batches;batch++) { if (batch > 0 && batch % 10000 == 0) { - cerr << batch <<"..."; - } + cerr << batch <<"..."; + } if (batch > 0 && batch % 500000 == 0) { @@ -605,31 +603,31 @@ int main(int argc, char** argv) compute_validation_perplexity(ngram_size, output_vocab_size, validation_minibatch_size, validation_data_size, num_validation_batches, myParam, prop_validation, validation_data, current_learning_rate, current_validation_ll); cerr << "Current learning rate: " << current_learning_rate << endl; } - + data_size_t minibatch_start_index = minibatch_size * batch; int current_minibatch_size = min(static_cast<data_size_t>(minibatch_size), training_data_size - minibatch_start_index); #ifdef MAP - Matrix<int,Dynamic,Dynamic> minibatch = training_data.middleCols(minibatch_start_index, current_minibatch_size); - #else + Matrix<int,Dynamic,Dynamic> minibatch = training_data.middleCols(minibatch_start_index, current_minibatch_size); + #else //ALTERNATIVE OPTION IF YOU'RE NOT USING 
eigen map interface on the mmapped file - Matrix<int,Dynamic,Dynamic> minibatch;// = training_data.middleCols(minibatch_start_index, current_minibatch_size); - //cerr<<"Minibatch start index "<<minibatch_start_index<<endl; - //cerr<<"Minibatch size "<<current_minibatch_size<<endl; + Matrix<int,Dynamic,Dynamic> minibatch;// = training_data.middleCols(minibatch_start_index, current_minibatch_size); + //cerr<<"Minibatch start index "<<minibatch_start_index<<endl; + //cerr<<"Minibatch size "<<current_minibatch_size<<endl; if (use_mmap_file == true) { minibatch.setZero(ngram_size,current_minibatch_size); //now reading the ngrams from the mmaped file for (int k=0; k<ngram_size; k++){ for (data_size_t index = 0 ; index<current_minibatch_size; index++) { - data_size_t current_index = index + minibatch_start_index; - //cerr<<"the value in the mmap file "<<index<<" "<<k<<" is "<<training_data_flat_mmap->at(current_index*ngram_size+k)<<endl; + data_size_t current_index = index + minibatch_start_index; + //cerr<<"the value in the mmap file "<<index<<" "<<k<<" is "<<training_data_flat_mmap->at(current_index*ngram_size+k)<<endl; minibatch(k,index) = training_data_flat_mmap->at(current_index*ngram_size+k); } } } else { minibatch = training_data.middleCols(minibatch_start_index, current_minibatch_size); } - #endif + #endif double adjusted_learning_rate = current_learning_rate/minibatch_size; //cerr<<"Adjusted learning rate: "<<adjusted_learning_rate<<endl; @@ -648,20 +646,20 @@ int main(int argc, char** argv) prop.fProp(minibatch.topRows(ngram_size-1)); - if (loss_function == NCELoss) - { - ///// Noise-contrastive estimation + if (loss_function == NCELoss) + { + ///// Noise-contrastive estimation - // Generate noise samples. Gather positive and negative samples into matrix. + // Generate noise samples. Gather positive and negative samples into matrix. 
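// For orientation: the samples gathered below feed SoftmaxNCELoss, which presumably optimises
// the standard noise-contrastive estimation objective. Treating the model score s as an
// unnormalised log-probability and q as the unigram noise distribution with k samples, each
// candidate word is classified as data vs. noise. A scalar sketch (hypothetical helper names;
// needs <cmath>):
inline double nce_log_prob_data(double score, double k, double noise_prob)
{
    double p_model = std::exp(score);
    return std::log(p_model / (p_model + k * noise_prob));        // log P(data | word, history)
}
inline double nce_log_prob_noise(double score, double k, double noise_prob)
{
    double p_model = std::exp(score);
    return std::log(k * noise_prob / (p_model + k * noise_prob)); // log P(noise | word, history)
}
// The minibatch objective then sums nce_log_prob_data over the true targets (row 0 of
// minibatch_samples, filled in just below) and nce_log_prob_noise over the sampled rows
// 1..num_noise_samples.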
- start_timer(3); + start_timer(3); minibatch_samples.block(0, 0, 1, current_minibatch_size) = minibatch.bottomRows(1); - + for (int sample_id = 1; sample_id < num_noise_samples+1; sample_id++) for (int train_id = 0; train_id < current_minibatch_size; train_id++) minibatch_samples(sample_id, train_id) = unigram.sample(rng); - + stop_timer(3); // Final forward propagation step (sparse) @@ -686,7 +684,7 @@ int main(int argc, char** argv) double minibatch_log_likelihood; start_timer(5); - softmax_loss.fProp(scores.leftCols(current_minibatch_size), + softmax_loss.fProp(scores.leftCols(current_minibatch_size), minibatch_samples, probs, minibatch_log_likelihood); stop_timer(5); @@ -697,9 +695,9 @@ int main(int argc, char** argv) start_timer(6); softmax_loss.bProp(probs, minibatch_weights); stop_timer(6); - + // Update the normalization parameters - + if (myParam.normalization) { for (int train_id = 0;train_id < current_minibatch_size;train_id++) @@ -711,19 +709,19 @@ int main(int argc, char** argv) // Be careful of short minibatch prop.bProp(minibatch.topRows(ngram_size-1), - minibatch_samples.leftCols(current_minibatch_size), + minibatch_samples.leftCols(current_minibatch_size), minibatch_weights.leftCols(current_minibatch_size), - adjusted_learning_rate, + adjusted_learning_rate, current_momentum, myParam.L2_reg, myParam.parameter_update, myParam.conditioning_constant, myParam.decay); - } - else if (loss_function == LogLoss) - { - ///// Standard log-likelihood - start_timer(4); + } + else if (loss_function == LogLoss) + { + ///// Standard log-likelihood + start_timer(4); if (prop.skip_hidden) prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores); else @@ -732,21 +730,21 @@ int main(int argc, char** argv) double minibatch_log_likelihood; start_timer(5); - SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size), - minibatch.row(ngram_size-1), - probs, + SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size), + minibatch.row(ngram_size-1), + probs, minibatch_log_likelihood); stop_timer(5); log_likelihood += minibatch_log_likelihood; ///// Backward propagation - + start_timer(6); - SoftmaxLogLoss().bProp(minibatch.row(ngram_size-1).leftCols(current_minibatch_size), - probs.leftCols(current_minibatch_size), + SoftmaxLogLoss().bProp(minibatch.row(ngram_size-1).leftCols(current_minibatch_size), + probs.leftCols(current_minibatch_size), minibatch_weights); stop_timer(6); - + prop.bProp(minibatch.topRows(ngram_size-1).leftCols(current_minibatch_size), minibatch_weights, adjusted_learning_rate, @@ -757,33 +755,33 @@ int main(int argc, char** argv) myParam.decay); } } - cerr << "done." << endl; + cerr << "done." << endl; - if (loss_function == LogLoss) - { - cerr << "Training log-likelihood: " << log_likelihood << endl; + if (loss_function == LogLoss) + { + cerr << "Training log-likelihood: " << log_likelihood << endl; cerr << " perplexity: "<< exp(-log_likelihood/training_data_size) << endl; - } - else if (loss_function == NCELoss) - cerr << "Training NCE log-likelihood: " << log_likelihood << endl; + } + else if (loss_function == NCELoss) + cerr << "Training NCE log-likelihood: " << log_likelihood << endl; current_momentum += momentum_delta; - #ifdef USE_CHRONO - cerr << "Propagation times:"; - for (int i=0; i<timer.size(); i++) - cerr << " " << timer.get(i); - cerr << endl; - #endif - - if (myParam.model_prefix != "") - { - cerr << "Writing model" << endl; - if (myParam.input_words_file != "") - nn.write(myParam.model_prefix + "." 
+ lexical_cast<string>(epoch+1), input_words, output_words); - else - nn.write(myParam.model_prefix + "." + lexical_cast<string>(epoch+1)); - } + #ifdef USE_CHRONO + cerr << "Propagation times:"; + for (int i=0; i<timer.size(); i++) + cerr << " " << timer.get(i); + cerr << endl; + #endif + + if (myParam.model_prefix != "") + { + cerr << "Writing model" << endl; + if (myParam.input_words_file != "") + nn.write(myParam.model_prefix + "." + lexical_cast<string>(epoch+1), input_words, output_words); + else + nn.write(myParam.model_prefix + "." + lexical_cast<string>(epoch+1)); + } if (epoch % 1 == 0 && validation_data_size > 0) { @@ -793,4 +791,3 @@ int main(int argc, char** argv) } return 0; } - diff --git a/src/types.hpp b/src/types.hpp deleted file mode 100644 index 08b010f..0000000 --- a/src/types.hpp +++ /dev/null @@ -1,41 +0,0 @@ -#ifndef TYPES_HPP -#define TYPES_HPP - -#include <cmath> -#include <string> -#include <vector> -#include <boost/cstdint.hpp> -#include <limits> - -namespace biglm{ - -typedef double weight_type; -const weight_type IMPOSSIBLE = -HUGE_VAL; - -typedef unsigned long block_type; -const size_t bits_per_block = (std::numeric_limits<block_type>::digits); - //typedef std::size_t size_type; -typedef boost::uint64_t size_type; -typedef unsigned char byte_type; - -template<typename T> -struct bytes { - static const byte_type *data(const T& key) { return reinterpret_cast<const byte_type *>(&key); } - static size_type size(const T& key) { return sizeof(T); } -}; - -template<> -struct bytes<std::string> { - static const byte_type *data(const std::string& key) { return reinterpret_cast<const byte_type *>(key.data()); } - static size_type size(const std::string& key) { return key.size(); } -}; - -template<typename U> -struct bytes<std::vector<U> > { - static const byte_type *data(const std::vector<U>& key) { return reinterpret_cast<const byte_type *>(&key[0]); } - static size_type size(const std::vector<U>& key) { return key.size() * sizeof(U); } -}; - -} //namespace nplm - -#endif @@ -15,7 +15,6 @@ #include <boost/chrono.hpp> #endif -//#include <../3rdparty/Eigen/Dense> #include <Eigen/Dense> #include "maybe_omp.h" @@ -23,15 +22,15 @@ // Make matrices hashable namespace Eigen { - template <typename Derived> - size_t hash_value(const DenseBase<Derived> &m) - { - size_t h=0; - for (int i=0; i<m.rows(); i++) - for (int j=0; j<m.cols(); j++) - boost::hash_combine(h, m(i,j)); - return h; - } +template <typename Derived> +size_t hash_value(const DenseBase<Derived> &m) +{ + size_t h=0; + for (int i=0; i<m.rows(); i++) + for (int j=0; j<m.cols(); j++) + boost::hash_combine(h, m(i,j)); + return h; +} } namespace nplm @@ -73,9 +72,9 @@ void readSentFile(const std::string &file, T &sentences) } inline void intgerize(std::vector<std::string> &ngram,std::vector<int> &int_ngram){ - int ngram_size = ngram.size(); - for (int i=0;i<ngram_size;i++) - int_ngram.push_back(boost::lexical_cast<int>(ngram[i])); + int ngram_size = ngram.size(); + for (int i=0;i<ngram_size;i++) + int_ngram.push_back(boost::lexical_cast<int>(ngram[i])); } // Functions that take non-const matrices as arguments @@ -85,194 +84,194 @@ inline void intgerize(std::vector<std::string> &ngram,std::vector<int> &int_ngra template <typename Derived> void initMatrix(boost::random::mt19937 &engine, - const Eigen::MatrixBase<Derived> &p_const, - bool init_normal, double range) + const Eigen::MatrixBase<Derived> &p_const, + bool init_normal, double range) { - UNCONST(Derived, p_const, p); - if (init_normal == 0) - // initialize with 
uniform distribution in [-range, range]
+  UNCONST(Derived, p_const, p);
+  if (init_normal == 0)
+  // initialize with uniform distribution in [-range, range]
+  {
+    boost::random::uniform_real_distribution<> unif_real(-range, range);
+    for (int i = 0; i < p.rows(); i++)
     {
-        boost::random::uniform_real_distribution<> unif_real(-range, range);
-        for (int i = 0; i < p.rows(); i++)
-        {
-            for (int j = 0; j< p.cols(); j++)
-            {
-                p(i,j) = unif_real(engine);
-            }
-        }
-
+      for (int j = 0; j< p.cols(); j++)
+      {
+        p(i,j) = unif_real(engine);
+      }
     }
-    else
-    // initialize with gaussian distribution with mean 0 and stdev range
+
+  }
+  else
+  // initialize with gaussian distribution with mean 0 and stdev range
+  {
+    boost::random::normal_distribution<double> unif_normal(0., range);
+    for (int i = 0; i < p.rows(); i++)
     {
-        boost::random::normal_distribution<double> unif_normal(0., range);
-        for (int i = 0; i < p.rows(); i++)
-        {
-            for (int j = 0; j < p.cols(); j++)
-            {
-                p(i,j) = unif_normal(engine);
-            }
-        }
+      for (int j = 0; j < p.cols(); j++)
+      {
+        p(i,j) = unif_normal(engine);
+      }
     }
+  }
 }
 
 template <typename Derived>
 void initBias(boost::random::mt19937 &engine,
-        const Eigen::MatrixBase<Derived> &p_const,
-        bool init_normal, double range)
+              const Eigen::MatrixBase<Derived> &p_const,
+              bool init_normal, double range)
 {
-    UNCONST(Derived, p_const, p);
-    if (init_normal == 0)
-    // initialize with uniform distribution in [-range, range]
+  UNCONST(Derived, p_const, p);
+  if (init_normal == 0)
+  // initialize with uniform distribution in [-range, range]
+  {
+    boost::random::uniform_real_distribution<> unif_real(-range, range);
+    for (int i = 0; i < p.size(); i++)
     {
-        boost::random::uniform_real_distribution<> unif_real(-range, range);
-        for (int i = 0; i < p.size(); i++)
-        {
-            p(i) = unif_real(engine);
-        }
-
+      p(i) = unif_real(engine);
     }
-    else
-    // initialize with gaussian distribution with mean 0 and stdev range
+
+  }
+  else
+  // initialize with gaussian distribution with mean 0 and stdev range
+  {
+    boost::random::normal_distribution<double> unif_normal(0., range);
+    for (int i = 0; i < p.size(); i++)
     {
-        boost::random::normal_distribution<double> unif_normal(0., range);
-        for (int i = 0; i < p.size(); i++)
-        {
-            p(i) = unif_normal(engine);
-        }
+      p(i) = unif_normal(engine);
     }
+  }
 }
 
 template <typename Derived>
 void readMatrix(std::ifstream &TRAININ, Eigen::MatrixBase<Derived> &param_const)
 {
-    UNCONST(Derived, param_const, param);
+  UNCONST(Derived, param_const, param);
+
+  int i = 0;
+  std::string line;
+  std::vector<std::string> fields;
+
+  while (std::getline(TRAININ, line) && line != "")
+  {
+    splitBySpace(line, fields);
+    if (fields.size() != param.cols())
+    {
+      std::ostringstream err;
+      err << "error: wrong number of columns (expected " << param.cols() << ", found " << fields.size() << ")";
+      throw std::runtime_error(err.str());
+    }
 
-    int i = 0;
-    std::string line;
-    std::vector<std::string> fields;
-
-    while (std::getline(TRAININ, line) && line != "")
+    if (i >= param.rows())
     {
-        splitBySpace(line, fields);
-        if (fields.size() != param.cols())
-        {
-            std::ostringstream err;
-            err << "error: wrong number of columns (expected " << param.cols() << ", found " << fields.size() << ")";
-            throw std::runtime_error(err.str());
-        }
-
-        if (i >= param.rows())
-        {
-            std::ostringstream err;
-            err << "error: wrong number of rows (expected " << param.rows() << ", found " << i << ")";
-            throw std::runtime_error(err.str());
-        }
-
-        for (int j=0; j<fields.size(); j++)
-        {
-            param(i,j) = boost::lexical_cast<typename Derived::Scalar>(fields[j]);
-        }
-        i++;
+      std::ostringstream err;
+      err << "error: wrong number of rows (expected " << param.rows() << ", found " << i << ")";
+      throw std::runtime_error(err.str());
     }
-
-    if (i != param.rows())
+
+    for (int j=0; j<fields.size(); j++)
     {
-        std::ostringstream err;
-        err << "error: wrong number of rows (expected " << param.rows() << ", found more)";
-        throw std::runtime_error(err.str());
+      param(i,j) = boost::lexical_cast<typename Derived::Scalar>(fields[j]);
     }
+    i++;
+  }
+
+  if (i != param.rows())
+  {
+    std::ostringstream err;
+    err << "error: wrong number of rows (expected " << param.rows() << ", found more)";
+    throw std::runtime_error(err.str());
+  }
 }
 
 template <typename Derived>
 void readMatrix(const std::string &param_file, const Eigen::MatrixBase<Derived> &param_const)
 {
-    UNCONST(Derived, param_const, param);
-    std::cerr << "Reading data from file: " << param_file << std::endl;
-
-    std::ifstream TRAININ(param_file.c_str());
-    if (!TRAININ)
-    {
-        std::cerr << "Error: can't read training data from file " << param_file << std::endl;
-        exit(-1);
-    }
-    readMatrix(TRAININ, param);
-    TRAININ.close();
+  UNCONST(Derived, param_const, param);
+  std::cerr << "Reading data from file: " << param_file << std::endl;
+
+  std::ifstream TRAININ(param_file.c_str());
+  if (!TRAININ)
+  {
+    std::cerr << "Error: can't read training data from file " << param_file << std::endl;
+    exit(-1);
+  }
+  readMatrix(TRAININ, param);
+  TRAININ.close();
 }
 
 template <typename Derived>
 void writeMatrix(const Eigen::MatrixBase<Derived> &param, const std::string &filename)
 {
-    std::cerr << "Writing parameters to " << filename << std::endl;
+  std::cerr << "Writing parameters to " << filename << std::endl;
 
-    std::ofstream OUT;
-    OUT.precision(16);
-    OUT.open(filename.c_str());
-    if (! OUT)
-    {
-        std::cerr << "Error: can't write to file " << filename<< std::endl;
-        exit(-1);
-    }
-    writeMatrix(param, OUT);
-    OUT.close();
+  std::ofstream OUT;
+  OUT.precision(16);
+  OUT.open(filename.c_str());
+  if (! OUT)
+  {
+    std::cerr << "Error: can't write to file " << filename<< std::endl;
+    exit(-1);
+  }
+  writeMatrix(param, OUT);
+  OUT.close();
 }
 
 template <typename Derived>
 void writeMatrix(const Eigen::MatrixBase<Derived> &param, std::ofstream &OUT)
 {
-    for (int row = 0;row < param.rows();row++)
+  for (int row = 0;row < param.rows();row++)
+  {
+    int col;
+    for (col = 0;col < param.cols()-1;col++)
     {
-        int col;
-        for (col = 0;col < param.cols()-1;col++)
-        {
-            OUT<<param(row,col)<<"\t";
-        }
-        //dont want an extra tab at the end
-        OUT<<param(row,col)<<std::endl;
+      OUT<<param(row,col)<<"\t";
     }
+    //dont want an extra tab at the end
+    OUT<<param(row,col)<<std::endl;
+  }
 }
 
 template <typename Derived>
 double logsum(const Eigen::MatrixBase<Derived> &v)
 {
-    int mi;
-    double m = v.maxCoeff(&mi);
-    double logz = 0.0;
-    for (int i=0; i<v.rows(); i++)
-        if (i != mi)
-            logz += std::exp(v(i) - m);
-    logz = log1p(logz) + m;
-    return logz;
+  int mi;
+  double m = v.maxCoeff(&mi);
+  double logz = 0.0;
+  for (int i=0; i<v.rows(); i++)
+    if (i != mi)
+      logz += std::exp(v(i) - m);
+  logz = log1p(logz) + m;
+  return logz;
 }
 
 double logadd(double x, double y);
 
 #ifdef USE_CHRONO
-class Timer
+class Timer
 {
-    typedef boost::chrono::high_resolution_clock clock_type;
-    typedef clock_type::time_point time_type;
-    typedef clock_type::duration duration_type;
-    std::vector<time_type> m_start;
-    std::vector<duration_type> m_total;
-public:
-    Timer() { }
-    Timer(int n) { resize(n); }
-    void resize(int n) { m_start.resize(n); m_total.resize(n); }
-    int size() const { return m_start.size(); }
-    void start(int i);
-    void stop(int i);
-    void reset(int i);
-    double get(int i) const;
+  typedef boost::chrono::high_resolution_clock clock_type;
+  typedef clock_type::time_point time_type;
+  typedef clock_type::duration duration_type;
+  std::vector<time_type> m_start;
+  std::vector<duration_type> m_total;
+ public:
+  Timer() { }
+  Timer(int n) { resize(n); }
+  void resize(int n) { m_start.resize(n); m_total.resize(n); }
+  int size() const { return m_start.size(); }
+  void start(int i);
+  void stop(int i);
+  void reset(int i);
+  double get(int i) const;
 };
 
 extern Timer timer;
 
 #define start_timer(x) timer.start(x)
 #define stop_timer(x) timer.stop(x)
 #else
-#define start_timer(x) 0
-#define stop_timer(x) 0
+#define start_timer(x) (void)0
+#define stop_timer(x) (void)0
 #endif
 
 int setup_threads(int n_threads);
 
diff --git a/src/vocabulary.h b/src/vocabulary.h
index a987522..c8cd518 100644
--- a/src/vocabulary.h
+++ b/src/vocabulary.h
@@ -5,6 +5,9 @@
 #include <string>
 #include <queue>
 #include <boost/unordered_map.hpp>
+#include "find_string.hpp"
+
+#define NPLM_HAVE_FIND_STRING_PIECE 1
 
 namespace nplm
 {
@@ -16,80 +19,83 @@ struct compare_second
 };
 
 class vocabulary {
-    std::vector<std::string> m_words;
-    boost::unordered_map<std::string, int> m_index;
-    int unk;
-
-public:
-    vocabulary()
-    {
-        unk = insert_word("<unk>");
-    }
-
-    vocabulary(const std::vector<std::string> &words)
+  std::vector<std::string> m_words;
+  typedef boost::unordered_map<std::string, int> WordId;
+  WordId m_index;
+  int unk;
+
+ public:
+  vocabulary()
+  {
+    unk = insert_word("<unk>");
+  }
+
+  vocabulary(const std::vector<std::string> &words)
     : m_words(words)
+  {
+    for (int i=0; i<words.size(); i++)
+      m_index[words[i]] = i;
+    unk = m_index["<unk>"];
+  }
+
+  int lookup_word(const std::string &word) const
+  {
+    return lookup_word(word, unk);
+  }
+
+  // lookup word using custom unknown-word id
+  int lookup_word(const std::string &word, int unkid) const
+  {
+    WordId::const_iterator pos
= m_index.find(word); + return pos == m_index.end() ? unkid : pos->second; + } + + int lookup_word(std::pair<char const*, char const*> slice) const { + return lookup_word(slice, unk); + } + + int lookup_word(std::pair<char const*, char const*> slice, int unkid) const + { + WordId::const_iterator pos = find_string(m_index, slice); + return pos == m_index.end() ? unkid : pos->second; + } + + int insert_word(const std::string &word) + { + int i = size(); + bool inserted = m_index.insert(make_pair(word, i)).second; + if (inserted) { - for (int i=0; i<words.size(); i++) - m_index[words[i]] = i; - unk = m_index["<unk>"]; - } - - int lookup_word(const std::string &word) const - { - boost::unordered_map<std::string, int>::const_iterator pos = m_index.find(word); - if (pos != m_index.end()) - return pos->second; - else - return unk; + m_words.push_back(word); } + return i; + } - // lookup word using custom unknown-word id - int lookup_word(const std::string &word, int unk) const - { - boost::unordered_map<std::string, int>::const_iterator pos = m_index.find(word); - if (pos != m_index.end()) - return pos->second; - else - return unk; - } + int size() const { return m_words.size(); } - int insert_word(const std::string &word) - { - int i = size(); - bool inserted = m_index.insert(make_pair(word, i)).second; - if (inserted) - { - m_words.push_back(word); - } - return i; - } + // Inserts the most-frequent words from counts until vocab_size words are reached. + // counts is a collection of pair<string,int> + template <typename Map> + int insert_most_frequent(const Map &counts, int vocab_size) + { + typedef std::pair<std::string,int> stringint; - int size() const { return m_words.size(); } + std::priority_queue<stringint,std::vector<stringint>,compare_second<stringint> > + q(compare_second<stringint>(), std::vector<stringint>(counts.begin(), counts.end())); - // Inserts the most-frequent words from counts until vocab_size words are reached. - // counts is a collection of pair<string,int> - template <typename Map> - int insert_most_frequent(const Map &counts, int vocab_size) + int inserted = 0; + while (size() < vocab_size && !q.empty()) { - typedef std::pair<std::string,int> stringint; - - std::priority_queue<stringint,std::vector<stringint>,compare_second<stringint> > - q(compare_second<stringint>(), std::vector<stringint>(counts.begin(), counts.end())); - - int inserted = 0; - while (size() < vocab_size && !q.empty()) - { - insert_word(q.top().first); - q.pop(); - inserted++; - } - return inserted; + insert_word(q.top().first); + q.pop(); + inserted++; } + return inserted; + } - const std::vector<std::string> &words() const { return m_words; } + const std::vector<std::string> &words() const { return m_words; } - const boost::unordered_map<std::string, int>& get_idmap() const { return m_index; } }; } // namespace nplm |
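
Note on the new vocabulary lookup API (editorial sketch, not part of the patch): besides the whitespace cleanup, the vocabulary.h hunk above adds lookup_word overloads that take a std::pair<char const*, char const*> character range and resolve it through find_string from the newly added find_string.hpp (advertised by the NPLM_HAVE_FIND_STRING_PIECE macro), so a caller can look up tokens without building a temporary std::string per token. A minimal usage sketch follows; the tokenizing loop and the sample words are hypothetical and only illustrate the slice-based overload:

// sketch.cpp -- hypothetical caller; only vocabulary, insert_word, and the
// slice-based lookup_word overload come from the patched vocabulary.h.
#include <cstdio>
#include <cctype>
#include <utility>
#include "vocabulary.h"

int main()
{
  nplm::vocabulary vocab;        // the default constructor inserts "<unk>"
  vocab.insert_word("the");
  vocab.insert_word("cat");

  const char *line = "the cat sat";
  for (const char *p = line; *p; )
  {
    const char *begin = p;
    while (*p && !std::isspace(static_cast<unsigned char>(*p))) ++p;   // find end of token
    // Look the token up from a (begin, end) slice -- no temporary std::string:
    int id = vocab.lookup_word(std::make_pair(begin, p));
    std::printf("%.*s -> %d\n", static_cast<int>(p - begin), begin, id);
    while (*p && std::isspace(static_cast<unsigned char>(*p))) ++p;    // skip whitespace
  }
  return 0;
}

Here "sat" is not in the vocabulary, so the lookup falls back to the id of <unk>, exactly as the string overload does. Separately, the util.h hunk switches the no-op start_timer/stop_timer macros from 0 to (void)0, presumably so that a bare start_timer(x); statement compiles without unused-value warnings when USE_CHRONO is not defined.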