fix mixed tab2/tab4/spaces indents

author: graehl <graehl@gmail.com> 2015-06-25 09:22:21 +0300
committer: graehl <graehl@gmail.com> 2015-06-25 09:25:32 +0300
commit: 37e397f526fc207dea498356e890ad085a733ae8 (patch)
tree: cfea74b92cc4d38aaff06a26c76fdba7594abd69
parent: 50308d573b90ff2814bd346210fc6929bd9b40af (diff)
12 files changed, 2012 insertions, 2027 deletions
diff --git a/src/Activation_function.h b/src/Activation_function.h
index 138f9da..742c2fc 100644
--- a/src/Activation_function.h
+++ b/src/Activation_function.h
@@ -3,7 +3,6 @@
 
 #include <cmath>
 #include <string>
-//#include <../3rdparty/Eigen/Dense>
 #include <Eigen/Dense>
 
 #include "util.h"
@@ -19,28 +18,28 @@ enum activation_function_type { Tanh, HardTanh, Rectifier, Identity, InvalidFunc
 
 inline activation_function_type string_to_activation_function (const std::string &s)
 {
-    if (s == "identity")
-        return Identity;
-    else if (s == "rectifier")
-        return Rectifier;
-    else if (s == "tanh")
-        return Tanh;
-    else if (s == "hardtanh")
-        return HardTanh;
-    else
-        return InvalidFunction;
+  if (s == "identity")
+    return Identity;
+  else if (s == "rectifier")
+    return Rectifier;
+  else if (s == "tanh")
+    return Tanh;
+  else if (s == "hardtanh")
+    return HardTanh;
+  else
+    return InvalidFunction;
 }
 
 inline std::string activation_function_to_string (activation_function_type f)
 {
-    if (f == Identity)
-        return "identity";
-    else if (f == Rectifier)
-        return "rectifier";
-    else if (f == Tanh)
-        return "tanh";
-    else if (f == HardTanh)
-        return "hardtanh";
+  if (f == Identity)
+    return "identity";
+  else if (f == Rectifier)
+    return "rectifier";
+  else if (f == Tanh)
+    return "tanh";
+  else if (f == HardTanh)
+    return "hardtanh";
 }
 
 struct hardtanh_functor {
@@ -69,53 +68,53 @@ struct drectifier_functor {
 
 class Activation_function
 {
-        int size;
-	activation_function_type f;
-
-    public:
-        Activation_function() : size(0), f(Rectifier) { }
-
-	void resize(int size) { this->size = size; }
-	void set_activation_function(activation_function_type f) { this->f = f; }
-
-	template <typename Engine>
-	void initialize(Engine &engine, bool init_normal, double init_range) { }
-
-	int n_inputs () const { return size; }
-	int n_outputs () const { return size; }
-
-        template <typename DerivedIn, typename DerivedOut>
-	void fProp(const MatrixBase<DerivedIn> &input, const MatrixBase<DerivedOut> &output) const
-        {
-	    UNCONST(DerivedOut, output, my_output);
-
-	    switch (f)
-	    {
-	    case Identity: my_output = input; break;
-	    case Rectifier: my_output = input.unaryExpr(rectifier_functor()); break;
-	    case Tanh: my_output = input.unaryExpr(tanh_functor()); break;
-	    case HardTanh: my_output = input.unaryExpr(hardtanh_functor()); break;
-            case InvalidFunction: std::abort();
-	    }
-        }
-
-        template <typename DerivedGOut, typename DerivedGIn, typename DerivedIn, typename DerivedOut>
-	void bProp(const MatrixBase<DerivedGOut> &input, 
-      MatrixBase<DerivedGIn> &output,
-		   const MatrixBase<DerivedIn> &finput,
-       const MatrixBase<DerivedOut> &foutput) const
-        {
-	    UNCONST(DerivedGIn, output, my_output);
-
-	    switch (f)
-	    {
-	    case Identity: my_output = input; break;
-	    case Rectifier: my_output = finput.array().unaryExpr(drectifier_functor()) * input.array(); break;
-	    case Tanh: my_output = foutput.array().unaryExpr(tanh_functor()) * input.array(); break;
-	    case HardTanh: my_output = finput.array().unaryExpr(hardtanh_functor()) * input.array(); break;
-            case InvalidFunction: std::abort();
-	    }
-        }
+  int size;
+  activation_function_type f;
+
+ public:
+  Activation_function() : size(0), f(Rectifier) { }
+
+  void resize(int size) { this->size = size; }
+  void set_activation_function(activation_function_type f) { this->f = f; }
+
+  template <typename Engine>
+  void initialize(Engine &engine, bool init_normal, double init_range) { }
+
+  int n_inputs () const { return size; }
+  int n_outputs () const { return size; }
+
+  template <typename DerivedIn, typename DerivedOut>
+  void fProp(const MatrixBase<DerivedIn> &input, const MatrixBase<DerivedOut> &output) const
+  {
+    UNCONST(DerivedOut, output, my_output);
+
+    switch (f)
+    {
+      case Identity: my_output = input; break;
+      case Rectifier: my_output = input.unaryExpr(rectifier_functor()); break;
+      case Tanh: my_output = input.unaryExpr(tanh_functor()); break;
+      case HardTanh: my_output = input.unaryExpr(hardtanh_functor()); break;
+      case InvalidFunction: std::abort();
+    }
+  }
+
+  template <typename DerivedGOut, typename DerivedGIn, typename DerivedIn, typename DerivedOut>
+  void bProp(const MatrixBase<DerivedGOut> &input,
+             MatrixBase<DerivedGIn> &output,
+             const MatrixBase<DerivedIn> &finput,
+             const MatrixBase<DerivedOut> &foutput) const
+  {
+    UNCONST(DerivedGIn, output, my_output);
+
+    switch (f)
+    {
+      case Identity: my_output = input; break;
+      case Rectifier: my_output = finput.array().unaryExpr(drectifier_functor()) * input.array(); break;
+      case Tanh: my_output = foutput.array().unaryExpr(tanh_functor()) * input.array(); break;
+      case HardTanh: my_output = finput.array().unaryExpr(hardtanh_functor()) * input.array(); break;
+      case InvalidFunction: std::abort();
+    }
+  }
 };
 
 } // namespace nplm
diff --git a/src/SoftmaxLoss.h b/src/SoftmaxLoss.h
index bc55762..d89cde6 100644
--- a/src/SoftmaxLoss.h
+++ b/src/SoftmaxLoss.h
@@ -1,7 +1,6 @@
-	#ifndef SOFTMAXLOSS_H
+#ifndef SOFTMAXLOSS_H
 #define SOFTMAXLOSS_H
 
-//#include <../3rdparty/Eigen/Dense>
 #include <Eigen/Dense>
 #include "multinomial.h"
 #include "util.h"
@@ -20,55 +19,55 @@ enum loss_function_type { LogLoss, NCELoss, InvalidLoss };
 
 inline loss_function_type string_to_loss_function (const std::string &s)
 {
-    if (s == "log")
-        return LogLoss;
-    else if (s == "nce")
-        return NCELoss;
-    else
-        return InvalidLoss;
+  if (s == "log")
+    return LogLoss;
+  else if (s == "nce")
+    return NCELoss;
+  else
+    return InvalidLoss;
 }
 
 inline std::string loss_function_to_string (loss_function_type f)
 {
-    if (f == LogLoss)
-        return "log";
-    else if (f == NCELoss)
-        return "nce";
+  if (f == LogLoss)
+    return "log";
+  else if (f == NCELoss)
+    return "nce";
 }
 
 /// Note: Outputs log-probabilities.
 
 struct SoftmaxLogLoss
 {
-    template <typename DerivedI, typename DerivedW, typename DerivedO>
-    void fProp(const MatrixBase<DerivedI> &input, const MatrixBase<DerivedW> &output_words, const MatrixBase<DerivedO> &output_const, double &loss)
+  template <typename DerivedI, typename DerivedW, typename DerivedO>
+  void fProp(const MatrixBase<DerivedI> &input, const MatrixBase<DerivedW> &output_words, const MatrixBase<DerivedO> &output_const, double &loss)
+  {
+    UNCONST(DerivedO, output_const, output);
+
+    double log_likelihood = 0.0;
+
+#pragma omp parallel for reduction(+:log_likelihood)
+    for (int train_id = 0; train_id < input.cols(); train_id++)
     {
-        UNCONST(DerivedO, output_const, output);
-
-	double log_likelihood = 0.0;
-
-        #pragma omp parallel for reduction(+:log_likelihood)
-	for (int train_id = 0; train_id < input.cols(); train_id++)
-	{
-	    double normalization = logsum(input.col(train_id));
-	    output.col(train_id).array() = input.col(train_id).array() - normalization;
-	    log_likelihood += output(output_words(train_id), train_id);
-	}
-	loss = log_likelihood;
+      double normalization = logsum(input.col(train_id));
+      output.col(train_id).array() = input.col(train_id).array() - normalization;
+      log_likelihood += output(output_words(train_id), train_id);
     }
-
-    template <typename DerivedW, typename DerivedO, typename DerivedI>
-    void bProp(const MatrixBase<DerivedW> &output_words, const MatrixBase<DerivedO> &output, const MatrixBase<DerivedI> &grad_input_const)
+    loss = log_likelihood;
+  }
+
+  template <typename DerivedW, typename DerivedO, typename DerivedI>
+  void bProp(const MatrixBase<DerivedW> &output_words, const MatrixBase<DerivedO> &output, const MatrixBase<DerivedI> &grad_input_const)
+  {
+    UNCONST(DerivedI, grad_input_const, grad_input);
+    grad_input.setZero();
+#pragma omp parallel for
+    for (int train_id = 0; train_id < output.cols(); train_id++)
     {
-        UNCONST(DerivedI, grad_input_const, grad_input);
-        grad_input.setZero();
-        #pragma omp parallel for
-	for (int train_id = 0; train_id < output.cols(); train_id++)
-	{
-	    grad_input(output_words(train_id), train_id) += 1.;
-	    grad_input.col(train_id) -= output.col(train_id).array().exp().matrix();
-	}
+      grad_input(output_words(train_id), train_id) += 1.;
+      grad_input.col(train_id) -= output.col(train_id).array().exp().matrix();
     }
+  }
 };
 
 ///// Softmax layer plus NCE loss function.
@@ -81,55 +80,55 @@ struct SoftmaxLogLoss
 template <typename Multinomial>
 class SoftmaxNCELoss
 {
-    const Multinomial &unigram;
+  const Multinomial &unigram;
 
-public:
-    SoftmaxNCELoss(const Multinomial &unigram) 
+ public:
+  SoftmaxNCELoss(const Multinomial &unigram)
       : unigram(unigram)
+  {
+  }
+
+  template <typename DerivedI, typename DerivedW, typename DerivedO>
+  void fProp(const MatrixBase<DerivedI> &scores,
+             const MatrixBase<DerivedW> &minibatch_samples,
+             const MatrixBase<DerivedO> &output_const, double &loss)
+  {
+    UNCONST(DerivedO, output_const, output);
+    double log_likelihood = 0.0;
+    int num_noise_samples = minibatch_samples.rows()-1;
+    double log_num_noise_samples = std::log(num_noise_samples);
+#pragma omp parallel for reduction(+:log_likelihood) schedule(static)
+    for (int train_id = 0; train_id < scores.cols(); train_id++)
     {
+      for (int sample_id = 0;sample_id < minibatch_samples.rows(); sample_id++)
+      {
+        int sample = minibatch_samples(sample_id, train_id);
+        // To avoid zero or infinite probabilities,
+        // never take exp of score without normalizing first,
+        // even if it's a little slower...
+        double score = scores(sample_id, train_id);
+        double score_noise = log_num_noise_samples + unigram.logprob(sample);
+        double z = logadd(score, score_noise);
+        double logprob = score - z;
+        double logprob_noise = score_noise - z;
+        output(sample_id, train_id) = std::exp(logprob);
+        log_likelihood += sample_id == 0 ? logprob : logprob_noise;
+      }
     }
-
-    template <typename DerivedI, typename DerivedW, typename DerivedO>
-    void fProp(const MatrixBase<DerivedI> &scores, 
-	       const MatrixBase<DerivedW> &minibatch_samples,
-	       const MatrixBase<DerivedO> &output_const, double &loss)
-    {
-        UNCONST(DerivedO, output_const, output);
-	double log_likelihood = 0.0;
-	int num_noise_samples = minibatch_samples.rows()-1;
-	double log_num_noise_samples = std::log(num_noise_samples);
-        #pragma omp parallel for reduction(+:log_likelihood) schedule(static)
-	for (int train_id = 0; train_id < scores.cols(); train_id++)
-	{
-	    for (int sample_id = 0;sample_id < minibatch_samples.rows(); sample_id++)
-	    {
-	        int sample = minibatch_samples(sample_id, train_id);
-		// To avoid zero or infinite probabilities,
-		// never take exp of score without normalizing first,
-		// even if it's a little slower...
-		double score = scores(sample_id, train_id);
-		double score_noise = log_num_noise_samples + unigram.logprob(sample);
-		double z = logadd(score, score_noise);
-		double logprob = score - z;
-		double logprob_noise = score_noise - z;
-		output(sample_id, train_id) = std::exp(logprob);
-		log_likelihood += sample_id == 0 ? logprob : logprob_noise;
-	    }
-	}
-	loss = log_likelihood;
-    }
-
-    template <typename DerivedO, typename DerivedI>
-    void bProp(const MatrixBase<DerivedO> &probs, const MatrixBase<DerivedI> &output_const)
+    loss = log_likelihood;
+  }
+
+  template <typename DerivedO, typename DerivedI>
+  void bProp(const MatrixBase<DerivedO> &probs, const MatrixBase<DerivedI> &output_const)
+  {
+    UNCONST(DerivedI, output_const, output);
+#pragma omp parallel for schedule(static)
+    for (int train_id = 0; train_id < probs.cols(); train_id++)
     {
-        UNCONST(DerivedI, output_const, output);
-        #pragma omp parallel for schedule(static)
-	for (int train_id = 0; train_id < probs.cols(); train_id++)
-	{
-	    output.col(train_id) = -probs.col(train_id);
-	    output(0, train_id) += 1.0;
-	}
+      output.col(train_id) = -probs.col(train_id);
+      output(0, train_id) += 1.0;
     }
+  }
 };
 
 } // namespace nplm
diff --git a/src/USCMatrix.h b/src/USCMatrix.h
index 02aeb33..784fa1b 100644
--- a/src/USCMatrix.h
+++ b/src/USCMatrix.h
@@ -1,7 +1,6 @@
 #ifndef USCMATRIX_H
 #define USCMATRIX_H
 
-//#include <../3rdparty/Eigen/Dense>
 #include <Eigen/Dense>
 #include "maybe_omp.h"
 #include "util.h"
@@ -34,108 +33,108 @@ template <typename Scalar, typename Index=int> // should be EIGEN_DEFAULT_DENSE_
 class USCMatrix
 {
 
-public:
-    Matrix<Index,Dynamic,Dynamic> indexes;
-    Matrix<Scalar,Dynamic,Dynamic> values;
-    int m_rows;
+ public:
+  Matrix<Index,Dynamic,Dynamic> indexes;
+  Matrix<Scalar,Dynamic,Dynamic> values;
+  int m_rows;
 
-    USCMatrix() : m_rows(0) { }
+  USCMatrix() : m_rows(0) { }
 
-    template <typename Indexes, typename Values>
-    USCMatrix(Index rows, const MatrixBase<Indexes> &indexes, const MatrixBase<Values> &values) 
-    : 
-      indexes(indexes), 
-      values(values), 
-      m_rows(rows) 
-    { }
+  template <typename Indexes, typename Values>
+  USCMatrix(Index rows, const MatrixBase<Indexes> &indexes, const MatrixBase<Values> &values)
+      :
+      indexes(indexes),
+      values(values),
+      m_rows(rows)
+  { }
 
-    USCMatrix(Index rows, Index nnz, Index cols) 
-    : 
-      indexes(Matrix<Index,Dynamic,Dynamic>(nnz, cols)), 
+  USCMatrix(Index rows, Index nnz, Index cols)
+      :
+      indexes(Matrix<Index,Dynamic,Dynamic>(nnz, cols)),
       values(Matrix<Scalar,Dynamic,Dynamic>(nnz, cols)),
       m_rows(rows)
-    { 
-        this->indexes.fill(-1); 
-    }
-
-    Index rows() const { return m_rows; }
-    Index cols() const { return indexes.cols(); }
-
-    void resize(Index rows, Index nnz, Index cols) {
-        indexes.resize(nnz, cols);
-        values.resize(nnz, cols);
-	m_rows = rows;
-    }
+  {
+    this->indexes.fill(-1);
+  }
+
+  Index rows() const { return m_rows; }
+  Index cols() const { return indexes.cols(); }
+
+  void resize(Index rows, Index nnz, Index cols) {
+    indexes.resize(nnz, cols);
+    values.resize(nnz, cols);
+    m_rows = rows;
+  }
 };
 
 // Dense matrix - sparse matrix product
 // a is presumably very wide
 template <typename DerivedA, typename ScalarB, typename Index, typename DerivedC>
-void uscgemm(double alpha, const MatrixBase<DerivedA> &a, 
-	     const USCMatrix<ScalarB,Index> &b,
-	     const MatrixBase<DerivedC> &c_const)
+void uscgemm(double alpha, const MatrixBase<DerivedA> &a,
+             const USCMatrix<ScalarB,Index> &b,
+             const MatrixBase<DerivedC> &c_const)
 {
-    UNCONST(DerivedC, c_const, c);
-    eigen_assert(a.rows() == c.rows());
-    eigen_assert(a.cols() == b.rows());
-    eigen_assert(b.cols() == c.cols());
-
-    #pragma omp parallel for
-    for (Index k=0; k<b.cols(); k++)
-        for (Index r=0; r<b.indexes.rows(); r++)
-	{
-	    Index j = b.indexes(r,k);
-	    eigen_assert(j >= 0);
-	    eigen_assert(j < a.cols());
-	    c.col(k) += alpha * a.col(j) * b.values(r,k);
-	}
+  UNCONST(DerivedC, c_const, c);
+  eigen_assert(a.rows() == c.rows());
+  eigen_assert(a.cols() == b.rows());
+  eigen_assert(b.cols() == c.cols());
+
+#pragma omp parallel for
+  for (Index k=0; k<b.cols(); k++)
+    for (Index r=0; r<b.indexes.rows(); r++)
+    {
+      Index j = b.indexes(r,k);
+      eigen_assert(j >= 0);
+      eigen_assert(j < a.cols());
+      c.col(k) += alpha * a.col(j) * b.values(r,k);
+    }
 }
 
 // sparse matrix - dense matrix product
 template <typename ScalarA, typename Index, typename DerivedB, typename DerivedC>
-void uscgemm(double alpha, 
-	     const USCMatrix<ScalarA,Index> &a,
-	     const MatrixBase<DerivedB> &b, 
-	     const MatrixBase<DerivedC> &c_const)
+void uscgemm(double alpha,
+             const USCMatrix<ScalarA,Index> &a,
+             const MatrixBase<DerivedB> &b,
+             const MatrixBase<DerivedC> &c_const)
 {
-    UNCONST(DerivedC, c_const, c);
-    eigen_assert(a.rows() == c.rows());
-    eigen_assert(a.cols() == b.rows());
-    eigen_assert(b.cols() == c.cols());
-
-    // This needs to be tuned for each system, unfortunately,
-    // and seems to vary a lot. A lot.
-    int i_blocks = omp_get_num_threads()*16;
-
-    // Assume only one block in k direction.
-    // We don't need to explicitly block in the j direction.
-    #pragma omp parallel for
-    for (Index ib=0; ib<i_blocks; ib++)
-        for (Index j=0; j<a.cols(); j++)
-	    for (Index r=0; r<a.indexes.rows(); r++)
-	    {
-	        Index i = a.indexes(r,j);
-		eigen_assert(i >= 0);
-		eigen_assert(i < c.rows());
-		if (i % i_blocks == ib)
-		    c.row(i) += alpha * a.values(r,j) * b.row(j);
-	    }
-
-    /*
+  UNCONST(DerivedC, c_const, c);
+  eigen_assert(a.rows() == c.rows());
+  eigen_assert(a.cols() == b.rows());
+  eigen_assert(b.cols() == c.cols());
+
+  // This needs to be tuned for each system, unfortunately,
+  // and seems to vary a lot. A lot.
+  int i_blocks = omp_get_num_threads()*16;
+
+  // Assume only one block in k direction.
+  // We don't need to explicitly block in the j direction.
+#pragma omp parallel for
+  for (Index ib=0; ib<i_blocks; ib++)
+    for (Index j=0; j<a.cols(); j++)
+      for (Index r=0; r<a.indexes.rows(); r++)
+      {
+        Index i = a.indexes(r,j);
+        eigen_assert(i >= 0);
+        eigen_assert(i < c.rows());
+        if (i % i_blocks == ib)
+          c.row(i) += alpha * a.values(r,j) * b.row(j);
+      }
+
+  /*
     If c.cols() is really large, then theoretically it seems like we should do:
 
     parallel for blocks in i direction
-        for blocks in j direction
-            pack block of a into smaller sparse matrix
-            for blocks in k direction
-                for k
-                    for i (sparse)
-                        for j
-                            c(i,k) += a(i,j) * b(j,k)
+        for blocks in j direction
+            pack block of a into smaller sparse matrix
+            for blocks in k direction
+                for k
+                    for i (sparse)
+                        for j
+                            c(i,k) += a(i,j) * b(j,k)
 
     However, the copying of blocks of a doesn't seem practical for any realistic
     sizes of c.cols().
-    */
+  */
 }
 
 // Dense matrix - dense matrix product, but masked by a sparse matrix,
@@ -147,45 +146,45 @@ void uscgemm(double alpha,
 
 template <typename DerivedA, typename DerivedB, typename ScalarC, typename Index>
 void uscgemm_masked(double alpha,
-		    const MatrixBase<DerivedA> &a,
-		    const MatrixBase<DerivedB> &b,
-		    USCMatrix<ScalarC,Index> &c)
+                    const MatrixBase<DerivedA> &a,
+                    const MatrixBase<DerivedB> &b,
+                    USCMatrix<ScalarC,Index> &c)
 {
-    eigen_assert(a.rows() == c.rows());
-    eigen_assert(a.cols() == b.rows());
-    eigen_assert(b.cols() == c.cols());
-
-    #pragma omp parallel for
-    for (Index k=0; k<b.cols(); k++)
-        for (Index r=0; r<c.indexes.rows(); r++)
-	{
-	    Index i = c.indexes(r, k);
-	    eigen_assert(i >= 0);
-	    eigen_assert(i < a.rows());
-	    c.values(r, k) += alpha * a.row(i) * b.col(k);
-	}
+  eigen_assert(a.rows() == c.rows());
+  eigen_assert(a.cols() == b.rows());
+  eigen_assert(b.cols() == c.cols());
+
+#pragma omp parallel for
+  for (Index k=0; k<b.cols(); k++)
+    for (Index r=0; r<c.indexes.rows(); r++)
+    {
+      Index i = c.indexes(r, k);
+      eigen_assert(i >= 0);
+      eigen_assert(i < a.rows());
+      c.values(r, k) += alpha * a.row(i) * b.col(k);
+    }
 }
 
 // sparse matrix - dense vector product
 template <typename ScalarA, typename Index, typename DerivedB, typename DerivedC>
-void uscgemv(double alpha, 
-	     const USCMatrix<ScalarA,Index> &a,
-	     const MatrixBase<DerivedB> &b,
-	     const MatrixBase<DerivedC> &c_const)
+void uscgemv(double alpha,
+             const USCMatrix<ScalarA,Index> &a,
+             const MatrixBase<DerivedB> &b,
+             const MatrixBase<DerivedC> &c_const)
 {
-    UNCONST(DerivedC, c_const, c);
-    eigen_assert(a.rows() == c.rows());
-    eigen_assert(a.cols() == b.rows());
-    eigen_assert(b.cols() == 1 && c.cols() == 1);
-
-    for (Index j=0; j<a.cols(); j++)
-        for (Index r=0; r<a.indexes.rows(); r++)
-	{
-	    Index i = a.indexes(r,j);
-	    eigen_assert(i >= 0);
-	    eigen_assert(i < c.rows());
-	    c(i) += alpha * a.values(r,j) * b(j);
-	}
+  UNCONST(DerivedC, c_const, c);
+  eigen_assert(a.rows() == c.rows());
+  eigen_assert(a.cols() == b.rows());
+  eigen_assert(b.cols() == 1 && c.cols() == 1);
+
+  for (Index j=0; j<a.cols(); j++)
+    for (Index r=0; r<a.indexes.rows(); r++)
+    {
+      Index i = a.indexes(r,j);
+      eigen_assert(i >= 0);
+      eigen_assert(i < c.rows());
+      c(i) += alpha * a.values(r,j) * b(j);
+    }
 }
 
 }
diff --git a/src/graphClasses.h b/src/graphClasses.h
index d3c0c4a..cd80a4c 100644
--- a/src/graphClasses.h
+++ b/src/graphClasses.h
@@ -3,7 +3,6 @@
 
 #include <cstdlib>
 #include "neuralClasses.h"
-//#include <../3rdparty/Eigen/Dense>
 #include <Eigen/Dense>
 
 namespace nplm
@@ -11,50 +10,50 @@ namespace nplm
 
 template <class X>
 class Node {
-    public:
-        X * param; //what parameter is this
-        //vector <void *> children;
-        //vector <void *> parents;
-	Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> fProp_matrix;
-	Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> bProp_matrix;
-	int minibatch_size;
-
-    public:
-        Node() : param(NULL), minibatch_size(0) { }
-
-        Node(X *input_param, int minibatch_size)
-	  : param(input_param),
-	    minibatch_size(minibatch_size)
-        {
-	    resize(minibatch_size);
-        }
-
-	void resize(int minibatch_size)
-	{
-	    this->minibatch_size = minibatch_size;
-	    if (param->n_outputs() != -1)
-	    {
-	        fProp_matrix.setZero(param->n_outputs(), minibatch_size);
-	    }
-            if (param->n_inputs() != -1)
-            {
-	        bProp_matrix.setZero(param->n_inputs(), minibatch_size);
-            }
-	}
-
-	void resize() { resize(minibatch_size); }
-
-        /*
-        void Fprop(Matrix<double,Dynamic,Dynamic> & input,int n_cols)
-        {
-            param->fProp(input,fProp_matrix,0,0,n_cols);
-        }
-        void Fprop(Matrix<double,1,Dynamic> & input,int n_cols)
-        {
-            param->fProp(input,fProp_matrix,0,0,n_cols);
-        }
-        */
-        //for f prop, just call the fProp node of the particular parameter. 
+ public:
+  X * param; //what parameter is this
+  //vector <void *> children;
+  //vector <void *> parents;
+  Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> fProp_matrix;
+  Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> bProp_matrix;
+  int minibatch_size;
+
+ public:
+  Node() : param(NULL), minibatch_size(0) { }
+
+  Node(X *input_param, int minibatch_size)
+      : param(input_param),
+        minibatch_size(minibatch_size)
+  {
+    resize(minibatch_size);
+  }
+
+  void resize(int minibatch_size)
+  {
+    this->minibatch_size = minibatch_size;
+    if (param->n_outputs() != -1)
+    {
+      fProp_matrix.setZero(param->n_outputs(), minibatch_size);
+    }
+    if (param->n_inputs() != -1)
+    {
+      bProp_matrix.setZero(param->n_inputs(), minibatch_size);
+    }
+  }
+
+  void resize() { resize(minibatch_size); }
+
+  /*
+    void Fprop(Matrix<double,Dynamic,Dynamic> & input,int n_cols)
+    {
+      param->fProp(input,fProp_matrix,0,0,n_cols);
+    }
+    void Fprop(Matrix<double,1,Dynamic> & input,int n_cols)
+    {
+      param->fProp(input,fProp_matrix,0,0,n_cols);
+    }
+  */
+  //for f prop, just call the fProp node of the particular parameter.
 
 };
 
diff --git a/src/neuralClasses.h b/src/neuralClasses.h
index 26dae06..ee7c3f0 100644
--- a/src/neuralClasses.h
+++ b/src/neuralClasses.h
@@ -6,8 +6,7 @@
 #include <cmath>
 #include <vector>
 
-#include <boost/unordered_map.hpp> 
-//#include <../3rdparty/Eigen/Dense>
+#include <boost/unordered_map.hpp>
 #include <Eigen/Dense>
 #include "maybe_omp.h"
 
@@ -35,7 +34,7 @@ using Eigen::Dynamic;
 typedef boost::unordered_map<int,bool> int_map;
 
 struct Clipper{
-  double operator() (double x) const { 
+  double operator() (double x) const {
     return std::min(0.5, std::max(x,-0.5));
     //return(x);
   }
@@ -44,7 +43,7 @@ struct Clipper{
 
 class Linear_layer
 {
-    private: 
+    private:
         Matrix<double,Dynamic,Dynamic> U;
         Matrix<double,Dynamic,Dynamic> U_gradient;
         Matrix<double,Dynamic,Dynamic> U_velocity;
@@ -60,12 +59,12 @@ class Linear_layer
     friend class model;
 
     public:
-	Linear_layer() { }
+  Linear_layer() { }
         Linear_layer(int rows, int cols) { resize(rows, cols); }
 
-	void resize(int rows, int cols)
-	{
-	    U.setZero(rows, cols);
+  void resize(int rows, int cols)
+  {
+      U.setZero(rows, cols);
       U_gradient.setZero(rows, cols);
       //U_running_gradient.setZero(rows, cols);
       //U_running_parameter_updates.setZero(rows, cols);
@@ -74,21 +73,21 @@ class Linear_layer
       b_gradient.setZero(rows);
       //b_running_gradient.resize(rows);
       //b_velocity.resize(rows);
-	}
+  }
 
-	void read_weights(std::ifstream &U_file) { readMatrix(U_file, U); }
-	void write_weights(std::ofstream &U_file) { writeMatrix(U, U_file); }
+  void read_weights(std::ifstream &U_file) { readMatrix(U_file, U); }
+  void write_weights(std::ofstream &U_file) { writeMatrix(U, U_file); }
   void read_biases(std::ifstream &b_file) { readMatrix(b_file, b); }
   void write_biases(std::ofstream &b_file) { writeMatrix(b, b_file); }
 
 
-	template <typename Engine>
-	void initialize(Engine &engine,
+  template <typename Engine>
+  void initialize(Engine &engine,
       bool init_normal,
       double init_range,
       string &parameter_update,
       double adagrad_epsilon)
-	{
+  {
       if (parameter_update == "ADA") {
         U_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(U.rows(),U.cols())*adagrad_epsilon;
         b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon;
@@ -100,58 +99,58 @@ class Linear_layer
         b_running_parameter_update.setZero(b.size());
       }
 
-	    initMatrix(engine, U, init_normal, init_range);
+      initMatrix(engine, U, init_normal, init_range);
       initBias(engine, b, init_normal, init_range);
-	}	  
+  }
 
-	int n_inputs () const { return U.cols(); }
-	int n_outputs () const { return U.rows(); }
+  int n_inputs () const { return U.cols(); }
+  int n_outputs () const { return U.rows(); }
 
   template <typename DerivedIn, typename DerivedOut>
-	void fProp(const MatrixBase<DerivedIn> &input,
+  void fProp(const MatrixBase<DerivedIn> &input,
       const MatrixBase<DerivedOut> &output) const
   {
       UNCONST(DerivedOut, output, my_output);
       my_output.leftCols(input.cols()).noalias() = U*input;
       int num_examples = input.cols();
-      for (int example = 0;example < num_examples;example++) 
+      for (int example = 0;example < num_examples;example++)
       {
           my_output.leftCols(input.cols()).col(example) += b;
       }
   }
 
-	// Sparse input
+  // Sparse input
   template <typename ScalarIn, typename DerivedOut>
-	void fProp(const USCMatrix<ScalarIn> &input,
+  void fProp(const USCMatrix<ScalarIn> &input,
       const MatrixBase<DerivedOut> &output_const) const
   {
-	    UNCONST(DerivedOut, output_const, output);
-	    output.setZero();
-	    uscgemm(1.0, U, input, output.leftCols(input.cols()));
-      // Each column corresponds to a training example. We 
+      UNCONST(DerivedOut, output_const, output);
+      output.setZero();
+      uscgemm(1.0, U, input, output.leftCols(input.cols()));
+      // Each column corresponds to a training example. We
       // parallelize the adding of biases per dimension.
       int num_examples = input.cols();
-      for (int example = 0;example < num_examples;example++) 
+      for (int example = 0;example < num_examples;example++)
       {
           output.leftCols(input.cols()).col(example) += b;
       }
   }
 
   template <typename DerivedGOut, typename DerivedGIn>
-	void bProp(const MatrixBase<DerivedGOut> &input,
+  void bProp(const MatrixBase<DerivedGOut> &input,
       MatrixBase<DerivedGIn> &output) const
   {
-	    UNCONST(DerivedGIn, output, my_output);
-	    my_output.noalias() = U.transpose()*input;
-	}
+      UNCONST(DerivedGIn, output, my_output);
+      my_output.noalias() = U.transpose()*input;
+  }
 
   template <typename DerivedGOut, typename DerivedIn>
-  void computeGradient( const MatrixBase<DerivedGOut> &bProp_input, 
-     const MatrixBase<DerivedIn> &fProp_input, 
+  void computeGradient( const MatrixBase<DerivedGOut> &bProp_input,
+     const MatrixBase<DerivedIn> &fProp_input,
      double learning_rate, double momentum, double L2_reg)
   {
       U_gradient.noalias() = bProp_input*fProp_input.transpose();
-      
+
       // get the bias gradient for all dimensions in parallel
       int size = b.size();
       b_gradient = bProp_input.rowwise().sum();
@@ -172,7 +171,7 @@ class Linear_layer
       {
           U += learning_rate * U_gradient;
           b += learning_rate * b_gradient;
-          /* 
+          /*
           //UPDATE CLIPPING
           U += (learning_rate*U_gradient).array().unaryExpr(Clipper()).matrix();
           b += (learning_rate*b_gradient).array().unaryExpr(Clipper()).matrix();
@@ -181,17 +180,17 @@ class Linear_layer
           //b += learning_rate*(b_gradient.array().unaryExpr(Clipper())).matrix();
           */
       }
-	}
+  }
 
   template <typename DerivedGOut, typename DerivedIn>
-  void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input, 
-      const MatrixBase<DerivedIn> &fProp_input, 
+  void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input,
+      const MatrixBase<DerivedIn> &fProp_input,
       double learning_rate,
       double L2_reg)
   {
       U_gradient.noalias() = bProp_input*fProp_input.transpose();
 
-      
+
       // get the bias gradient for all dimensions in parallel
       int size = b.size();
       b_gradient.noalias() = bProp_input.rowwise().sum();
@@ -206,7 +205,7 @@ class Linear_layer
       #pragma omp parallel for
       for (int col=0; col<U.cols(); col++) {
         U_running_gradient.col(col) += U_gradient.col(col).array().square().matrix();
-        U.col(col) += learning_rate * (U_gradient.col(col).array() / 
+        U.col(col) += learning_rate * (U_gradient.col(col).array() /
                   U_running_gradient.col(col).array().sqrt()).matrix();
         /*
         //UPDATE CLIPPING
@@ -223,8 +222,8 @@ class Linear_layer
   }
 
   template <typename DerivedGOut, typename DerivedIn>
-  void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input, 
-      const MatrixBase<DerivedIn> &fProp_input, 
+  void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input,
+      const MatrixBase<DerivedIn> &fProp_input,
       double learning_rate,
       double L2_reg,
       double conditioning_constant,
@@ -234,7 +233,7 @@ class Linear_layer
       U_gradient.noalias() = bProp_input*fProp_input.transpose();
 
       Array<double,Dynamic,1> b_current_parameter_update;
-      
+
       // get the bias gradient for all dimensions in parallel
       int size = b.size();
       b_gradient.noalias() = bProp_input.rowwise().sum();
@@ -250,7 +249,7 @@ class Linear_layer
       //cerr<<"U gradient is "<<U_gradient<<endl;
       for (int col=0; col<U.cols(); col++) {
         Array<double,Dynamic,1> U_current_parameter_update;
-        U_running_gradient.col(col) = decay*U_running_gradient.col(col) + 
+        U_running_gradient.col(col) = decay*U_running_gradient.col(col) +
                             (1-decay)*U_gradient.col(col).array().square().matrix();
         //cerr<<"U running gradient is "<<U_running_gradient.col(col)<<endl;
         //getchar();
@@ -262,22 +261,22 @@ class Linear_layer
         //update the running parameter update
         U_running_parameter_update.col(col) = decay*U_running_parameter_update.col(col) +
                                           (1.-decay)*U_current_parameter_update.square().matrix();
-        U.col(col) += learning_rate*U_current_parameter_update.matrix();  
+        U.col(col) += learning_rate*U_current_parameter_update.matrix();
       }
-      b_running_gradient = decay*b_running_gradient + 
+      b_running_gradient = decay*b_running_gradient +
                         (1.-decay)*b_gradient.array().square().matrix();
       b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/
                                    (b_running_gradient.array()+conditioning_constant).sqrt()) *
                                   b_gradient.array();
-      b_running_parameter_update = decay*(b_running_parameter_update) + 
+      b_running_parameter_update = decay*(b_running_parameter_update) +
                                 (1.-decay)*b_current_parameter_update.square().matrix();
       b += learning_rate*b_current_parameter_update.matrix();
   }
 
 
   template <typename DerivedGOut, typename DerivedIn, typename DerivedGW>
-  void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input, 
-    const MatrixBase<DerivedIn> &fProp_input, 
+  void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input,
+    const MatrixBase<DerivedIn> &fProp_input,
     const MatrixBase<DerivedGW> &gradient) const
   {
       UNCONST(DerivedGW, gradient, my_gradient);
@@ -355,17 +354,17 @@ class Output_word_embeddings
     template <typename DerivedIn, typename DerivedOut>
     void fProp(const MatrixBase<DerivedIn> &input,
     const MatrixBase<DerivedOut> &output) const
-	  {
+    {
         UNCONST(DerivedOut, output, my_output);
         my_output = ((*W) * input).colwise() + b;
-	  }
+    }
 
-	// Sparse output version
+  // Sparse output version
     template <typename DerivedIn, typename DerivedOutI, typename DerivedOutV>
     void fProp(const MatrixBase<DerivedIn> &input,
     const MatrixBase<DerivedOutI> &samples,
     const MatrixBase<DerivedOutV> &output) const
-	  {
+    {
         UNCONST(DerivedOutV, output, my_output);
         #pragma omp parallel for
         for (int instance_id = 0; instance_id < samples.cols(); instance_id++)
@@ -378,13 +377,13 @@ class Output_word_embeddings
         USCMatrix<double> sparse_output(W->rows(), samples, my_output);
         uscgemm_masked(1.0, *W, input, sparse_output);
         my_output = sparse_output.values; // too bad, so much copying
-	  }
+    }
 
     // Return single element of output matrix
     template <typename DerivedIn>
-    double fProp(const MatrixBase<DerivedIn> &input, 
+    double fProp(const MatrixBase<DerivedIn> &input,
            int word,
-           int instance) const 
+           int instance) const
     {
         return W->row(word).dot(input.col(instance)) + b(word);
     }
@@ -395,19 +394,19 @@ class Output_word_embeddings
     void bProp(const MatrixBase<DerivedGOut> &input_bProp_matrix,
     const MatrixBase<DerivedGIn> &bProp_matrix) const
     {
-	    // W is vocab_size x output_embedding_dimension
-	    // input_bProp_matrix is vocab_size x minibatch_size
-	    // bProp_matrix is output_embedding_dimension x minibatch_size
-	    UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix);
-	    my_bProp_matrix.leftCols(input_bProp_matrix.cols()).noalias() =
+      // W is vocab_size x output_embedding_dimension
+      // input_bProp_matrix is vocab_size x minibatch_size
+      // bProp_matrix is output_embedding_dimension x minibatch_size
+      UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix);
+      my_bProp_matrix.leftCols(input_bProp_matrix.cols()).noalias() =
         W->transpose() * input_bProp_matrix;
-	  }
+    }
 
     template <typename DerivedIn, typename DerivedGOut>
           void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings,
              const MatrixBase<DerivedGOut> &bProp_input,
              double learning_rate,
-             double momentum) //not sure if we want 	to use momentum here
+             double momentum) //not sure if we want   to use momentum here
     {
         // W is vocab_size x output_embedding_dimension
         // b is vocab_size x 1
@@ -418,15 +417,15 @@ class Output_word_embeddings
 
         /*
         //GRADIENT CLIPPING
-        W->noalias() += learning_rate * 
+        W->noalias() += learning_rate *
           ((bProp_input * predicted_embeddings.transpose()).array().unaryExpr(Clipper())).matrix();
         b += learning_rate * (bProp_input.rowwise().sum().array().unaryExpr(Clipper())).matrix();
         //UPDATE CLIPPING
-        W->noalias() += (learning_rate * 
+        W->noalias() += (learning_rate *
         (bProp_input * predicted_embeddings.transpose())).array().unaryExpr(Clipper()).matrix();
         b += (learning_rate * (bProp_input.rowwise().sum())).array().unaryExpr(Clipper()).matrix();
         */
-	  }
+    }
 
     template <typename DerivedIn, typename DerivedGOut>
           void computeGradientAdagrad(
@@ -451,7 +450,7 @@ class Output_word_embeddings
         *W += (learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix();
         b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix();
         */
-	  }
+    }
 
     template <typename DerivedIn, typename DerivedGOut>
           void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings,
@@ -480,14 +479,14 @@ class Output_word_embeddings
         b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/
                                      (b_running_gradient.array()+conditioning_constant).sqrt())*
                                      b_gradient.array();
-        W_running_parameter_update = decay*W_running_parameter_update + 
+        W_running_parameter_update = decay*W_running_parameter_update +
                                     (1.-decay)*W_current_parameter_update.square().matrix();
         b_running_parameter_update = decay*b_running_parameter_update +
                                     (1.-decay)*b_current_parameter_update.square().matrix();
 
         *W += learning_rate*W_current_parameter_update.matrix();
         b += learning_rate*b_current_parameter_update.matrix();
-	  }
+    }
 
     // Sparse versions
 
@@ -499,46 +498,46 @@ class Output_word_embeddings
         UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix);
         my_bProp_matrix.setZero();
         uscgemm(1.0,
-            W->transpose(), 
+            W->transpose(),
             USCMatrix<double>(W->rows(), samples, weights),
             my_bProp_matrix.leftCols(samples.cols())); // narrow bProp_matrix for possible short minibatch
     }
 
-	template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV>
+  template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV>
         void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings,
-			     const MatrixBase<DerivedGOutI> &samples,
-			     const MatrixBase<DerivedGOutV> &weights,
-			     double learning_rate, double momentum) //not sure if we want to use momentum here
-	{
+           const MatrixBase<DerivedGOutI> &samples,
+           const MatrixBase<DerivedGOutV> &weights,
+           double learning_rate, double momentum) //not sure if we want to use momentum here
+  {
       //cerr<<"in gradient"<<endl;
-	    USCMatrix<double> gradient_output(W->rows(), samples, weights);
-	    uscgemm(learning_rate,
+      USCMatrix<double> gradient_output(W->rows(), samples, weights);
+      uscgemm(learning_rate,
           gradient_output,
           predicted_embeddings.leftCols(gradient_output.cols()).transpose(),
           *W); // narrow predicted_embeddings for possible short minibatch
-	    uscgemv(learning_rate,
+      uscgemv(learning_rate,
           gradient_output,
-		      Matrix<double,Dynamic,1>::Ones(gradient_output.cols()),
+          Matrix<double,Dynamic,1>::Ones(gradient_output.cols()),
           b);
       /*
       //IN ORDER TO IMPLEMENT CLIPPING, WE HAVE TO COMPUTE THE GRADIENT
       //FIRST
-	    USCMatrix<double> gradient_output(W->rows(), samples, weights);
-	    uscgemm(1.0,
+      USCMatrix<double> gradient_output(W->rows(), samples, weights);
+      uscgemm(1.0,
           gradient_output,
           predicted_embeddings.leftCols(samples.cols()).transpose(),
           W_gradient);
-	    uscgemv(1.0, 
+      uscgemv(1.0,
           gradient_output,
-		      Matrix<double,Dynamic,1>::Ones(weights.cols()),
+          Matrix<double,Dynamic,1>::Ones(weights.cols()),
           b_gradient);
 
       int_map update_map; //stores all the parameters that have been updated
       for (int sample_id=0; sample_id<samples.rows(); sample_id++)
-	        for (int train_id=0; train_id<samples.cols(); train_id++)
-		          update_map[samples(sample_id, train_id)] = 1;
+          for (int train_id=0; train_id<samples.cols(); train_id++)
+              update_map[samples(sample_id, train_id)] = 1;
 
-	    // Convert to std::vector for parallelization
+      // Convert to std::vector for parallelization
         std::vector<int> update_items;
         for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
             update_items.push_back(it->first);
@@ -560,33 +559,33 @@ class Output_word_embeddings
         }
         */
       //cerr<<"Finished gradient"<<endl;
-	}
+  }
 
-	template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV>
+  template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV>
         void computeGradientAdagrad(const MatrixBase<DerivedIn> &predicted_embeddings,
-				    const MatrixBase<DerivedGOutI> &samples,
-				    const MatrixBase<DerivedGOutV> &weights,
-				    double learning_rate) //not sure if we want to use momentum here
+            const MatrixBase<DerivedGOutI> &samples,
+            const MatrixBase<DerivedGOutV> &weights,
+            double learning_rate) //not sure if we want to use momentum here
         {
-	    //W_gradient.setZero(W->rows(), W->cols());
-	    //b_gradient.setZero(b.size());
+      //W_gradient.setZero(W->rows(), W->cols());
+      //b_gradient.setZero(b.size());
       //FOR CLIPPING, WE DO NOT MULTIPLY THE GRADIENT WITH THE LEARNING RATE
-	    USCMatrix<double> gradient_output(W->rows(), samples, weights);
-	    uscgemm(1.0,
+      USCMatrix<double> gradient_output(W->rows(), samples, weights);
+      uscgemm(1.0,
           gradient_output,
           predicted_embeddings.leftCols(samples.cols()).transpose(),
           W_gradient);
-	    uscgemv(1.0, 
+      uscgemv(1.0,
           gradient_output,
-		      Matrix<double,Dynamic,1>::Ones(weights.cols()),
+          Matrix<double,Dynamic,1>::Ones(weights.cols()),
           b_gradient);
 
       int_map update_map; //stores all the parameters that have been updated
       for (int sample_id=0; sample_id<samples.rows(); sample_id++)
-	        for (int train_id=0; train_id<samples.cols(); train_id++)
-		          update_map[samples(sample_id, train_id)] = 1;
+          for (int train_id=0; train_id<samples.cols(); train_id++)
+              update_map[samples(sample_id, train_id)] = 1;
 
-	    // Convert to std::vector for parallelization
+      // Convert to std::vector for parallelization
         std::vector<int> update_items;
         for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
             update_items.push_back(it->first);
@@ -611,34 +610,34 @@ class Output_word_embeddings
         }
       }
 
-	template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV>
+  template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV>
         void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings,
-				    const MatrixBase<DerivedGOutI> &samples,
-				    const MatrixBase<DerivedGOutV> &weights,
-				    double learning_rate,
+            const MatrixBase<DerivedGOutI> &samples,
+            const MatrixBase<DerivedGOutV> &weights,
+            double learning_rate,
             double conditioning_constant,
             double decay) //not sure if we want to use momentum here
         {
           //cerr<<"decay is "<<decay<<" and constant is "<<conditioning_constant<<endl;
-	    //W_gradient.setZero(W->rows(), W->cols());
-	    //b_gradient.setZero(b.size());
+      //W_gradient.setZero(W->rows(), W->cols());
+      //b_gradient.setZero(b.size());
 
-	    USCMatrix<double> gradient_output(W->rows(), samples, weights);
-	    uscgemm(1.0,
+      USCMatrix<double> gradient_output(W->rows(), samples, weights);
+      uscgemm(1.0,
           gradient_output,
           predicted_embeddings.leftCols(samples.cols()).transpose(),
           W_gradient);
-	    uscgemv(1.0, 
+      uscgemv(1.0,
           gradient_output,
-		      Matrix<double,Dynamic,1>::Ones(weights.cols()),
+          Matrix<double,Dynamic,1>::Ones(weights.cols()),
           b_gradient);
 
       int_map update_map; //stores all the parameters that have been updated
       for (int sample_id=0; sample_id<samples.rows(); sample_id++)
-	        for (int train_id=0; train_id<samples.cols(); train_id++)
-		          update_map[samples(sample_id, train_id)] = 1;
+          for (int train_id=0; train_id<samples.cols(); train_id++)
+              update_map[samples(sample_id, train_id)] = 1;
 
-	    // Convert to std::vector for parallelization
+      // Convert to std::vector for parallelization
         std::vector<int> update_items;
         for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
             update_items.push_back(it->first);
@@ -685,24 +684,24 @@ class Output_word_embeddings
       }
 
 
-	template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV, typename DerivedGW, typename DerivedGb>
+  template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV, typename DerivedGW, typename DerivedGb>
     void computeGradientCheck(const MatrixBase<DerivedIn> &predicted_embeddings,
       const MatrixBase<DerivedGOutI> &samples,
       const MatrixBase<DerivedGOutV> &weights,
       const MatrixBase<DerivedGW> &gradient_W,
       const MatrixBase<DerivedGb> &gradient_b) const
   {
-	    UNCONST(DerivedGW, gradient_W, my_gradient_W);
-	    UNCONST(DerivedGb, gradient_b, my_gradient_b);
-	    my_gradient_W.setZero();
-	    my_gradient_b.setZero();
-	    USCMatrix<double> gradient_output(W->rows(), samples, weights);
-	    uscgemm(1.0,
+      UNCONST(DerivedGW, gradient_W, my_gradient_W);
+      UNCONST(DerivedGb, gradient_b, my_gradient_b);
+      my_gradient_W.setZero();
+      my_gradient_b.setZero();
+      USCMatrix<double> gradient_output(W->rows(), samples, weights);
+      uscgemm(1.0,
           gradient_output,
           predicted_embeddings.leftCols(samples.cols()).transpose(),
           my_gradient_W);
-	    uscgemv(1.0, gradient_output,
-		    Matrix<double,Dynamic,1>::Ones(weights.cols()), my_gradient_b);
+      uscgemv(1.0, gradient_output,
+        Matrix<double,Dynamic,1>::Ones(weights.cols()), my_gradient_b);
   }
 };
 
@@ -715,12 +714,12 @@ class Input_word_embeddings
         Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update;
         Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient;
 
-	friend class model;
+  friend class model;
 
     public:
         Input_word_embeddings() : context_size(0), vocab_size(0) { }
         Input_word_embeddings(int rows, int cols, int context) { resize(rows, cols, context); }
- 
+
       void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) {
         W = input_W;
       }
@@ -747,7 +746,7 @@ class Input_word_embeddings
           if (parameter_update == "ADA") {
             W_running_gradient =  Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon;
             //W_gradient.setZero(W->rows(),W->cols());
-          } 
+          }
         if (parameter_update == "ADAD") {
           W_running_gradient.setZero(W->rows(),W->cols());
           //W_gradient.setZero(W->rows(),W->cols());
@@ -759,59 +758,59 @@ class Input_word_embeddings
             init_range);
       }
 
-	int n_inputs() const { return -1; }
-	int n_outputs() const { return W->cols() * context_size; }
+  int n_inputs() const { return -1; }
+  int n_outputs() const { return W->cols() * context_size; }
 
-	// set output_id's embedding to the weighted average of all embeddings
-	template <typename Dist>
-	void average(const Dist &dist, int output_id)
-	{
-	    W->row(output_id).setZero();
-	    for (int i=0; i < W->rows(); i++)
-	        if (i != output_id)
-		    W->row(output_id) += dist.prob(i) * W->row(i);
-	}
+  // set output_id's embedding to the weighted average of all embeddings
+  template <typename Dist>
+  void average(const Dist &dist, int output_id)
+  {
+      W->row(output_id).setZero();
+      for (int i=0; i < W->rows(); i++)
+          if (i != output_id)
+        W->row(output_id) += dist.prob(i) * W->row(i);
+  }
 
-	template <typename DerivedIn, typename DerivedOut>
+  template <typename DerivedIn, typename DerivedOut>
         void fProp(const MatrixBase<DerivedIn> &input,
-		   const MatrixBase<DerivedOut> &output) const
+       const MatrixBase<DerivedOut> &output) const
         {
             int embedding_dimension = W->cols();
 
-	    // W      is vocab_size                        x embedding_dimension
-	    // input  is ngram_size*vocab_size             x minibatch_size
-	    // output is ngram_size*embedding_dimension x minibatch_size
-
-	    /* 
-	    // Dense version:
-	    for (int ngram=0; ngram<context_size; ngram++)
-	        output.middleRows(ngram*embedding_dimension, embedding_dimension) = W.transpose() * input.middleRows(ngram*vocab_size, vocab_size);
-	    */
-
-	    UNCONST(DerivedOut, output, my_output);
-	    my_output.setZero();
-	    for (int ngram=0; ngram<context_size; ngram++)
-	    {
-	        // input might be narrower than expected due to a short minibatch,
-	        // so narrow output to match
-	        uscgemm(1.0,
-            W->transpose(), 
+      // W      is vocab_size                        x embedding_dimension
+      // input  is ngram_size*vocab_size             x minibatch_size
+      // output is ngram_size*embedding_dimension x minibatch_size
+
+      /*
+      // Dense version:
+      for (int ngram=0; ngram<context_size; ngram++)
+          output.middleRows(ngram*embedding_dimension, embedding_dimension) = W.transpose() * input.middleRows(ngram*vocab_size, vocab_size);
+      */
+
+      UNCONST(DerivedOut, output, my_output);
+      my_output.setZero();
+      for (int ngram=0; ngram<context_size; ngram++)
+      {
+          // input might be narrower than expected due to a short minibatch,
+          // so narrow output to match
+          uscgemm(1.0,
+            W->transpose(),
             USCMatrix<double>(W->rows(),input.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input.cols())),
             my_output.block(ngram*embedding_dimension, 0, embedding_dimension, input.cols()));
-	    }
+      }
         }
 
-	// When model is premultiplied, this layer doesn't get used,
-	// but this method is used to get the input into a sparse matrix.
-	// Hopefully this can get eliminated someday
-	template <typename DerivedIn, typename ScalarOut>
-	void munge(const MatrixBase<DerivedIn> &input, USCMatrix<ScalarOut> &output) const
-	{
-	  output.resize(vocab_size*context_size, context_size, input.cols());
-	  for (int i=0; i < context_size; i++)
-	    output.indexes.row(i).array() = input.row(i).array() + i*vocab_size;
-	  output.values.fill(1.0);
-	}
+  // When model is premultiplied, this layer doesn't get used,
+  // but this method is used to get the input into a sparse matrix.
+  // Hopefully this can get eliminated someday
+  template <typename DerivedIn, typename ScalarOut>
+  void munge(const MatrixBase<DerivedIn> &input, USCMatrix<ScalarOut> &output) const
+  {
+    output.resize(vocab_size*context_size, context_size, input.cols());
+    for (int i=0; i < context_size; i++)
+      output.indexes.row(i).array() = input.row(i).array() + i*vocab_size;
+    output.values.fill(1.0);
+  }
 
   template <typename DerivedGOut, typename DerivedIn>
   void computeGradient(const MatrixBase<DerivedGOut> &bProp_input,
@@ -820,45 +819,45 @@ class Input_word_embeddings
   {
       int embedding_dimension = W->cols();
 
-	    // W           is vocab_size                        x embedding_dimension
-	    // input       is ngram_size*vocab_size             x minibatch_size
-	    // bProp_input is ngram_size*embedding_dimension x minibatch_size
+      // W           is vocab_size                        x embedding_dimension
+      // input       is ngram_size*vocab_size             x minibatch_size
+      // bProp_input is ngram_size*embedding_dimension x minibatch_size
 
-	    /*
-	    // Dense version:
-	    for (int ngram=0; ngram<context_size; ngram++)
-	        W += learning_rate * input_words.middleRows(ngram*vocab_size, vocab_size) * bProp_input.middleRows(ngram*embedding_dimension, embedding_dimension).transpose()
-	    */
+      /*
+      // Dense version:
+      for (int ngram=0; ngram<context_size; ngram++)
+          W += learning_rate * input_words.middleRows(ngram*vocab_size, vocab_size) * bProp_input.middleRows(ngram*embedding_dimension, embedding_dimension).transpose()
+      */
 
-	    for (int ngram=0; ngram<context_size; ngram++)
-	    {
-	        uscgemm(learning_rate, 
-			USCMatrix<double>(W->rows(), input_words.middleRows(ngram, 1), Matrix<double,1,Dynamic>::Ones(input_words.cols())),
-			bProp_input.block(ngram*embedding_dimension,0,embedding_dimension,input_words.cols()).transpose(),
-      	  	*W);
-	    }
+      for (int ngram=0; ngram<context_size; ngram++)
+      {
+          uscgemm(learning_rate,
+      USCMatrix<double>(W->rows(), input_words.middleRows(ngram, 1), Matrix<double,1,Dynamic>::Ones(input_words.cols())),
+      bProp_input.block(ngram*embedding_dimension,0,embedding_dimension,input_words.cols()).transpose(),
+            *W);
+      }
 
       /*
       //IF WE WANT TO DO GRADIENT CLIPPING, THEN WE FIRST COMPUTE THE GRADIENT AND THEN
       //PERFORM CLIPPING WHILE UPDATING
 
-	    for (int ngram=0; ngram<context_size; ngram++)
-	    {
-	      uscgemm(1.0, 
+      for (int ngram=0; ngram<context_size; ngram++)
+      {
+        uscgemm(1.0,
           USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
           bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
           W_gradient);
-	    }
+      }
       int_map update_map; //stores all the parameters that have been updated
-	    for (int ngram=0; ngram<context_size; ngram++)
-	    {
+      for (int ngram=0; ngram<context_size; ngram++)
+      {
         for (int train_id=0; train_id<input_words.cols(); train_id++)
         {
           update_map[input_words(ngram,train_id)] = 1;
         }
       }
 
-	    // Convert to std::vector for parallelization
+      // Convert to std::vector for parallelization
         std::vector<int> update_items;
         for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
         {
@@ -884,33 +883,33 @@ class Input_word_embeddings
 
     template <typename DerivedGOut, typename DerivedIn>
     void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input,
-				    const MatrixBase<DerivedIn> &input_words,
-				    double learning_rate,
+            const MatrixBase<DerivedIn> &input_words,
+            double learning_rate,
             double L2_reg)
     {
             int embedding_dimension = W->cols();
-	    //W_gradient.setZero(W->rows(), W->cols());
+      //W_gradient.setZero(W->rows(), W->cols());
       /*
       if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols())
         W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon;
       */
-	    for (int ngram=0; ngram<context_size; ngram++)
-	    {
-	        uscgemm(1.0, 
-			USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
-			bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
+      for (int ngram=0; ngram<context_size; ngram++)
+      {
+          uscgemm(1.0,
+      USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
+      bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
       W_gradient);
-	    }
+      }
       int_map update_map; //stores all the parameters that have been updated
-	    for (int ngram=0; ngram<context_size; ngram++)
-	    {
+      for (int ngram=0; ngram<context_size; ngram++)
+      {
         for (int train_id=0; train_id<input_words.cols(); train_id++)
         {
           update_map[input_words(ngram,train_id)] = 1;
         }
       }
 
-	    // Convert to std::vector for parallelization
+      // Convert to std::vector for parallelization
         std::vector<int> update_items;
         for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
         {
@@ -923,11 +922,11 @@ class Input_word_embeddings
         {
             int update_item = update_items[item_id];
             W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix();
-            W->row(update_item) += learning_rate * 
+            W->row(update_item) += learning_rate *
               (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix();
             /*
             //UPDATE CLIPPING
-            W->row(update_item) += (learning_rate * 
+            W->row(update_item) += (learning_rate *
               (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()))
                       .unaryExpr(Clipper()).matrix();
             */
@@ -937,36 +936,36 @@ class Input_word_embeddings
 
     template <typename DerivedGOut, typename DerivedIn>
     void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input,
-				    const MatrixBase<DerivedIn> &input_words,
-				    double learning_rate,
+            const MatrixBase<DerivedIn> &input_words,
+            double learning_rate,
             double L2_reg,
             double conditioning_constant,
             double decay)
     {
       int embedding_dimension = W->cols();
 
-	    //W_gradient.setZero(W->rows(), W->cols());
+      //W_gradient.setZero(W->rows(), W->cols());
       /*
       if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols())
         W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon;
       */
-	    for (int ngram=0; ngram<context_size; ngram++)
-	    {
-	        uscgemm(1.0, 
-			USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
-			bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
+      for (int ngram=0; ngram<context_size; ngram++)
+      {
+          uscgemm(1.0,
+      USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
+      bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
       W_gradient);
-	    }
+      }
       int_map update_map; //stores all the parameters that have been updated
-	    for (int ngram=0; ngram<context_size; ngram++)
-	    {
+      for (int ngram=0; ngram<context_size; ngram++)
+      {
         for (int train_id=0; train_id<input_words.cols(); train_id++)
         {
           update_map[input_words(ngram,train_id)] = 1;
         }
       }
 
-	    // Convert to std::vector for parallelization
+      // Convert to std::vector for parallelization
         std::vector<int> update_items;
         for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
         {
@@ -1006,16 +1005,15 @@ class Input_word_embeddings
       int x, int minibatch_size,
       const MatrixBase<DerivedGW> &gradient) const //not sure if we want to use momentum here
     {
-	    UNCONST(DerivedGW, gradient, my_gradient);
+      UNCONST(DerivedGW, gradient, my_gradient);
             int embedding_dimension = W->cols();
-	    my_gradient.setZero();
-	    for (int ngram=0; ngram<context_size; ngram++)
-	    uscgemm(1.0, 
-			  USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
-			  bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
+      my_gradient.setZero();
+      for (int ngram=0; ngram<context_size; ngram++)
+      uscgemm(1.0,
+        USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
+        bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
         my_gradient);
     }
 };
 
 } // namespace nplm
-
diff --git a/src/neuralLM.h b/src/neuralLM.h
index 2004596..c18485f 100644
--- a/src/neuralLM.h
+++ b/src/neuralLM.h
@@ -6,7 +6,6 @@
 #include <cstdlib>
 #include <boost/shared_ptr.hpp>
 
-//#include <../3rdparty/Eigen/Dense>
 #include <Eigen/Dense>
 
 #include "util.h"
@@ -16,109 +15,109 @@
 /*
   To do:
   - move digit mapping into vocabulary.h
- */
+*/
 
 namespace nplm
 {
 
 class neuralLM : public neuralNetwork
 {
-    char map_digits;
-    boost::shared_ptr<vocabulary> vocab;
-    int start, null;
+  char map_digits;
+  boost::shared_ptr<vocabulary> vocab;
+  int start, null;
 
-public:
-    neuralLM() 
+ public:
+  neuralLM()
       : neuralNetwork(),
         vocab(new vocabulary()),
-	map_digits(0)
-    { 
-    }
+        map_digits(0)
+  {
+  }
 
-    void set_map_digits(char value) { map_digits = value; }
+  void set_map_digits(char value) { map_digits = value; }
 
-    void set_vocabulary(const vocabulary &vocab)
-    {
-        *(this->vocab) = vocab;
-        start = vocab.lookup_word("<s>");
-        null = vocab.lookup_word("<null>");
-    }
+  void set_vocabulary(const vocabulary &vocab)
+  {
+    *(this->vocab) = vocab;
+    start = vocab.lookup_word("<s>");
+    null = vocab.lookup_word("<null>");
+  }
 
-    const vocabulary &get_vocabulary() const { return *(this->vocab); }
+  const vocabulary &get_vocabulary() const { return *(this->vocab); }
 
-    int lookup_word(const std::string &word) const
-    {
-        if (map_digits)
-	    for (int i=0; i<word.length(); i++)
-	        if (isdigit(word[i]))
-		{
-		    std::string mapped_word(word);
-		    for (; i<word.length(); i++)
-		        if (isdigit(word[i]))
-			    mapped_word[i] = map_digits;
-		    return vocab->lookup_word(mapped_word);
-		}
-        return vocab->lookup_word(word);
-    }
+  int lookup_word(const std::string &word) const
+  {
+    if (map_digits)
+      for (int i=0; i<word.length(); i++)
+        if (isdigit(word[i]))
+        {
+          std::string mapped_word(word);
+          for (; i<word.length(); i++)
+            if (isdigit(word[i]))
+              mapped_word[i] = map_digits;
+          return vocab->lookup_word(mapped_word);
+        }
+    return vocab->lookup_word(word);
+  }
 
-    double lookup_ngram(const int *ngram_a, int n)
+  double lookup_ngram(const int *ngram_a, int n)
+  {
+    Eigen::Matrix<int,Eigen::Dynamic,1> ngram(m->ngram_size);
+    for (int i=0; i<m->ngram_size; i++)
     {
-        Eigen::Matrix<int,Eigen::Dynamic,1> ngram(m->ngram_size);
-	for (int i=0; i<m->ngram_size; i++)
-	{
-	    if (i-m->ngram_size+n < 0)
-	    {
-		if (ngram_a[0] == start)
-		    ngram(i) = start;
-		else
-		    ngram(i) = null;
-	    }
-	    else
-	    {
-	        ngram(i) = ngram_a[i-m->ngram_size+n];
-	    }
-	}
-	return neuralNetwork::lookup_ngram(ngram);
+      if (i-m->ngram_size+n < 0)
+      {
+        if (ngram_a[0] == start)
+          ngram(i) = start;
+        else
+          ngram(i) = null;
+      }
+      else
+      {
+        ngram(i) = ngram_a[i-m->ngram_size+n];
+      }
     }
+    return neuralNetwork::lookup_ngram(ngram);
+  }
 
-    double lookup_ngram(const std::vector<int> &ngram_v)
-    {
-        return lookup_ngram(ngram_v.data(), ngram_v.size());
-    }
+  double lookup_ngram(const std::vector<int> &ngram_v)
+  {
+    return lookup_ngram(ngram_v.data(), ngram_v.size());
+  }
 
-    template <typename Derived>
-    double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram)
-    {
-        return neuralNetwork::lookup_ngram(ngram);
-    }
-    
-    template <typename DerivedA, typename DerivedB>
-    void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const)
-    {
-        return neuralNetwork::lookup_ngram(ngram, log_probs_const);
-    }
+  template <typename Derived>
+  double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram)
+  {
+    return neuralNetwork::lookup_ngram(ngram);
+  }
 
-    void read(const std::string &filename)
-    {
-        std::vector<std::string> words;
-        m->read(filename, words);
-        set_vocabulary(vocabulary(words));
-        resize();
-	// this is faster but takes more memory
-        //m->premultiply();
-    }
+  template <typename DerivedA, typename DerivedB>
+  void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const)
+  {
+    return neuralNetwork::lookup_ngram(ngram, log_probs_const);
+  }
+
+  void read(const std::string &filename)
+  {
+    std::vector<std::string> words;
+    m->read(filename, words);
+    set_vocabulary(vocabulary(words));
+    resize();
+    // this is faster but takes more memory
+    //m->premultiply();
+  }
 
 };
 
 template <typename T>
 void addStartStop(std::vector<T> &input, std::vector<T> &output, int ngram_size, const T &start, const T &stop)
 {
-    output.clear();
-    output.resize(input.size()+ngram_size);
-    for (int i=0; i<ngram_size-1; i++)
-        output[i] = start;
-    std::copy(input.begin(), input.end(), output.begin()+ngram_size-1);
-    output[output.size()-1] = stop;
+  output.clear();
+  output.resize(input.size()+ngram_size);
+  for (int i=0; i<ngram_size-1; i++)
+    output[i] = start;
+  std::copy(input.begin(), input.end(), output.begin()+ngram_size-1);
+  output[output.size()-1] = stop;
 }
 
 template <typename T>
@@ -127,21 +126,21 @@ void makeNgrams(const std::vector<T> &input, std::vector<std::vector<T> > &outpu
   output.clear();
   for (int j=ngram_size-1; j<input.size(); j++)
   {
-      std::vector<T> ngram(input.begin() + (j-ngram_size+1), input.begin() + j+1);
-      output.push_back(ngram);
+    std::vector<T> ngram(input.begin() + (j-ngram_size+1), input.begin() + j+1);
+    output.push_back(ngram);
   }
 }
 
-inline void preprocessWords(const std::vector<std::string> &words, 
-    std::vector< std::vector<int> > &ngrams,
-	  int ngram_size, 
-    const vocabulary &vocab, 
-	  bool numberize,
-    bool add_start_stop,
-    bool ngramize) {
+inline void preprocessWords(const std::vector<std::string> &words,
+                            std::vector< std::vector<int> > &ngrams,
+                            int ngram_size,
+                            const vocabulary &vocab,
+                            bool numberize,
+                            bool add_start_stop,
+                            bool ngramize) {
   int start = vocab.lookup_word("<s>");
   int stop = vocab.lookup_word("</s>");
-  
+
   // convert words to ints
   std::vector<int> nums;
   if (numberize) {
@@ -152,9 +151,9 @@ inline void preprocessWords(const std::vector<std::string> &words,
   else {
     for (int j=0; j<words.size(); j++) {
       nums.push_back(boost::lexical_cast<int>(words[j]));
-    }            
+    }
   }
-  
+
   // convert sequence to n-grams
   ngrams.clear();
   if (ngramize) {
@@ -168,10 +167,10 @@ inline void preprocessWords(const std::vector<std::string> &words,
   }
   else {
     if (nums.size() != ngram_size)
-      {
-	std::cerr << "error: wrong number of fields in line" << std::endl;
-	std::exit(1);
-      }
+    {
+      std::cerr << "error: wrong number of fields in line" << std::endl;
+      std::exit(1);
+    }
     ngrams.push_back(nums);
   }
 }
diff --git a/src/neuralNetwork.h b/src/neuralNetwork.h
index ef96488..6386a0f 100644
--- a/src/neuralNetwork.h
+++ b/src/neuralNetwork.h
@@ -3,7 +3,6 @@
 
 #include <vector>
 #include <boost/shared_ptr.hpp>
-//#include <../3rdparty/Eigen/Dense>
 #include <Eigen/Dense>
 
 #include "util.h"
@@ -16,191 +15,191 @@ namespace nplm
 
 class neuralNetwork
 {
-protected:
-    boost::shared_ptr<model> m;
+ protected:
+  boost::shared_ptr<model> m;
 
-private:
-    bool normalization;
-    double weight;
+ private:
+  bool normalization;
+  double weight;
 
-    propagator prop;
+  propagator prop;
 
-    std::size_t cache_size;
-    Eigen::Matrix<int,Dynamic,Dynamic> cache_keys;
-    std::vector<double> cache_values;
-    int cache_lookups, cache_hits;
+  std::size_t cache_size;
+  Eigen::Matrix<int,Dynamic,Dynamic> cache_keys;
+  std::vector<double> cache_values;
+  int cache_lookups, cache_hits;
 
-public:
-    neuralNetwork() 
+ public:
+  neuralNetwork()
       : m(new model()),
         normalization(false),
-	weight(1.),
-	prop(*m, 1),
+        weight(1.),
+        prop(*m, 1),
         cache_size(0)
-    { 
-    }
+  {
+  }
 
-    void set_normalization(bool value) { normalization = value; }
-    void set_log_base(double value) { weight = 1./std::log(value); }
-
-    // This must be called if the underlying model is resized.
-    void resize() {
-	if (cache_size)
-	{
-	  cache_keys.resize(m->ngram_size, cache_size);
-	  cache_keys.fill(-1);
-	}
-	prop.resize();
-    }
+  void set_normalization(bool value) { normalization = value; }
+  void set_log_base(double value) { weight = 1./std::log(value); }
 
-    void set_width(int width)
+  // This must be called if the underlying model is resized.
+  void resize() {
+    if (cache_size)
     {
-	prop.resize(width);
+      cache_keys.resize(m->ngram_size, cache_size);
+      cache_keys.fill(-1);
     }
-
-    template <typename Derived>
-    double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram)
+    prop.resize();
+  }
+
+  void set_width(int width)
+  {
+    prop.resize(width);
+  }
+
+  template <typename Derived>
+  double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram)
+  {
+    assert (ngram.rows() == m->ngram_size);
+    assert (ngram.cols() == 1);
+
+    std::size_t hash;
+    if (cache_size)
     {
-	assert (ngram.rows() == m->ngram_size);
-	assert (ngram.cols() == 1);
-
-	std::size_t hash;
-	if (cache_size)
-	{
-	    // First look in cache
-	    hash = Eigen::hash_value(ngram) % cache_size; // defined in util.h
-	    cache_lookups++;
-	    if (cache_keys.col(hash) == ngram)
-	    {
-	        cache_hits++;
-		return cache_values[hash];
-	    }
-	}
-
-	// Make sure that we're single threaded. Multithreading doesn't help,
-	// and in some cases can hurt quite a lot
-	int save_threads = omp_get_max_threads();
-	omp_set_num_threads(1);
-	int save_eigen_threads = Eigen::nbThreads();
-	Eigen::setNbThreads(1);
-	#ifdef __INTEL_MKL__
-	int save_mkl_threads = mkl_get_max_threads();
-	mkl_set_num_threads(1);
-	#endif
-
-        prop.fProp(ngram.col(0));
-
-	int output = ngram(m->ngram_size-1, 0);
-	double log_prob;
-
-	start_timer(3);
-	if (normalization)
-	{
-	    Eigen::Matrix<double,Eigen::Dynamic,1> scores(m->output_vocab_size);
-            if (prop.skip_hidden)
-                prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores);
-            else
-                prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
-	    double logz = logsum(scores.col(0));
-	    log_prob = weight * (scores(output, 0) - logz);
-	}
-	else
-	{
-            if (prop.skip_hidden)
-                log_prob = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, 0);
-            else
-                log_prob = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, 0);
-	}
-	stop_timer(3);
-
-	if (cache_size)
-	{
-	    // Update cache
-	    cache_keys.col(hash) = ngram;
-	    cache_values[hash] = log_prob;
-	}
-
-	#ifdef __INTEL_MKL__
-	mkl_set_num_threads(save_mkl_threads);
-	#endif
-	Eigen::setNbThreads(save_eigen_threads);
-	omp_set_num_threads(save_threads);
-
-	return log_prob;
+      // First look in cache
+      hash = Eigen::hash_value(ngram) % cache_size; // defined in util.h
+      cache_lookups++;
+      if (cache_keys.col(hash) == ngram)
+      {
+        cache_hits++;
+        return cache_values[hash];
+      }
     }
 
-    // Look up many n-grams in parallel.
-    template <typename DerivedA, typename DerivedB>
-    void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const)
-    {
-        UNCONST(DerivedB, log_probs_const, log_probs);
-	assert (ngram.rows() == m->ngram_size);
-	//assert (ngram.cols() <= prop.get_minibatch_size());
-
-        prop.fProp(ngram);
-
-	if (normalization)
-	{
-	    Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> scores(m->output_vocab_size, ngram.cols());
-            if (prop.skip_hidden)
-                prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores);
-            else
-                prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
-
-	    // And softmax and loss
-	    Matrix<double,Dynamic,Dynamic> output_probs(m->output_vocab_size, ngram.cols());
-	    double minibatch_log_likelihood;
-	    SoftmaxLogLoss().fProp(scores.leftCols(ngram.cols()), ngram.row(m->ngram_size-1), output_probs, minibatch_log_likelihood);
-	    for (int j=0; j<ngram.cols(); j++)
-	    {
-	        int output = ngram(m->ngram_size-1, j);
-		log_probs(0, j) = weight * output_probs(output, j);
-	    }
-	}
-	else
-	{
-	    for (int j=0; j<ngram.cols(); j++)
-	    {
-	        int output = ngram(m->ngram_size-1, j);
-                if (prop.skip_hidden)
-                    log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, j);
-                else
-                    log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, j);
-	    }
-	}
-    }
+    // Make sure that we're single threaded. Multithreading doesn't help,
+    // and in some cases can hurt quite a lot
+    int save_threads = omp_get_max_threads();
+    omp_set_num_threads(1);
+    int save_eigen_threads = Eigen::nbThreads();
+    Eigen::setNbThreads(1);
+#ifdef __INTEL_MKL__
+    int save_mkl_threads = mkl_get_max_threads();
+    mkl_set_num_threads(1);
+#endif
+
+    prop.fProp(ngram.col(0));
 
-    int get_order() const { return m->ngram_size; }
+    int output = ngram(m->ngram_size-1, 0);
+    double log_prob;
 
-    void read(const std::string &filename)
+    start_timer(3);
+    if (normalization)
     {
-        m->read(filename);
-        resize();
-	// this is faster but takes more memory
-        //m->premultiply();
+      Eigen::Matrix<double,Eigen::Dynamic,1> scores(m->output_vocab_size);
+      if (prop.skip_hidden)
+        prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores);
+      else
+        prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
+      double logz = logsum(scores.col(0));
+      log_prob = weight * (scores(output, 0) - logz);
     }
-
-    void set_cache(std::size_t cache_size)
+    else
     {
-        this->cache_size = cache_size;
-	cache_keys.resize(m->ngram_size, cache_size);
-	cache_keys.fill(-1); // clears cache
-	cache_values.resize(cache_size);
-	cache_lookups = cache_hits = 0;
+      if (prop.skip_hidden)
+        log_prob = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, 0);
+      else
+        log_prob = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, 0);
     }
+    stop_timer(3);
 
-    double cache_hit_rate()
+    if (cache_size)
     {
-        return static_cast<double>(cache_hits)/cache_lookups;
+      // Update cache
+      cache_keys.col(hash) = ngram;
+      cache_values[hash] = log_prob;
     }
 
-    void premultiply()
+#ifdef __INTEL_MKL__
+    mkl_set_num_threads(save_mkl_threads);
+#endif
+    Eigen::setNbThreads(save_eigen_threads);
+    omp_set_num_threads(save_threads);
+
+    return log_prob;
+  }
+
+  // Look up many n-grams in parallel.
+  template <typename DerivedA, typename DerivedB>
+  void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const)
+  {
+    UNCONST(DerivedB, log_probs_const, log_probs);
+    assert (ngram.rows() == m->ngram_size);
+    //assert (ngram.cols() <= prop.get_minibatch_size());
+
+    prop.fProp(ngram);
+
+    if (normalization)
+    {
+      Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> scores(m->output_vocab_size, ngram.cols());
+      if (prop.skip_hidden)
+        prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores);
+      else
+        prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
+
+      // And softmax and loss
+      Matrix<double,Dynamic,Dynamic> output_probs(m->output_vocab_size, ngram.cols());
+      double minibatch_log_likelihood;
+      SoftmaxLogLoss().fProp(scores.leftCols(ngram.cols()), ngram.row(m->ngram_size-1), output_probs, minibatch_log_likelihood);
+      for (int j=0; j<ngram.cols(); j++)
+      {
+        int output = ngram(m->ngram_size-1, j);
+        log_probs(0, j) = weight * output_probs(output, j);
+      }
+    }
+    else
+    {
+      for (int j=0; j<ngram.cols(); j++)
+      {
+        int output = ngram(m->ngram_size-1, j);
+        if (prop.skip_hidden)
+          log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, j);
+        else
+          log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, j);
+      }
+    }
+  }
+
+  int get_order() const { return m->ngram_size; }
+
+  void read(const std::string &filename)
+  {
+    m->read(filename);
+    resize();
+    // this is faster but takes more memory
+    //m->premultiply();
+  }
+
+  void set_cache(std::size_t cache_size)
+  {
+    this->cache_size = cache_size;
+    cache_keys.resize(m->ngram_size, cache_size);
+    cache_keys.fill(-1); // clears cache
+    cache_values.resize(cache_size);
+    cache_lookups = cache_hits = 0;
+  }
+
+  double cache_hit_rate()
+  {
+    return static_cast<double>(cache_hits)/cache_lookups;
+  }
+
+  void premultiply()
+  {
+    if (!m->premultiplied)
     {
-        if (!m->premultiplied)
-        {
-            m->premultiply();
-        }
+      m->premultiply();
     }
+  }
 
 };
 
diff --git a/src/prepareNeuralLM.cpp b/src/prepareNeuralLM.cpp
index a2cac7a..d5fc16b 100644
--- a/src/prepareNeuralLM.cpp
+++ b/src/prepareNeuralLM.cpp
@@ -2,19 +2,19 @@
 #include <vector>
 #include <queue>
 #include <deque>
-# include <fstream>
-# include <iterator>
-
-# include <boost/unordered_map.hpp>
-# include <boost/algorithm/string/join.hpp>
-# include <boost/interprocess/managed_shared_memory.hpp>
-# include <boost/interprocess/allocators/allocator.hpp>
-# include <boost/interprocess/managed_mapped_file.hpp>
+#include <fstream>
+#include <iterator>
+
+#include <boost/unordered_map.hpp>
+#include <boost/algorithm/string/join.hpp>
+#include <boost/interprocess/managed_shared_memory.hpp>
+#include <boost/interprocess/allocators/allocator.hpp>
+#include <boost/interprocess/managed_mapped_file.hpp>
 #include <boost/interprocess/containers/vector.hpp>
 #include <boost/random/mersenne_twister.hpp>
 #include <boost/random/uniform_int_distribution.hpp>
 
-# include <tclap/CmdLine.h>
+#include <tclap/CmdLine.h>
 
 #include "neuralLM.h"
 #include "util.h"
@@ -36,314 +36,313 @@ typedef std::vector<vec,vecAllocator> vecvec;
 typedef long long int data_size_t; // training data can easily exceed 2G instances
 
 template<typename T>
-void writeNgrams(const T &data, 
-		 int ngram_size,
-     const vocabulary &vocab, 
-		 bool numberize,
-     bool add_start_stop,
-     bool ngramize, 
-		 const string &filename)
-	{
-    ofstream file(filename.c_str());
-    if (!file)
+void writeNgrams(const T &data,
+                 int ngram_size,
+                 const vocabulary &vocab,
+                 bool numberize,
+                 bool add_start_stop,
+                 bool ngramize,
+                 const string &filename)
+{
+  ofstream file(filename.c_str());
+  if (!file)
+  {
+    cerr << "error: could not open " << filename << endl;
+    exit(1);
+  }
+
+  vector<vector<int> > ngrams;
+
+  for (int i=0; i<data.size(); i++) {
+    preprocessWords(data[i], ngrams, ngram_size, vocab, numberize, add_start_stop, ngramize);
+    // write out n-grams
+    for (int j=0; j<ngrams.size(); j++)
     {
-	cerr << "error: could not open " << filename << endl;
-	exit(1);
-    }
-
-    vector<vector<int> > ngrams;
-
-    for (int i=0; i<data.size(); i++) {
-        preprocessWords(data[i], ngrams, ngram_size, vocab, numberize, add_start_stop, ngramize);
-	// write out n-grams
-	for (int j=0; j<ngrams.size(); j++)
-	  {
-	    for (int k=0; k<ngram_size; k++)
-	      {
-	        file << ngrams[j][k] << " ";
-	      }
-	    file << endl;
-	  }
+      for (int k=0; k<ngram_size; k++)
+      {
+        file << ngrams[j][k] << " ";
+      }
+      file << endl;
     }
-    file.close();
+  }
+  file.close();
 }
 
 // Space efficient version for writing the n-grams.
 // They are not read into memory.
-void writeNgrams(const string &input_filename, 
-		 int ngram_size,
-     const vocabulary &vocab, 
-		 bool numberize,
-     bool add_start_stop,
-     bool ngramize, 
-		 const string &filename,
-     int train_data_size,
-		 vector<float> &sent_weights,
-		 const string &sent_weights_filename)
+void writeNgrams(const string &input_filename,
+                 int ngram_size,
+                 const vocabulary &vocab,
+                 bool numberize,
+                 bool add_start_stop,
+                 bool ngramize,
+                 const string &filename,
+                 int train_data_size,
+                 vector<float> &sent_weights,
+                 const string &sent_weights_filename)
 {
-    ofstream file(filename.c_str());
-    ofstream output_sent_weights_file(sent_weights_filename.c_str());
-    if (!file)
-    {
-      cerr << "error: could not open " << filename << endl;
-      exit(1);
+  ofstream file(filename.c_str());
+  ofstream output_sent_weights_file(sent_weights_filename.c_str());
+  if (!file)
+  {
+    cerr << "error: could not open " << filename << endl;
+    exit(1);
+  }
+
+  ifstream input_file(input_filename.c_str());
+  vector<vector<int> > ngrams;
+  //for (int i=0; i<train_data.size(); i++) {
+  string line;
+  int counter = 0;
+  cerr<<"Processed ... ";
+  while (getline(input_file,line) && train_data_size-- > 0) {
+    counter++;
+    if ((counter % 100000) == 0) {
+      cerr<<counter<<" training lines ... ";
     }
-
-    ifstream input_file(input_filename.c_str());
-    vector<vector<int> > ngrams;
-    //for (int i=0; i<train_data.size(); i++) {
-    string line;
-    int counter = 0;
-    cerr<<"Processed ... ";
-    while (getline(input_file,line) && train_data_size-- > 0) {
-            counter++;
-      if ((counter % 100000) == 0) {
-        cerr<<counter<<" training lines ... ";
-      }
-      //stringstream lstr(line);
-      vector<string> lstr_items;
-      splitBySpace(line,lstr_items);
+    //stringstream lstr(line);
+    vector<string> lstr_items;
+    splitBySpace(line,lstr_items);
 
     //for (int i=0; i<data.size(); i++) {
-      preprocessWords(lstr_items,
-          ngrams,
-          ngram_size,
-          vocab,
-          numberize,
-          add_start_stop,
-          ngramize);
-
-	    // write out n-grams
-	    for (int j=0; j<ngrams.size(); j++)
-	    {
-					if (sent_weights.size() != 0) {
-						output_sent_weights_file <<sent_weights[counter-1]<<endl;
-					}	
-	        for (int k=0; k<ngram_size; k++)
-	        {
-	        file << ngrams[j][k] << " ";
-	        }
-	      file << endl;
-	    }
+    preprocessWords(lstr_items,
+                    ngrams,
+                    ngram_size,
+                    vocab,
+                    numberize,
+                    add_start_stop,
+                    ngramize);
+
+    // write out n-grams
+    for (int j=0; j<ngrams.size(); j++)
+    {
+      if (sent_weights.size() != 0) {
+        output_sent_weights_file <<sent_weights[counter-1]<<endl;
+      }
+      for (int k=0; k<ngram_size; k++)
+      {
+        file << ngrams[j][k] << " ";
+      }
+      file << endl;
     }
-    cerr<<endl;
-    input_file.close();
-    file.close();
-    output_sent_weights_file.close();
+  }
+  cerr<<endl;
+  input_file.close();
+  file.close();
+  output_sent_weights_file.close();
 }
 
 // Space efficient version for writing the n-grams.
 // They are not read into memory.
-void writeMmapNgrams(const string &input_filename, 
-		 int ngram_size,
-     const vocabulary &vocab, 
-		 bool numberize,
-     bool add_start_stop,
-     bool ngramize, 
-		 const string &filename,
-     unsigned long train_data_size,
-     data_size_t num_tokens,
-     bool randomize)
+void writeMmapNgrams(const string &input_filename,
+                     int ngram_size,
+                     const vocabulary &vocab,
+                     bool numberize,
+                     bool add_start_stop,
+                     bool ngramize,
+                     const string &filename,
+                     unsigned long train_data_size,
+                     data_size_t num_tokens,
+                     bool randomize)
 {
-    cerr<<"Num tokens is "<<num_tokens<<endl;
-    cerr<<"Training data size is "<<train_data_size<<endl;
-    // Open the memory mapped file and create the allocators
-    ip::managed_mapped_file mfile(ip::create_only,
-        filename.c_str(),
-        num_tokens*ngram_size*sizeof(int)+1024UL*1024UL);
-    intAllocator ialloc(mfile.get_segment_manager());
-    vecAllocator valloc (mfile.get_segment_manager());
-    //vecvec *mMapVecVec= mfile.construct<vecvec>("data")(num_tokens,vec(ialloc),valloc);
-
-    vec *mMapVec= mfile.construct<vec>("vector")(num_tokens*ngram_size,0,ialloc);
-
-    cerr<<"The size of mmaped vec is "<<mMapVec->size()<<endl;
-    // Going over every line in the input file and 
-    // printing the memory mapped ngrams into the 
-    // output file
-    ifstream input_file(input_filename.c_str());
-    //for (int i=0; i<train_data.size(); i++) {
-    string line;
-    int counter = 0;
-    cerr<<"Processed ... ";
-    long int train_ngram_counter = 0;
-    vector<vector<int> > ngrams;
-    while (getline(input_file,line) && train_data_size-- > 0) {
-            counter++;
-      if ((counter % 100000) ==0) {
-        //cerr<<"counter is "<<counter<<endl;
-        cerr<<counter<<" training lines ... ";
-      }
-      //stringstream lstr(line);
-      vector<string> lstr_items;
-      splitBySpace(line,lstr_items);
+  cerr<<"Num tokens is "<<num_tokens<<endl;
+  cerr<<"Training data size is "<<train_data_size<<endl;
+  // Open the memory mapped file and create the allocators
+  ip::managed_mapped_file mfile(ip::create_only,
+                                filename.c_str(),
+                                num_tokens*ngram_size*sizeof(int)+1024UL*1024UL);
+  intAllocator ialloc(mfile.get_segment_manager());
+  vecAllocator valloc (mfile.get_segment_manager());
+  //vecvec *mMapVecVec= mfile.construct<vecvec>("data")(num_tokens,vec(ialloc),valloc);
+
+  vec *mMapVec= mfile.construct<vec>("vector")(num_tokens*ngram_size,0,ialloc);
+
+  cerr<<"The size of mmaped vec is "<<mMapVec->size()<<endl;
+  // Going over every line in the input file and
+  // printing the memory mapped ngrams into the
+  // output file
+  ifstream input_file(input_filename.c_str());
+  //for (int i=0; i<train_data.size(); i++) {
+  string line;
+  int counter = 0;
+  cerr<<"Processed ... ";
+  long int train_ngram_counter = 0;
+  vector<vector<int> > ngrams;
+  while (getline(input_file,line) && train_data_size-- > 0) {
+    counter++;
+    if ((counter % 100000) ==0) {
+      //cerr<<"counter is "<<counter<<endl;
+      cerr<<counter<<" training lines ... ";
+    }
+    //stringstream lstr(line);
+    vector<string> lstr_items;
+    splitBySpace(line,lstr_items);
 
     //for (int i=0; i<data.size(); i++) {
-      preprocessWords(lstr_items, ngrams,
-          ngram_size,
-          vocab,
-          numberize, 
-          add_start_stop,
-          ngramize);
-      /*
+    preprocessWords(lstr_items, ngrams,
+                    ngram_size,
+                    vocab,
+                    numberize,
+                    add_start_stop,
+                    ngramize);
+    /*
       cerr<<"line is "<<endl;
       cerr<<line<<endl;
       cerr<<"Number of ngrams is "<<ngrams.size()<<endl;
-        if (ngrams.size() ==1 ){
-          cerr<<"The line number was "<<counter<<endl;
-          cerr<<line<<endl;
+      if (ngrams.size() ==1 ){
+      cerr<<"The line number was "<<counter<<endl;
+      cerr<<line<<endl;
+      }
+    */
+    // write out n-grams in mmapped file
+    for (int j=0; j<ngrams.size(); j++)
+    {
+      /*
+        for (int k=0; k<ngram_size; k++)
+        {
+        cerr << ngrams[j][k] << " ";
         }
+        cerr<< endl;
       */
-	    // write out n-grams in mmapped file
-	    for (int j=0; j<ngrams.size(); j++)
-	    {
-        /*
-       for (int k=0; k<ngram_size; k++)
-	        {
-	        cerr << ngrams[j][k] << " ";
-	        }
-	      cerr<< endl; 
-        */
-        for (int k=0; k<ngram_size; k++) {
-          mMapVec->at(train_ngram_counter*ngram_size+k) = ngrams[j][k];
-        }
-        train_ngram_counter++;
-        //cerr<<"Train ngram counter is "<<train_ngram_counter<<endl;
-	    }
+      for (int k=0; k<ngram_size; k++) {
+        mMapVec->at(train_ngram_counter*ngram_size+k) = ngrams[j][k];
+      }
+      train_ngram_counter++;
+      //cerr<<"Train ngram counter is "<<train_ngram_counter<<endl;
     }
-    cerr<<endl;
-    input_file.close();
-
-    // Shrink the file if it was overused
-    ip::managed_mapped_file::shrink_to_fit(filename.c_str());
-    //now to randomize the items if the randomize flag was set
-    if (randomize == true) {
-      unsigned seed = 1234; //for testing only
-      boost::random::mt19937 rng(seed);
-       cerr<<"Randomly shuffling data...";
-        data_size_t counter =0;
-        while (counter < num_tokens) {
-          data_size_t upper_limit = counter+5000000;
-          long int vector_size = 5000000;
-          if (counter + 10000000 >= num_tokens) {
-            upper_limit = num_tokens;
-            vector_size = num_tokens - counter;
-          }
-          vector<int> temp(vector_size*ngram_size,0);
-          for (int i=0;i<vector_size;i++){
-           for (int k=0;k<ngram_size;k++) {
-             temp[i*ngram_size+k] = mMapVec->at((i+counter)*ngram_size+k);
-           }
-          }
-          for (data_size_t i=vector_size-1; i>0; i--)
-          {
-            if (i %500000 == 0) {
-              cerr<<"Shuffled "<<num_tokens-1<<" instances...";
-            }
-            data_size_t j = boost::random::uniform_int_distribution<data_size_t>(0, i-1)(rng);
-            for (int k=0;k<ngram_size;k++) {
-              int temp_val = temp.at(i*ngram_size+k);
-              temp.at(i*ngram_size+k) =
-                temp.at(j*ngram_size+k);
-              temp.at(j*ngram_size+k) = temp_val;
-            }
-          }
-          //Putting it back
-          for (int i=0;i<vector_size;i++){
-           for (int k=0;k<ngram_size;k++) {
-             mMapVec->at((i+counter)*ngram_size+k) = temp[i*ngram_size+k];
-           }
-          }
-          counter = upper_limit;
+  }
+  cerr<<endl;
+  input_file.close();
+
+  // Shrink the file if it was overused
+  ip::managed_mapped_file::shrink_to_fit(filename.c_str());
+  //now to randomize the items if the randomize flag was set
+  if (randomize == true) {
+    unsigned seed = 1234; //for testing only
+    boost::random::mt19937 rng(seed);
+    cerr<<"Randomly shuffling data...";
+    data_size_t counter =0;
+    while (counter < num_tokens) {
+      data_size_t upper_limit = counter+5000000;
+      long int vector_size = 5000000;
+      if (counter + 10000000 >= num_tokens) {
+        upper_limit = num_tokens;
+        vector_size = num_tokens - counter;
+      }
+      vector<int> temp(vector_size*ngram_size,0);
+      for (int i=0;i<vector_size;i++){
+        for (int k=0;k<ngram_size;k++) {
+          temp[i*ngram_size+k] = mMapVec->at((i+counter)*ngram_size+k);
         }
-
-      /*
-      for (data_size_t i=num_tokens-1; i>0; i--)
+      }
+      for (data_size_t i=vector_size-1; i>0; i--)
       {
         if (i %500000 == 0) {
           cerr<<"Shuffled "<<num_tokens-1<<" instances...";
         }
         data_size_t j = boost::random::uniform_int_distribution<data_size_t>(0, i-1)(rng);
         for (int k=0;k<ngram_size;k++) {
-          int temp_val = mMapVec->at(i*ngram_size+k);
-          mMapVec->at(i*ngram_size+k) =
-            mMapVec->at(j*ngram_size+k);
-          mMapVec->at(j*ngram_size+k) = temp_val;
+          int temp_val = temp.at(i*ngram_size+k);
+          temp.at(i*ngram_size+k) =
+              temp.at(j*ngram_size+k);
+          temp.at(j*ngram_size+k) = temp_val;
         }
       }
-      */
-    cerr<<endl; 
+      //Putting it back
+      for (int i=0;i<vector_size;i++){
+        for (int k=0;k<ngram_size;k++) {
+          mMapVec->at((i+counter)*ngram_size+k) = temp[i*ngram_size+k];
+        }
+      }
+      counter = upper_limit;
     }
+
+    /*
+      for (data_size_t i=num_tokens-1; i>0; i--)
+      {
+      if (i %500000 == 0) {
+      cerr<<"Shuffled "<<num_tokens-1<<" instances...";
+      }
+      data_size_t j = boost::random::uniform_int_distribution<data_size_t>(0, i-1)(rng);
+      for (int k=0;k<ngram_size;k++) {
+      int temp_val = mMapVec->at(i*ngram_size+k);
+      mMapVec->at(i*ngram_size+k) =
+      mMapVec->at(j*ngram_size+k);
+      mMapVec->at(j*ngram_size+k) = temp_val;
+      }
+      }
+    */
+    cerr<<endl;
+  }
 }
 
 
 int main(int argc, char *argv[])
 {
-    ios::sync_with_stdio(false);
-    int ngram_size, vocab_size, validation_size;
-    bool numberize, 
-         ngramize,
-         add_start_stop,
-         mmap_file,
-         randomize;
-
-    string train_text,
-           train_file,
-           validation_text,
-           validation_file,
-           words_file,
-           write_words_file,
-					 sent_weights_text,
-					 output_sent_weights_text;
-
-    try
-    {
-	CmdLine cmd("Prepares training data for training a language model.", ' ', "0.1");
-
-	// The options are printed in reverse order
+  ios::sync_with_stdio(false);
+  int ngram_size, vocab_size, validation_size;
+  bool numberize,
+      ngramize,
+      add_start_stop,
+      mmap_file,
+      randomize;
+
+  string train_text,
+      train_file,
+      validation_text,
+      validation_file,
+      words_file,
+      write_words_file,
+      sent_weights_text,
+      output_sent_weights_text;
+
+  try
+  {
+    CmdLine cmd("Prepares training data for training a language model.", ' ', "0.1");
+
+    // The options are printed in reverse order
 
     ValueArg<bool> arg_ngramize("", "ngramize", "If true, convert lines to ngrams. Default: true.", false, true, "bool", cmd);
     ValueArg<bool> arg_numberize("", "numberize", "If true, convert words to numbers. Default: true.", false, true, "bool", cmd);
     ValueArg<bool> arg_add_start_stop("", "add_start_stop", "If true, prepend <s> and append </s>. Default: true.", false, true, "bool", cmd);
     ValueArg<bool> arg_mmap_file("", "mmap_file", "If true, the training file will be a memory mapped file. \n This is "
-        "needed if the entire training data cannot fit in memory. Default: false.", false, false, "bool", cmd);
+                                 "needed if the entire training data cannot fit in memory. Default: false.", false, false, "bool", cmd);
 
     ValueArg<bool> arg_randomize("", "randomize", "If true, Randomly shuffle the training ngrams. It can only be used with mmap_file =1 . Default: false.", false, false, "bool", cmd);
 
     ValueArg<int> arg_vocab_size("", "vocab_size", "Vocabulary size.", false, -1, "int", cmd);
     ValueArg<string> arg_words_file("", "words_file", "File specifying words that should be included in vocabulary; all other words will be replaced by <unk>.", false, "", "string", cmd);
     ValueArg<int> arg_ngram_size("", "ngram_size", "Size of n-grams.", true, -1, "int", cmd);
-	ValueArg<string> arg_write_words_file("", "write_words_file", "Output vocabulary.", false, "", "string", cmd);
+    ValueArg<string> arg_write_words_file("", "write_words_file", "Output vocabulary.", false, "", "string", cmd);
     ValueArg<int> arg_validation_size("", "validation_size", "How many lines from training data to hold out for validation. Default: 0.", false, 0, "int", cmd);
-	ValueArg<string> arg_validation_file("", "validation_file", "Output validation data (numberized n-grams).", false, "", "string", cmd);
-	ValueArg<string> arg_validation_text("", "validation_text", "Input validation data (tokenized). Overrides --validation_size. Default: none.", false, "", "string", cmd);
-	ValueArg<string> arg_train_file("", "train_file", "Output training data (numberized n-grams).", false, "", "string", cmd);
-	ValueArg<string> arg_train_text("", "train_text", "Input training data (tokenized).", true, "", "string", cmd);
-	//ValueArg<string> arg_sent_weights_text("", "sent_weights_text", "The sentence weights text", false, "", "string", cmd);
-  //ValueArg<string> arg_sent_weights_file("", "sent_weights_file", "The file to write the per ngram weights", false, "", "string", cmd);
-
-
-
-	cmd.parse(argc, argv);
-
-	train_text = arg_train_text.getValue();
-	train_file = arg_train_file.getValue();
-	validation_text = arg_validation_text.getValue();
-	validation_file = arg_validation_file.getValue();
-	validation_size = arg_validation_size.getValue();
-	write_words_file = arg_write_words_file.getValue();
-	ngram_size = arg_ngram_size.getValue();
-	vocab_size = arg_vocab_size.getValue();
-	words_file = arg_words_file.getValue();
-	numberize = arg_numberize.getValue();
-	ngramize = arg_ngramize.getValue();
-	add_start_stop = arg_add_start_stop.getValue();
-  mmap_file = arg_mmap_file.getValue();
-  randomize = arg_randomize.getValue();
-  //sent_weights_text = arg_sent_weights_text.getValue();
-  //output_sent_weights_text = arg_sent_weights_file.getValue();
-  sent_weights_text = "";
-  output_sent_weights_text = "";
+    ValueArg<string> arg_validation_file("", "validation_file", "Output validation data (numberized n-grams).", false, "", "string", cmd);
+    ValueArg<string> arg_validation_text("", "validation_text", "Input validation data (tokenized). Overrides --validation_size. Default: none.", false, "", "string", cmd);
+    ValueArg<string> arg_train_file("", "train_file", "Output training data (numberized n-grams).", false, "", "string", cmd);
+    ValueArg<string> arg_train_text("", "train_text", "Input training data (tokenized).", true, "", "string", cmd);
+    //ValueArg<string> arg_sent_weights_text("", "sent_weights_text", "The sentence weights text", false, "", "string", cmd);
+    //ValueArg<string> arg_sent_weights_file("", "sent_weights_file", "The file to write the per ngram weights", false, "", "string", cmd);
+
+
+    cmd.parse(argc, argv);
+
+    train_text = arg_train_text.getValue();
+    train_file = arg_train_file.getValue();
+    validation_text = arg_validation_text.getValue();
+    validation_file = arg_validation_file.getValue();
+    validation_size = arg_validation_size.getValue();
+    write_words_file = arg_write_words_file.getValue();
+    ngram_size = arg_ngram_size.getValue();
+    vocab_size = arg_vocab_size.getValue();
+    words_file = arg_words_file.getValue();
+    numberize = arg_numberize.getValue();
+    ngramize = arg_ngramize.getValue();
+    add_start_stop = arg_add_start_stop.getValue();
+    mmap_file = arg_mmap_file.getValue();
+    randomize = arg_randomize.getValue();
+    //sent_weights_text = arg_sent_weights_text.getValue();
+    //output_sent_weights_text = arg_sent_weights_file.getValue();
+    sent_weights_text = "";
+    output_sent_weights_text = "";
 
 
     // check command line arguments
@@ -364,292 +363,292 @@ int main(int argc, char *argv[])
 
     cerr << "Command line: " << endl;
     cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl;
-	
-	const string sep(" Value: ");
-	cerr << arg_train_text.getDescription() << sep << arg_train_text.getValue() << endl;
-	cerr << arg_train_file.getDescription() << sep << arg_train_file.getValue() << endl;
-	cerr << arg_validation_text.getDescription() << sep << arg_validation_text.getValue() << endl;
-	cerr << arg_validation_file.getDescription() << sep << arg_validation_file.getValue() << endl;
-	cerr << arg_validation_size.getDescription() << sep << arg_validation_size.getValue() << endl;
-	cerr << arg_write_words_file.getDescription() << sep << arg_write_words_file.getValue() << endl;
-	cerr << arg_ngram_size.getDescription() << sep << arg_ngram_size.getValue() << endl;
-	cerr << arg_vocab_size.getDescription() << sep << arg_vocab_size.getValue() << endl;
-	cerr << arg_words_file.getDescription() << sep << arg_words_file.getValue() << endl;
-	cerr << arg_numberize.getDescription() << sep << arg_numberize.getValue() << endl;
-	cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl;
-	cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl;
-	cerr << arg_mmap_file.getDescription() << sep << arg_mmap_file.getValue() << endl;
-	//cerr << arg_sent_weights_text.getDescription() << sep << arg_sent_weights_text.getValue() << endl;
-    }
-    catch (TCLAP::ArgException &e)
-    {
-      cerr << "error: " << e.error() <<  " for arg " << e.argId() << endl;
-      exit(1);
-    }
 
-    // VLF: why is this true?
-    // DC: it's because the vocabulary has to be constructed from the training data only.
-    // If the vocabulary is preset, we can't create the validation data.
-    // - if --numberize 0 is set, then --validation_size cannot be used.
-    // if (!numberize && (validation_size > 0)) {
-    //     cerr <<  "Warning: without setting --numberize to 1, --validation_size cannot be used." << endl;
-    // }
-
-    // Read in training data and validation data
-    // vector<vector<string> > train_data;
-    // readSentFile(train_text, train_data);
-    // @vaswani: No more reading the entire training file into memory
-    // Reading it per line with file io
-    
-    //for (int i=0; i<train_data.size(); i++) {
-    // Go over every line in the file and 
-    // 1. if the !ngramize then you should check if 
-    // we have the correct number of items per line
-    // 2. build the vocabulary if the words file has not
-    // been specified.
-    // Construct vocabulary
-    vocabulary vocab;
-    int start, stop;
-    // Add start stop if the vocabulary has not been supplied
-    if (words_file == "") {
-      vocab.insert_word("<s>");
-	    vocab.insert_word("</s>");
-	    vocab.insert_word("<null>");
-      // warn user that if --numberize is not set, there will be no vocabulary!
-      if (!numberize) {
-          cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" << endl;
-      }
-    }
-    if (mmap_file == false && randomize == true) {
-      cerr<<"Randomize option can only be used with mmap_file = 1"<<endl;
-      exit(1);
+    const string sep(" Value: ");
+    cerr << arg_train_text.getDescription() << sep << arg_train_text.getValue() << endl;
+    cerr << arg_train_file.getDescription() << sep << arg_train_file.getValue() << endl;
+    cerr << arg_validation_text.getDescription() << sep << arg_validation_text.getValue() << endl;
+    cerr << arg_validation_file.getDescription() << sep << arg_validation_file.getValue() << endl;
+    cerr << arg_validation_size.getDescription() << sep << arg_validation_size.getValue() << endl;
+    cerr << arg_write_words_file.getDescription() << sep << arg_write_words_file.getValue() << endl;
+    cerr << arg_ngram_size.getDescription() << sep << arg_ngram_size.getValue() << endl;
+    cerr << arg_vocab_size.getDescription() << sep << arg_vocab_size.getValue() << endl;
+    cerr << arg_words_file.getDescription() << sep << arg_words_file.getValue() << endl;
+    cerr << arg_numberize.getDescription() << sep << arg_numberize.getValue() << endl;
+    cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl;
+    cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl;
+    cerr << arg_mmap_file.getDescription() << sep << arg_mmap_file.getValue() << endl;
+    //cerr << arg_sent_weights_text.getDescription() << sep << arg_sent_weights_text.getValue() << endl;
+  }
+  catch (TCLAP::ArgException &e)
+  {
+    cerr << "error: " << e.error() <<  " for arg " << e.argId() << endl;
+    exit(1);
+  }
+
+  // VLF: why is this true?
+  // DC: it's because the vocabulary has to be constructed from the training data only.
+  // If the vocabulary is preset, we can't create the validation data.
+  // - if --numberize 0 is set, then --validation_size cannot be used.
+  // if (!numberize && (validation_size > 0)) {
+  //     cerr <<  "Warning: without setting --numberize to 1, --validation_size cannot be used." << endl;
+  // }
+
+  // Read in training data and validation data
+  // vector<vector<string> > train_data;
+  // readSentFile(train_text, train_data);
+  // @vaswani: No more reading the entire training file into memory
+  // Reading it per line with file io
+
+  //for (int i=0; i<train_data.size(); i++) {
+  // Go over every line in the file and
+  // 1. if !ngramize, check that each line has
+  // the correct number of items
+  // 2. count words for the vocabulary if the words file
+  // has not been specified.
+  // Construct vocabulary
+  vocabulary vocab;
+  int start, stop;
+  // Add start stop if the vocabulary has not been supplied
+  if (words_file == "") {
+    vocab.insert_word("<s>");
+    vocab.insert_word("</s>");
+    vocab.insert_word("<null>");
+    // warn user that if --numberize is not set, there will be no vocabulary!
+    if (!numberize) {
+      cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" << endl;
     }
-    unordered_map<string,int> count; // For keeping word counts if no supplied vocab
-
-    deque<vector<string> > validation_data;
-    int train_data_size=0;
-    cerr<<"Processed ... ";
-    data_size_t num_tokens=0;
-    
-    ifstream training(train_text.c_str());
-
-    string line;
-    while (getline(training,line)) {
-      train_data_size++;
-      //stringstream lstr(line);
-      vector<string> lstr_items;
-      splitBySpace(line,lstr_items);
-      // if data is already ngramized, set/check ngram_size
-      if (!ngramize) {
-          if (ngram_size > 0) {
-              if (ngram_size != lstr_items.size()) {
-                  cerr << "Error: size of training ngrams does not match specified value of --ngram_size!" << endl;
-              }
-          }
-          // else if --ngram_size has not been specified, set it now
-          else {
-              ngram_size=lstr_items.size();
-          }
+  }
+  if (mmap_file == false && randomize == true) {
+    cerr<<"Randomize option can only be used with mmap_file = 1"<<endl;
+    exit(1);
+  }
+  unordered_map<string,int> count; // For keeping word counts if no supplied vocab
+
+  deque<vector<string> > validation_data;
+  int train_data_size=0;
+  cerr<<"Processed ... ";
+  data_size_t num_tokens=0;
+
+  ifstream training(train_text.c_str());
+
+  string line;
+  while (getline(training,line)) {
+    train_data_size++;
+    //stringstream lstr(line);
+    vector<string> lstr_items;
+    splitBySpace(line,lstr_items);
+    // if data is already ngramized, set/check ngram_size
+    if (!ngramize) {
+      if (ngram_size > 0) {
+        if (ngram_size != lstr_items.size()) {
+          cerr << "Error: size of training ngrams does not match specified value of --ngram_size!" << endl;
+        }
       }
-      if ((train_data_size%100000)==0){
-        cerr<<train_data_size<<" lines ... ";
+      // else if --ngram_size has not been specified, set it now
+      else {
+        ngram_size=lstr_items.size();
       }
-      //break;
-      /*
+    }
+    if ((train_data_size%100000)==0){
+      cerr<<train_data_size<<" lines ... ";
+    }
+    //break;
+    /*
       if (lstr_items.size() ==1) {
-        cerr<<"line :"<<endl;
-        cerr<<line<<endl;
-        cerr<<"The number of items was 1"<<endl;
-        getchar();
-      }
-      */
-      num_tokens += lstr_items.size()+1;
-      if (words_file == "") {
-         for (int j=0; j<lstr_items.size(); j++) {
-              count[lstr_items[j]] += 1; 
-          }
+      cerr<<"line :"<<endl;
+      cerr<<line<<endl;
+      cerr<<"The number of items was 1"<<endl;
+      getchar();
       }
-      // Add to validation set if the validation size
-      // has not been specified
-      if (validation_text == "" && validation_size > 0) {
-        //cerr<<"validation size is "<<validation_data.size()<<endl;
-        if (validation_data.size() == validation_size) {
-          //validation_data.erase(validation_data.begin());
-          validation_data.pop_front();
-        }
-        validation_data.push_back(lstr_items);
+    */
+    num_tokens += lstr_items.size()+1;
+    if (words_file == "") {
+      for (int j=0; j<lstr_items.size(); j++) {
+        count[lstr_items[j]] += 1;
       }
     }
-    cerr<<endl;
-    training.close();
-    //cerr<<"validation size is "<<validation_data.size()<<endl;
-    //getchar();
-    if (validation_data.size() < validation_size) {
-      cerr<<"validation size is "<<validation_data.size()<<endl;
-      cerr << "error: requested validation size is greater than training data size" << endl;
-      exit(1);
+    // Add to validation set if no validation text was given
+    // and a validation size has been specified
+    if (validation_text == "" && validation_size > 0) {
+      //cerr<<"validation size is "<<validation_data.size()<<endl;
+      if (validation_data.size() == validation_size) {
+        //validation_data.erase(validation_data.begin());
+        validation_data.pop_front();
+      }
+      validation_data.push_back(lstr_items);
     }
-    
-    train_data_size -= validation_size; 
-    cerr<<"Training data size is "<<train_data_size<<endl;
-
-    // The items in the validation data have already been counted
-    // Decrementing the counts of those words before building the vocabulary
-    for(int i=0; i<validation_data.size(); i++){
-      num_tokens -= (validation_data[i].size() +1);
-      for (int j=0; j<validation_data[i].size();j++){
-        count[validation_data[i][j]] -= 1;
-        if (count[validation_data[i][j]] == 0) {
-          count.erase(validation_data[i][j]);
-        }
+  }
+  cerr<<endl;
+  training.close();
+  //cerr<<"validation size is "<<validation_data.size()<<endl;
+  //getchar();
+  if (validation_data.size() < validation_size) {
+    cerr<<"validation size is "<<validation_data.size()<<endl;
+    cerr << "error: requested validation size is greater than training data size" << endl;
+    exit(1);
+  }
+
+  train_data_size -= validation_size;
+  cerr<<"Training data size is "<<train_data_size<<endl;
+
+  // The items in the validation data have already been counted, so
+  // decrement the counts of those words before building the vocabulary.
+  for(int i=0; i<validation_data.size(); i++){
+    num_tokens -= (validation_data[i].size() +1);
+    for (int j=0; j<validation_data[i].size();j++){
+      count[validation_data[i][j]] -= 1;
+      if (count[validation_data[i][j]] == 0) {
+        count.erase(validation_data[i][j]);
       }
     }
+  }
 
-    // Getting the top n frequent words for the vocabulary
-    if (words_file == "") {
-      vocab.insert_most_frequent(count, vocab_size);
-      if (vocab.size() < vocab_size) {
-          cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl;
-      }
+  // Getting the top n frequent words for the vocabulary
+  if (words_file == "") {
+    vocab.insert_most_frequent(count, vocab_size);
+    if (vocab.size() < vocab_size) {
+      cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl;
     }
-    //vector<vector<string> > validation_data;
-    if (validation_text != "") {
-        readSentFile(validation_text, validation_data);
-        for (int i=0; i<validation_data.size(); i++) {
-	    // if data is already ngramized, set/check ngram_size
-            if (!ngramize) {
-                // if --ngram_size has been specified, check that it does not conflict with --ngram_size
-                if (ngram_size > 0) {
-                    if (ngram_size != validation_data[i].size()) {
-                        cerr << "Error: size of validation ngrams does not match specified value of --ngram_size!" << endl;
-                    }
-                }
-                // else if --ngram_size has not been specified, set it now
-                else {
-                    ngram_size=validation_data[i].size();
-                }
-            }
+  }
+  //vector<vector<string> > validation_data;
+  if (validation_text != "") {
+    readSentFile(validation_text, validation_data);
+    for (int i=0; i<validation_data.size(); i++) {
+      // if data is already ngramized, set/check ngram_size
+      if (!ngramize) {
+        // if --ngram_size has been specified, check that it does not conflict with the size of the validation ngrams
+        if (ngram_size > 0) {
+          if (ngram_size != validation_data[i].size()) {
+            cerr << "Error: size of validation ngrams does not match specified value of --ngram_size!" << endl;
+          }
         }
+        // else if --ngram_size has not been specified, set it now
+        else {
+          ngram_size=validation_data[i].size();
+        }
+      }
     }
-    //READING SENTENCE WEIGHTS IF THERE ARE ANY
-    vector<float> sent_weights;
-    if (sent_weights_text != "") {
-      cerr<<"Reading sentence weights from "<<sent_weights_text<<endl;
-      ifstream sent_weights_file(sent_weights_text.c_str());
-			string line;
-      readWeightsFile(sent_weights_file,sent_weights);
-			sent_weights_file.close();
-			if (sent_weights_text.size() != train_data_size) {
-				cerr<<"The number of sentence weights does not match the number of training sentences"<<endl;
-			}
+  }
+  //READING SENTENCE WEIGHTS IF THERE ARE ANY
+  vector<float> sent_weights;
+  if (sent_weights_text != "") {
+    cerr<<"Reading sentence weights from "<<sent_weights_text<<endl;
+    ifstream sent_weights_file(sent_weights_text.c_str());
+    string line;
+    readWeightsFile(sent_weights_file,sent_weights);
+    sent_weights_file.close();
+    if (sent_weights_text.size() != train_data_size) {
+      cerr<<"The number of sentence weights does not match the number of training sentences"<<endl;
     }
-		
-    /*
+  }
+
+  /*
     else if (validation_size > 0)
     {
-      // Create validation data
-      if (validation_size > train_data.size())
-      {
-          cerr << "error: requested validation size is greater than training data size" << endl;
-          exit(1);
-      }
-	    validation_data.insert(validation_data.end(), train_data.end()-validation_size, train_data.end());
-	    train_data.resize(train_data.size() - validation_size);
+    // Create validation data
+    if (validation_size > train_data.size())
+    {
+    cerr << "error: requested validation size is greater than training data size" << endl;
+    exit(1);
     }
-    */
-
-    // Construct vocabulary
-    //vocabulary vocab;
-    //int start, stop;
-    
-    // read vocabulary from file
-    if (words_file != "") {
-        vector<string> words;
-        readWordsFile(words_file,words);
-        for(vector<string>::iterator it = words.begin(); it != words.end(); ++it) {
-            vocab.insert_word(*it);
-        }
-
-        // was vocab_size set? if so, verify that it does not conflict with size of vocabulary read from file
-        if (vocab_size > 0) {
-            if (vocab.size() != vocab_size) {
-                cerr << "Error: size of vocabulary file " << vocab.size() << " != --vocab_size " << vocab_size << endl;
-            }
-        }
-        // else, set it to the size of vocabulary read from file
-        else {
-            vocab_size = vocab.size();
-        }
-
+    validation_data.insert(validation_data.end(), train_data.end()-validation_size, train_data.end());
+    train_data.resize(train_data.size() - validation_size);
     }
-    /*
-    // construct vocabulary to contain top <vocab_size> most frequent words; all other words replaced by <unk>
-    else {
-      vocab.insert_word("<s>");
-	    vocab.insert_word("</s>");
-	    vocab.insert_word("<null>");
-
-        // warn user that if --numberize is not set, there will be no vocabulary!
-        if (!numberize) {
-            cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" << endl;
-        }
-        unordered_map<string,int> count;
-        for (int i=0; i<train_data.size(); i++) {
-            for (int j=0; j<train_data[i].size(); j++) {
-                count[train_data[i][j]] += 1; 
-            }
-        }
-
-        vocab.insert_most_frequent(count, vocab_size);
-        if (vocab.size() < vocab_size) {
-            cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl;
-        }
+  */
+
+  // Construct vocabulary
+  //vocabulary vocab;
+  //int start, stop;
+
+  // read vocabulary from file
+  if (words_file != "") {
+    vector<string> words;
+    readWordsFile(words_file,words);
+    for(vector<string>::iterator it = words.begin(); it != words.end(); ++it) {
+      vocab.insert_word(*it);
     }
-    */
 
-    // write vocabulary to file
-    if (write_words_file != "") {
-        cerr << "Writing vocabulary to " << write_words_file << endl;
-        writeWordsFile(vocab.words(), write_words_file);
+    // was vocab_size set? if so, verify that it does not conflict with size of vocabulary read from file
+    if (vocab_size > 0) {
+      if (vocab.size() != vocab_size) {
+        cerr << "Error: size of vocabulary file " << vocab.size() << " != --vocab_size " << vocab_size << endl;
+      }
     }
-
-    // Write out numberized n-grams
-    if (train_file != "")
-    {
-        cerr << "Writing training data to " << train_file << endl;
-        if (mmap_file == true) {
-          writeMmapNgrams(train_text,
-            ngram_size,
-            vocab,
-            numberize,
-            add_start_stop,
-            ngramize,
-            train_file,
-            train_data_size,
-            num_tokens,
-            randomize);
-        } else {
-          writeNgrams(train_text,
-              ngram_size,
-              vocab,
-              numberize,
-              add_start_stop,
-              ngramize,
-              train_file,
-              train_data_size,
-							sent_weights,
-							output_sent_weights_text);
-        }
+    // else, set it to the size of vocabulary read from file
+    else {
+      vocab_size = vocab.size();
     }
-    if (validation_file != "")
-    {
-        cerr << "Writing validation data to " << validation_file << endl;
-        writeNgrams(validation_data,
-            ngram_size,
-            vocab,
-            numberize,
-            add_start_stop,
-            ngramize,
-            validation_file);
+
+  }
+  /*
+  // construct vocabulary to contain top <vocab_size> most frequent words; all other words replaced by <unk>
+  else {
+  vocab.insert_word("<s>");
+  vocab.insert_word("</s>");
+  vocab.insert_word("<null>");
+
+  // warn user that if --numberize is not set, there will be no vocabulary!
+  if (!numberize) {
+  cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" << endl;
+  }
+  unordered_map<string,int> count;
+  for (int i=0; i<train_data.size(); i++) {
+  for (int j=0; j<train_data[i].size(); j++) {
+  count[train_data[i][j]] += 1;
+  }
+  }
+
+  vocab.insert_most_frequent(count, vocab_size);
+  if (vocab.size() < vocab_size) {
+  cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl;
+  }
+  }
+  */
+
+  // write vocabulary to file
+  if (write_words_file != "") {
+    cerr << "Writing vocabulary to " << write_words_file << endl;
+    writeWordsFile(vocab.words(), write_words_file);
+  }
+
+  // Write out numberized n-grams
+  if (train_file != "")
+  {
+    cerr << "Writing training data to " << train_file << endl;
+    if (mmap_file == true) {
+      writeMmapNgrams(train_text,
+                      ngram_size,
+                      vocab,
+                      numberize,
+                      add_start_stop,
+                      ngramize,
+                      train_file,
+                      train_data_size,
+                      num_tokens,
+                      randomize);
+    } else {
+      writeNgrams(train_text,
+                  ngram_size,
+                  vocab,
+                  numberize,
+                  add_start_stop,
+                  ngramize,
+                  train_file,
+                  train_data_size,
+                  sent_weights,
+                  output_sent_weights_text);
     }
+  }
+  if (validation_file != "")
+  {
+    cerr << "Writing validation data to " << validation_file << endl;
+    writeNgrams(validation_data,
+                ngram_size,
+                vocab,
+                numberize,
+                add_start_stop,
+                ngramize,
+                validation_file);
+  }
 }
diff --git a/src/propagator.h b/src/propagator.h
index 9f214de..6344f2f 100644
--- a/src/propagator.h
+++ b/src/propagator.h
@@ -13,360 +13,359 @@ using Eigen::MatrixBase;
 using Eigen::Dynamic;
 
 class propagator {
-    int minibatch_size;
-    model *pnn;
-
-public:
-    Node<Input_word_embeddings> input_layer_node;
-    Node<Linear_layer> first_hidden_linear_node;
-    Node<Activation_function> first_hidden_activation_node;
-    Node<Linear_layer> second_hidden_linear_node;
-    Node<Activation_function> second_hidden_activation_node;
-    Node<Output_word_embeddings> output_layer_node;
-    bool skip_hidden;
-
-public:
-    propagator () : minibatch_size(0), pnn(0) { }
-
-    propagator (model &nn, int minibatch_size)
+  int minibatch_size;
+  model *pnn;
+
+ public:
+  Node<Input_word_embeddings> input_layer_node;
+  Node<Linear_layer> first_hidden_linear_node;
+  Node<Activation_function> first_hidden_activation_node;
+  Node<Linear_layer> second_hidden_linear_node;
+  Node<Activation_function> second_hidden_activation_node;
+  Node<Output_word_embeddings> output_layer_node;
+  bool skip_hidden;
+
+ public:
+  propagator () : minibatch_size(0), pnn(0) { }
+
+  propagator (model &nn, int minibatch_size)
       :
-        pnn(&nn),
-        input_layer_node(&nn.input_layer, minibatch_size),
-	first_hidden_linear_node(&nn.first_hidden_linear, minibatch_size),
-	first_hidden_activation_node(&nn.first_hidden_activation, minibatch_size),
-        second_hidden_linear_node(&nn.second_hidden_linear, minibatch_size),
-	second_hidden_activation_node(&nn.second_hidden_activation, minibatch_size),
-	output_layer_node(&nn.output_layer, minibatch_size),
-	minibatch_size(minibatch_size)
-    {
-        skip_hidden = (nn.num_hidden == 0);
-    }
+      pnn(&nn),
+      input_layer_node(&nn.input_layer, minibatch_size),
+      first_hidden_linear_node(&nn.first_hidden_linear, minibatch_size),
+      first_hidden_activation_node(&nn.first_hidden_activation, minibatch_size),
+      second_hidden_linear_node(&nn.second_hidden_linear, minibatch_size),
+      second_hidden_activation_node(&nn.second_hidden_activation, minibatch_size),
+      output_layer_node(&nn.output_layer, minibatch_size),
+      minibatch_size(minibatch_size)
+  {
+    skip_hidden = (nn.num_hidden == 0);
+  }
 
-    // This must be called if the underlying model is resized.
-    void resize(int minibatch_size) {
-      this->minibatch_size = minibatch_size;
-      input_layer_node.resize(minibatch_size);
-      first_hidden_linear_node.resize(minibatch_size);
-      first_hidden_activation_node.resize(minibatch_size);
-      second_hidden_linear_node.resize(minibatch_size);
-      second_hidden_activation_node.resize(minibatch_size);
-      output_layer_node.resize(minibatch_size);
-    }
+  // This must be called if the underlying model is resized.
+  void resize(int minibatch_size) {
+    this->minibatch_size = minibatch_size;
+    input_layer_node.resize(minibatch_size);
+    first_hidden_linear_node.resize(minibatch_size);
+    first_hidden_activation_node.resize(minibatch_size);
+    second_hidden_linear_node.resize(minibatch_size);
+    second_hidden_activation_node.resize(minibatch_size);
+    output_layer_node.resize(minibatch_size);
+  }
 
-    void resize() { resize(minibatch_size); }
+  void resize() { resize(minibatch_size); }
 
-    template <typename Derived>
-    void fProp(const MatrixBase<Derived> &data)
+  template <typename Derived>
+  void fProp(const MatrixBase<Derived> &data)
+  {
+    if (!pnn->premultiplied)
     {
-        if (!pnn->premultiplied)
-	{
-            start_timer(0);
-	    input_layer_node.param->fProp(data, input_layer_node.fProp_matrix);
-	    stop_timer(0);
-	    
-	    start_timer(1);
-	    first_hidden_linear_node.param->fProp(input_layer_node.fProp_matrix, 
-						  first_hidden_linear_node.fProp_matrix);
-	} 
-	else
-	{
-	    int n_inputs = first_hidden_linear_node.param->n_inputs();
-	    USCMatrix<double> sparse_data;
-	    input_layer_node.param->munge(data, sparse_data);
-
-	    start_timer(1);
-	    first_hidden_linear_node.param->fProp(sparse_data,
-						  first_hidden_linear_node.fProp_matrix);
-	}
-	first_hidden_activation_node.param->fProp(first_hidden_linear_node.fProp_matrix,
-						  first_hidden_activation_node.fProp_matrix);
-  //std::cerr<<"in fprop first hidden activation node fprop is "<<first_hidden_activation_node.fProp_matrix<<std::endl;
-  //std::getchar();
-	stop_timer(1);
-    
-
-        if (!skip_hidden) {
-	start_timer(2);
-	second_hidden_linear_node.param->fProp(first_hidden_activation_node.fProp_matrix,
-					       second_hidden_linear_node.fProp_matrix);
-	second_hidden_activation_node.param->fProp(second_hidden_linear_node.fProp_matrix,
-						   second_hidden_activation_node.fProp_matrix);
-	stop_timer(2);
-        }
-
-	// The propagation stops here because the last layer is very expensive.
-    }
+      start_timer(0);
+      input_layer_node.param->fProp(data, input_layer_node.fProp_matrix);
+      stop_timer(0);
 
-    // Dense version (for standard log-likelihood)
-    template <typename DerivedIn, typename DerivedOut>
-    void bProp(const MatrixBase<DerivedIn> &data,
-	       const MatrixBase<DerivedOut> &output,
-	       double learning_rate,
-         double momentum,
-         double L2_reg,
-         std::string &parameter_update,
-         double conditioning_constant,
-         double decay) 
+      start_timer(1);
+      first_hidden_linear_node.param->fProp(input_layer_node.fProp_matrix,
+                                            first_hidden_linear_node.fProp_matrix);
+    }
+    else
     {
-        // Output embedding layer
-
-        start_timer(7);
-        output_layer_node.param->bProp(output,
-				       output_layer_node.bProp_matrix);
-	stop_timer(7);
-	
-	start_timer(8);
-  Node<Activation_function> & final_hidden_activation_node = skip_hidden ? first_hidden_activation_node : second_hidden_activation_node;
-  if (parameter_update == "SGD") {
-    output_layer_node.param->computeGradient(final_hidden_activation_node.fProp_matrix,
-               output,
-               learning_rate,
-               momentum);
-  } else if (parameter_update == "ADA") {
-    output_layer_node.param->computeGradientAdagrad(final_hidden_activation_node.fProp_matrix,
-               output,
-               learning_rate);
-  } else if (parameter_update == "ADAD") {
-    //std::cerr<<"Adadelta gradient"<<endl;
-    int current_minibatch_size = final_hidden_activation_node.fProp_matrix.cols();
-    output_layer_node.param->computeGradientAdadelta(final_hidden_activation_node.fProp_matrix,
-               output,
-               1.0/current_minibatch_size,
-               conditioning_constant,
-               decay);
-  } else {
-    std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl;
-  }
-	stop_timer(8);
-
-	bPropRest(data, 
-      learning_rate,
-      momentum,
-      L2_reg,
-      parameter_update,
-      conditioning_constant,
-      decay);
+      int n_inputs = first_hidden_linear_node.param->n_inputs();
+      USCMatrix<double> sparse_data;
+      input_layer_node.param->munge(data, sparse_data);
+
+      start_timer(1);
+      first_hidden_linear_node.param->fProp(sparse_data,
+                                            first_hidden_linear_node.fProp_matrix);
+    }
+    first_hidden_activation_node.param->fProp(first_hidden_linear_node.fProp_matrix,
+                                              first_hidden_activation_node.fProp_matrix);
+    //std::cerr<<"in fprop first hidden activation node fprop is "<<first_hidden_activation_node.fProp_matrix<<std::endl;
+    //std::getchar();
+    stop_timer(1);
+
+
+    if (!skip_hidden) {
+      start_timer(2);
+      second_hidden_linear_node.param->fProp(first_hidden_activation_node.fProp_matrix,
+                                             second_hidden_linear_node.fProp_matrix);
+      second_hidden_activation_node.param->fProp(second_hidden_linear_node.fProp_matrix,
+                                                 second_hidden_activation_node.fProp_matrix);
+      stop_timer(2);
     }
 
-    // Sparse version (for NCE log-likelihood)
-    template <typename DerivedIn, typename DerivedOutI, typename DerivedOutV>
-    void bProp(const MatrixBase<DerivedIn> &data,
-	       const MatrixBase<DerivedOutI> &samples,
-         const MatrixBase<DerivedOutV> &weights,
-	       double learning_rate,
-         double momentum,
-         double L2_reg,
-         std::string &parameter_update,
-         double conditioning_constant,
-         double decay) 
-    {
+    // The propagation stops here because the last layer is very expensive.
+  }
+
+  // Dense version (for standard log-likelihood)
+  template <typename DerivedIn, typename DerivedOut>
+  void bProp(const MatrixBase<DerivedIn> &data,
+             const MatrixBase<DerivedOut> &output,
+             double learning_rate,
+             double momentum,
+             double L2_reg,
+             std::string &parameter_update,
+             double conditioning_constant,
+             double decay)
+  {
+    // Output embedding layer
+
+    start_timer(7);
+    output_layer_node.param->bProp(output,
+                                   output_layer_node.bProp_matrix);
+    stop_timer(7);
+
+    start_timer(8);
+    Node<Activation_function> & final_hidden_activation_node = skip_hidden ? first_hidden_activation_node : second_hidden_activation_node;
+    if (parameter_update == "SGD") {
+      output_layer_node.param->computeGradient(final_hidden_activation_node.fProp_matrix,
+                                               output,
+                                               learning_rate,
+                                               momentum);
+    } else if (parameter_update == "ADA") {
+      output_layer_node.param->computeGradientAdagrad(final_hidden_activation_node.fProp_matrix,
+                                                      output,
+                                                      learning_rate);
+    } else if (parameter_update == "ADAD") {
+      //std::cerr<<"Adadelta gradient"<<endl;
+      int current_minibatch_size = final_hidden_activation_node.fProp_matrix.cols();
+      output_layer_node.param->computeGradientAdadelta(final_hidden_activation_node.fProp_matrix,
+                                                       output,
+                                                       1.0/current_minibatch_size,
+                                                       conditioning_constant,
+                                                       decay);
+    } else {
+      std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl;
+    }
+    stop_timer(8);
 
-        // Output embedding layer
-
-        start_timer(7);
-        output_layer_node.param->bProp(samples,
-            weights, 
-				    output_layer_node.bProp_matrix);
-	stop_timer(7);
-	
-
-	start_timer(8);
-  Node<Activation_function> & final_hidden_activation_node = skip_hidden ? first_hidden_activation_node : second_hidden_activation_node;
-  if (parameter_update == "SGD") {
-    output_layer_node.param->computeGradient(final_hidden_activation_node.fProp_matrix,
-               samples,
-               weights,
-               learning_rate,
-               momentum);
-  } else if (parameter_update == "ADA") {
-    output_layer_node.param->computeGradientAdagrad(final_hidden_activation_node.fProp_matrix,
-               samples,
-               weights,
-               learning_rate);
-  } else if (parameter_update == "ADAD") {
-    int current_minibatch_size = final_hidden_activation_node.fProp_matrix.cols();
-    //std::cerr<<"Adadelta gradient"<<endl;
-    output_layer_node.param->computeGradientAdadelta(final_hidden_activation_node.fProp_matrix,
-               samples,
-               weights,
-               1.0/current_minibatch_size,
-               conditioning_constant,
-               decay);
-  } else {
-    std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl;
+    bPropRest(data,
+              learning_rate,
+              momentum,
+              L2_reg,
+              parameter_update,
+              conditioning_constant,
+              decay);
   }
 
-	stop_timer(8);
+  // Sparse version (for NCE log-likelihood)
+  template <typename DerivedIn, typename DerivedOutI, typename DerivedOutV>
+  void bProp(const MatrixBase<DerivedIn> &data,
+             const MatrixBase<DerivedOutI> &samples,
+             const MatrixBase<DerivedOutV> &weights,
+             double learning_rate,
+             double momentum,
+             double L2_reg,
+             std::string &parameter_update,
+             double conditioning_constant,
+             double decay)
+  {
 
-	bPropRest(data,
-      learning_rate,
-      momentum,
-      L2_reg,
-      parameter_update,
-      conditioning_constant,
-      decay);
+    // Output embedding layer
+
+    start_timer(7);
+    output_layer_node.param->bProp(samples,
+                                   weights,
+                                   output_layer_node.bProp_matrix);
+    stop_timer(7);
+
+
+    start_timer(8);
+    Node<Activation_function> & final_hidden_activation_node = skip_hidden ? first_hidden_activation_node : second_hidden_activation_node;
+    if (parameter_update == "SGD") {
+      output_layer_node.param->computeGradient(final_hidden_activation_node.fProp_matrix,
+                                               samples,
+                                               weights,
+                                               learning_rate,
+                                               momentum);
+    } else if (parameter_update == "ADA") {
+      output_layer_node.param->computeGradientAdagrad(final_hidden_activation_node.fProp_matrix,
+                                                      samples,
+                                                      weights,
+                                                      learning_rate);
+    } else if (parameter_update == "ADAD") {
+      int current_minibatch_size = final_hidden_activation_node.fProp_matrix.cols();
+      //std::cerr<<"Adadelta gradient"<<endl;
+      output_layer_node.param->computeGradientAdadelta(final_hidden_activation_node.fProp_matrix,
+                                                       samples,
+                                                       weights,
+                                                       1.0/current_minibatch_size,
+                                                       conditioning_constant,
+                                                       decay);
+    } else {
+      std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl;
     }
 
-private:
-    template <typename DerivedIn>
-    void bPropRest(const MatrixBase<DerivedIn> &data,
-		   double learning_rate, double momentum, double L2_reg,
-       std::string &parameter_update,
-       double conditioning_constant,
-       double decay) 
-    {
-	// Second hidden layer
+    stop_timer(8);
 
+    bPropRest(data,
+              learning_rate,
+              momentum,
+              L2_reg,
+              parameter_update,
+              conditioning_constant,
+              decay);
+  }
 
-  
-  // All the compute gradient functions are together and the backprop
-  // functions are together
-  ////////BACKPROP////////////
-        start_timer(9);
-  if (skip_hidden)
+ private:
+  template <typename DerivedIn>
+  void bPropRest(const MatrixBase<DerivedIn> &data,
+                 double learning_rate, double momentum, double L2_reg,
+                 std::string &parameter_update,
+                 double conditioning_constant,
+                 double decay)
   {
-        start_timer(9);
-        first_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix,
+    // Second hidden layer
+
+
+
+    // All the compute gradient functions are together and the backprop
+    // functions are together
+    ////////BACKPROP////////////
+    start_timer(9);
+    if (skip_hidden)
+    {
+      start_timer(9);
+      first_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix,
                                                 first_hidden_activation_node.bProp_matrix,
                                                 first_hidden_linear_node.fProp_matrix,
                                                 first_hidden_activation_node.fProp_matrix);
 
-        first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix,
-                                                first_hidden_linear_node.bProp_matrix);
-        stop_timer(9);
+      first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix,
+                                            first_hidden_linear_node.bProp_matrix);
+      stop_timer(9);
 
-  }
-  else
-  {
-        second_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix,
-                                           second_hidden_activation_node.bProp_matrix,
-                                           second_hidden_linear_node.fProp_matrix,
-                                           second_hidden_activation_node.fProp_matrix);
+    }
+    else
+    {
+      second_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix,
+                                                 second_hidden_activation_node.bProp_matrix,
+                                                 second_hidden_linear_node.fProp_matrix,
+                                                 second_hidden_activation_node.fProp_matrix);
 
 
-	second_hidden_linear_node.param->bProp(second_hidden_activation_node.bProp_matrix,
-					       second_hidden_linear_node.bProp_matrix);
-	stop_timer(9);
+      second_hidden_linear_node.param->bProp(second_hidden_activation_node.bProp_matrix,
+                                             second_hidden_linear_node.bProp_matrix);
+      stop_timer(9);
 
-	start_timer(11);
-	first_hidden_activation_node.param->bProp(second_hidden_linear_node.bProp_matrix,
-						  first_hidden_activation_node.bProp_matrix,
-						  first_hidden_linear_node.fProp_matrix,
-						  first_hidden_activation_node.fProp_matrix);
+      start_timer(11);
+      first_hidden_activation_node.param->bProp(second_hidden_linear_node.bProp_matrix,
+                                                first_hidden_activation_node.bProp_matrix,
+                                                first_hidden_linear_node.fProp_matrix,
+                                                first_hidden_activation_node.fProp_matrix);
 
-        first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix,
-					      first_hidden_linear_node.bProp_matrix);
-	stop_timer(11);
-  }
-  //std::cerr<<"First hidden layer node backprop matrix is"<<first_hidden_linear_node.bProp_matrix<<std::endl;
-  //std::getchar();
-  ////COMPUTE GRADIENT/////////
-  if (parameter_update == "SGD") {
-    if (!skip_hidden)
-    {
-    start_timer(10);
-    second_hidden_linear_node.param->computeGradient(second_hidden_activation_node.bProp_matrix,
-                 first_hidden_activation_node.fProp_matrix,
-                 learning_rate,
-                 momentum,
-                 L2_reg);
-    stop_timer(10);
+      first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix,
+                                            first_hidden_linear_node.bProp_matrix);
+      stop_timer(11);
     }
-
-    // First hidden layer
-
-    
-    start_timer(12);
-    first_hidden_linear_node.param->computeGradient(first_hidden_activation_node.bProp_matrix,
-                input_layer_node.fProp_matrix,
-                learning_rate, momentum, L2_reg);
-    stop_timer(12);
-
-    // Input word embeddings
-    
-    start_timer(13);
-    input_layer_node.param->computeGradient(first_hidden_linear_node.bProp_matrix,
-              data,
-              learning_rate, momentum, L2_reg);
-    stop_timer(13);
-  } else if (parameter_update == "ADA") {
-    if (!skip_hidden)
-    {
-    start_timer(10);
-    second_hidden_linear_node.param->computeGradientAdagrad(second_hidden_activation_node.bProp_matrix,
-                 first_hidden_activation_node.fProp_matrix,
-                 learning_rate,
-                 L2_reg);
-    stop_timer(10);
+    //std::cerr<<"First hidden layer node backprop matrix is"<<first_hidden_linear_node.bProp_matrix<<std::endl;
+    //std::getchar();
+    ////COMPUTE GRADIENT/////////
+    if (parameter_update == "SGD") {
+      if (!skip_hidden)
+      {
+        start_timer(10);
+        second_hidden_linear_node.param->computeGradient(second_hidden_activation_node.bProp_matrix,
+                                                         first_hidden_activation_node.fProp_matrix,
+                                                         learning_rate,
+                                                         momentum,
+                                                         L2_reg);
+        stop_timer(10);
+      }
+
+      // First hidden layer
+
+
+      start_timer(12);
+      first_hidden_linear_node.param->computeGradient(first_hidden_activation_node.bProp_matrix,
+                                                      input_layer_node.fProp_matrix,
+                                                      learning_rate, momentum, L2_reg);
+      stop_timer(12);
+
+      // Input word embeddings
+
+      start_timer(13);
+      input_layer_node.param->computeGradient(first_hidden_linear_node.bProp_matrix,
+                                              data,
+                                              learning_rate, momentum, L2_reg);
+      stop_timer(13);
+    } else if (parameter_update == "ADA") {
+      if (!skip_hidden)
+      {
+        start_timer(10);
+        second_hidden_linear_node.param->computeGradientAdagrad(second_hidden_activation_node.bProp_matrix,
+                                                                first_hidden_activation_node.fProp_matrix,
+                                                                learning_rate,
+                                                                L2_reg);
+        stop_timer(10);
+      }
+
+      // First hidden layer
+
+
+      start_timer(12);
+      first_hidden_linear_node.param->computeGradientAdagrad(first_hidden_activation_node.bProp_matrix,
+                                                             input_layer_node.fProp_matrix,
+                                                             learning_rate,
+                                                             L2_reg);
+      stop_timer(12);
+
+      // Input word embeddings
+
+      start_timer(13);
+      input_layer_node.param->computeGradientAdagrad(first_hidden_linear_node.bProp_matrix,
+                                                     data,
+                                                     learning_rate,
+                                                     L2_reg);
+      stop_timer(13);
+    } else if (parameter_update == "ADAD") {
+      int current_minibatch_size = first_hidden_activation_node.fProp_matrix.cols();
+      //std::cerr<<"Adadelta gradient"<<endl;
+      if (!skip_hidden)
+      {
+        start_timer(10);
+        second_hidden_linear_node.param->computeGradientAdadelta(second_hidden_activation_node.bProp_matrix,
+                                                                 first_hidden_activation_node.fProp_matrix,
+                                                                 1.0/current_minibatch_size,
+                                                                 L2_reg,
+                                                                 conditioning_constant,
+                                                                 decay);
+        stop_timer(10);
+      }
+      //std::cerr<<"Finished gradient for second hidden linear layer"<<std::endl;
+
+      // First hidden layer
+
+
+      start_timer(12);
+      first_hidden_linear_node.param->computeGradientAdadelta(first_hidden_activation_node.bProp_matrix,
+                                                              input_layer_node.fProp_matrix,
+                                                              1.0/current_minibatch_size,
+                                                              L2_reg,
+                                                              conditioning_constant,
+                                                              decay);
+      stop_timer(12);
+
+      //std::cerr<<"Finished gradient for first hidden linear layer"<<std::endl;
+      // Input word embeddings
+
+      start_timer(13);
+      input_layer_node.param->computeGradientAdadelta(first_hidden_linear_node.bProp_matrix,
+                                                      data,
+                                                      1.0/current_minibatch_size,
+                                                      L2_reg,
+                                                      conditioning_constant,
+                                                      decay);
+      stop_timer(13);
+
+      //std::cerr<<"Finished gradient for first input layer"<<std::endl;
+    } else {
+      std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl;
     }
 
-    // First hidden layer
-
-    
-    start_timer(12);
-    first_hidden_linear_node.param->computeGradientAdagrad(first_hidden_activation_node.bProp_matrix,
-                input_layer_node.fProp_matrix,
-                learning_rate,
-                L2_reg);
-    stop_timer(12);
-
-    // Input word embeddings
-     
-    start_timer(13);
-    input_layer_node.param->computeGradientAdagrad(first_hidden_linear_node.bProp_matrix,
-              data,
-              learning_rate, 
-              L2_reg);
-    stop_timer(13);
-  } else if (parameter_update == "ADAD") {
-    int current_minibatch_size = first_hidden_activation_node.fProp_matrix.cols();
-    //std::cerr<<"Adadelta gradient"<<endl;
-    if (!skip_hidden)
-    {
-    start_timer(10);
-    second_hidden_linear_node.param->computeGradientAdadelta(second_hidden_activation_node.bProp_matrix,
-                 first_hidden_activation_node.fProp_matrix,
-                 1.0/current_minibatch_size,
-                 L2_reg,
-                 conditioning_constant,
-                 decay);
-    stop_timer(10);
-    }
-    //std::cerr<<"Finished gradient for second hidden linear layer"<<std::endl;
-
-    // First hidden layer
-
-    
-    start_timer(12);
-    first_hidden_linear_node.param->computeGradientAdadelta(first_hidden_activation_node.bProp_matrix,
-                input_layer_node.fProp_matrix,
-                1.0/current_minibatch_size,
-                L2_reg,
-                conditioning_constant,
-                decay);
-    stop_timer(12);
-
-    //std::cerr<<"Finished gradient for first hidden linear layer"<<std::endl;
-    // Input word embeddings
-     
-    start_timer(13);
-    input_layer_node.param->computeGradientAdadelta(first_hidden_linear_node.bProp_matrix,
-              data,
-              1.0/current_minibatch_size, 
-              L2_reg,
-              conditioning_constant,
-              decay);
-    stop_timer(13);
-  
-    //std::cerr<<"Finished gradient for first input layer"<<std::endl;
-  } else {
-    std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl;
   }
-
-    }
 };
 
 } // namespace nplm
 
 #endif
-
diff --git a/src/testNeuralLM.cpp b/src/testNeuralLM.cpp
index 4f3713d..abaab34 100644
--- a/src/testNeuralLM.cpp
+++ b/src/testNeuralLM.cpp
@@ -6,7 +6,6 @@
 #include <tclap/CmdLine.h>
 
 #include <Eigen/Core>
-//#include <../3rdparty/Eigen/Dense>
 #include <Eigen/Dense>
 
 #include "param.h"
@@ -21,174 +20,174 @@ using namespace Eigen;
 using namespace nplm;
 
 void score(neuralLM &lm, int minibatch_size, vector<int>& start, vector< vector<int> > &ngrams,
-	   vector<double> &out) {
-    if (ngrams.size() == 0) return;
-    int ngram_size = ngrams[0].size();
-
-    if (minibatch_size == 0)
+           vector<double> &out) {
+  if (ngrams.size() == 0) return;
+  int ngram_size = ngrams[0].size();
+
+  if (minibatch_size == 0)
+  {
+    // Score one n-gram at a time. This is how the LM would be queried from a decoder.
+    for (int sent_id=0; sent_id<start.size()-1; sent_id++)
     {
-        // Score one n-gram at a time. This is how the LM would be queried from a decoder.
-        for (int sent_id=0; sent_id<start.size()-1; sent_id++)
-	{	  
-	    double sent_log_prob = 0.0;
-	    for (int j=start[sent_id]; j<start[sent_id+1]; j++) 
-	        sent_log_prob += lm.lookup_ngram(ngrams[j]);
-	    out.push_back(sent_log_prob);
-	}
+      double sent_log_prob = 0.0;
+      for (int j=start[sent_id]; j<start[sent_id+1]; j++)
+        sent_log_prob += lm.lookup_ngram(ngrams[j]);
+      out.push_back(sent_log_prob);
     }
-    else
+  }
+  else
+  {
+    // Score a whole minibatch at a time.
+    Matrix<double,1,Dynamic> log_probs(ngrams.size());
+
+    Matrix<int,Dynamic,Dynamic> minibatch(ngram_size, minibatch_size);
+    minibatch.setZero();
+    for (int test_id = 0; test_id < ngrams.size(); test_id += minibatch_size)
     {
-	// Score a whole minibatch at a time.
-        Matrix<double,1,Dynamic> log_probs(ngrams.size());
-
-        Matrix<int,Dynamic,Dynamic> minibatch(ngram_size, minibatch_size);
-	minibatch.setZero();
-        for (int test_id = 0; test_id < ngrams.size(); test_id += minibatch_size)
-	{
-	    int current_minibatch_size = minibatch_size<ngrams.size()-test_id ? minibatch_size : ngrams.size()-test_id;
-	    for (int j=0; j<current_minibatch_size; j++)
-	        minibatch.col(j) = Map< Matrix<int,Dynamic,1> > (ngrams[test_id+j].data(), ngram_size);
-	    lm.lookup_ngram(minibatch.leftCols(current_minibatch_size), log_probs.middleCols(test_id, current_minibatch_size));
-	}
-
-	for (int sent_id=0; sent_id<start.size()-1; sent_id++)
-	{
-	    double sent_log_prob = 0.0;
-	    for (int j=start[sent_id]; j<start[sent_id+1]; j++)
-	        sent_log_prob += log_probs[j];
-	    out.push_back(sent_log_prob);
-	}
+      int current_minibatch_size = minibatch_size<ngrams.size()-test_id ? minibatch_size : ngrams.size()-test_id;
+      for (int j=0; j<current_minibatch_size; j++)
+        minibatch.col(j) = Map< Matrix<int,Dynamic,1> > (ngrams[test_id+j].data(), ngram_size);
+      lm.lookup_ngram(minibatch.leftCols(current_minibatch_size), log_probs.middleCols(test_id, current_minibatch_size));
     }
+
+    for (int sent_id=0; sent_id<start.size()-1; sent_id++)
+    {
+      double sent_log_prob = 0.0;
+      for (int j=start[sent_id]; j<start[sent_id+1]; j++)
+        sent_log_prob += log_probs[j];
+      out.push_back(sent_log_prob);
+    }
+  }
 }
 
-int main (int argc, char *argv[]) 
+int main (int argc, char *argv[])
 {
-    param myParam;
-    bool normalization;
-    bool numberize, ngramize, add_start_stop;
+  param myParam;
+  bool normalization;
+  bool numberize, ngramize, add_start_stop;
 
-    try {
-      // program options //
-      CmdLine cmd("Tests a two-layer neural probabilistic language model.", ' ' , "0.1");
+  try {
+    // program options //
+    CmdLine cmd("Tests a two-layer neural probabilistic language model.", ' ' , "0.1");
 
-      ValueArg<int> num_threads("", "num_threads", "Number of threads. Default: maximum.", false, 0, "int", cmd);
-      ValueArg<int> minibatch_size("", "minibatch_size", "Minibatch size. Default: none.", false, 0, "int", cmd);
+    ValueArg<int> num_threads("", "num_threads", "Number of threads. Default: maximum.", false, 0, "int", cmd);
+    ValueArg<int> minibatch_size("", "minibatch_size", "Minibatch size. Default: none.", false, 0, "int", cmd);
 
-      ValueArg<bool> arg_ngramize("", "ngramize", "If true, convert lines to ngrams. Default: true.", false, true, "bool", cmd);
-      ValueArg<bool> arg_numberize("", "numberize", "If true, convert words to numbers. Default: true.", false, true, "bool", cmd);
-      ValueArg<bool> arg_add_start_stop("", "add_start_stop", "If true, prepend <s> and append </s>. Default: true.", false, true, "bool", cmd);
+    ValueArg<bool> arg_ngramize("", "ngramize", "If true, convert lines to ngrams. Default: true.", false, true, "bool", cmd);
+    ValueArg<bool> arg_numberize("", "numberize", "If true, convert words to numbers. Default: true.", false, true, "bool", cmd);
+    ValueArg<bool> arg_add_start_stop("", "add_start_stop", "If true, prepend <s> and append </s>. Default: true.", false, true, "bool", cmd);
 
-      ValueArg<bool> arg_normalization("", "normalization", "Normalize probabilities. 1 = yes, 0 = no. Default: 0.", false, 0, "bool", cmd);
+    ValueArg<bool> arg_normalization("", "normalization", "Normalize probabilities. 1 = yes, 0 = no. Default: 0.", false, 0, "bool", cmd);
 
-      ValueArg<string> arg_test_file("", "test_file", "Test file (one tokenized sentence per line).", true, "", "string", cmd);
+    ValueArg<string> arg_test_file("", "test_file", "Test file (one tokenized sentence per line).", true, "", "string", cmd);
 
-      ValueArg<string> arg_model_file("", "model_file", "Language model file.", true, "", "string", cmd);
+    ValueArg<string> arg_model_file("", "model_file", "Language model file.", true, "", "string", cmd);
 
-      cmd.parse(argc, argv);
+    cmd.parse(argc, argv);
 
-      myParam.model_file = arg_model_file.getValue();
-      myParam.test_file = arg_test_file.getValue();
+    myParam.model_file = arg_model_file.getValue();
+    myParam.test_file = arg_test_file.getValue();
 
-      normalization = arg_normalization.getValue();
-      numberize = arg_numberize.getValue();
-      ngramize = arg_ngramize.getValue();
-      add_start_stop = arg_add_start_stop.getValue();
+    normalization = arg_normalization.getValue();
+    numberize = arg_numberize.getValue();
+    ngramize = arg_ngramize.getValue();
+    add_start_stop = arg_add_start_stop.getValue();
 
-      myParam.minibatch_size = minibatch_size.getValue();
-      myParam.num_threads = num_threads.getValue();
+    myParam.minibatch_size = minibatch_size.getValue();
+    myParam.num_threads = num_threads.getValue();
 
-      cerr << "Command line: " << endl;
-      cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl;
-      
-      const string sep(" Value: ");
-      cerr << arg_test_file.getDescription() << sep << arg_test_file.getValue() << endl;
-      cerr << arg_model_file.getDescription() << sep << arg_model_file.getValue() << endl;
+    cerr << "Command line: " << endl;
+    cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl;
 
-      cerr << arg_normalization.getDescription() << sep << arg_normalization.getValue() << endl;
-      cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl;
-      cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl;
+    const string sep(" Value: ");
+    cerr << arg_test_file.getDescription() << sep << arg_test_file.getValue() << endl;
+    cerr << arg_model_file.getDescription() << sep << arg_model_file.getValue() << endl;
 
-      cerr << minibatch_size.getDescription() << sep << minibatch_size.getValue() << endl;
-      cerr << num_threads.getDescription() << sep << num_threads.getValue() << endl;
-    }
-    catch (TCLAP::ArgException &e)
-    {
-      cerr << "error: " << e.error() <<  " for arg " << e.argId() << endl;
-      exit(1);
-    }
+    cerr << arg_normalization.getDescription() << sep << arg_normalization.getValue() << endl;
+    cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl;
+    cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl;
 
-    myParam.num_threads = setup_threads(myParam.num_threads);
+    cerr << minibatch_size.getDescription() << sep << minibatch_size.getValue() << endl;
+    cerr << num_threads.getDescription() << sep << num_threads.getValue() << endl;
+  }
+  catch (TCLAP::ArgException &e)
+  {
+    cerr << "error: " << e.error() <<  " for arg " << e.argId() << endl;
+    exit(1);
+  }
 
-    ///// Create language model
+  myParam.num_threads = setup_threads(myParam.num_threads);
 
-    neuralLM lm;
-    lm.read(myParam.model_file);
-    lm.set_normalization(normalization);
-    lm.set_log_base(10);
-    lm.set_cache(1048576);
-    int ngram_size = lm.get_order();
-    int minibatch_size = myParam.minibatch_size;
-    if (minibatch_size)
-        lm.set_width(minibatch_size);
+  ///// Create language model
 
-    ///// Read test data
-
-    ifstream test_file(myParam.test_file.c_str());
-    if (!test_file)
-    {
-	cerr << "error: could not open " << myParam.test_file << endl;
-	exit(1);
-    }
-    string line;
+  neuralLM lm;
+  lm.read(myParam.model_file);
+  lm.set_normalization(normalization);
+  lm.set_log_base(10);
+  lm.set_cache(1048576);
+  int ngram_size = lm.get_order();
+  int minibatch_size = myParam.minibatch_size;
+  if (minibatch_size)
+    lm.set_width(minibatch_size);
 
-    vector<int> start;
-    vector<vector<int> > ngrams;
+  ///// Read test data
 
-    while (getline(test_file, line))
-    {
-        vector<string> words;
-        splitBySpace(line, words);
+  ifstream test_file(myParam.test_file.c_str());
+  if (!test_file)
+  {
+    cerr << "error: could not open " << myParam.test_file << endl;
+    exit(1);
+  }
+  string line;
 
-	vector<vector<int> > sent_ngrams;
-	preprocessWords(words, sent_ngrams, ngram_size, lm.get_vocabulary(), numberize, add_start_stop, ngramize);
+  vector<int> start;
+  vector<vector<int> > ngrams;
 
-	start.push_back(ngrams.size());
-	copy(sent_ngrams.begin(), sent_ngrams.end(), back_inserter(ngrams));
-    }
-    start.push_back(ngrams.size());
+  while (getline(test_file, line))
+  {
+    vector<string> words;
+    splitBySpace(line, words);
 
-    int num_threads = 1;
-    vector< vector<double> > sent_log_probs(num_threads);
+    vector<vector<int> > sent_ngrams;
+    preprocessWords(words, sent_ngrams, ngram_size, lm.get_vocabulary(), numberize, add_start_stop, ngramize);
 
-    /*
-    // Test thread safety
-    boost::thread_group tg;
-    for (int t=0; t < num_threads; t++) {
-      tg.create_thread(boost::bind(score, lm, minibatch_size, boost::ref(start), boost::ref(ngrams), boost::ref(sent_log_probs[t]))); // copy lm
-    }
-    tg.join_all();
-    */
-    score(lm, minibatch_size, start, ngrams, sent_log_probs[0]);
-
-    vector<double> log_likelihood(num_threads);
-    std::fill(log_likelihood.begin(), log_likelihood.end(), 0.0);
-    for (int i=0; i<sent_log_probs[0].size(); i++) {
-        for (int t=0; t<num_threads; t++)
-	    cout << sent_log_probs[t][i] << "\t";
-	cout << endl;
-        for (int t=0; t<num_threads; t++)
-	log_likelihood[t] += sent_log_probs[t][i];
-    }
-    
-    cerr << "Test log10-likelihood: ";
+    start.push_back(ngrams.size());
+    copy(sent_ngrams.begin(), sent_ngrams.end(), back_inserter(ngrams));
+  }
+  start.push_back(ngrams.size());
+
+  int num_threads = 1;
+  vector< vector<double> > sent_log_probs(num_threads);
+
+  /*
+  // Test thread safety
+  boost::thread_group tg;
+  for (int t=0; t < num_threads; t++) {
+  tg.create_thread(boost::bind(score, lm, minibatch_size, boost::ref(start), boost::ref(ngrams), boost::ref(sent_log_probs[t]))); // copy lm
+  }
+  tg.join_all();
+  */
+  score(lm, minibatch_size, start, ngrams, sent_log_probs[0]);
+
+  vector<double> log_likelihood(num_threads);
+  std::fill(log_likelihood.begin(), log_likelihood.end(), 0.0);
+  for (int i=0; i<sent_log_probs[0].size(); i++) {
     for (int t=0; t<num_threads; t++)
-      cerr << log_likelihood[t] << " ";
-    cerr << endl;
-    #ifdef USE_CHRONO
-    cerr << "Propagation times:";
-    for (int i=0; i<timer.size(); i++)
-      cerr << " " << timer.get(i);
-    cerr << endl;
-    #endif
-    
+      cout << sent_log_probs[t][i] << "\t";
+    cout << endl;
+    for (int t=0; t<num_threads; t++)
+      log_likelihood[t] += sent_log_probs[t][i];
+  }
+
+  cerr << "Test log10-likelihood: ";
+  for (int t=0; t<num_threads; t++)
+    cerr << log_likelihood[t] << " ";
+  cerr << endl;
+#ifdef USE_CHRONO
+  cerr << "Propagation times:";
+  for (int i=0; i<timer.size(); i++)
+    cerr << " " << timer.get(i);
+  cerr << endl;
+#endif
+
 }
diff --git a/src/trainNeuralNetwork.cpp b/src/trainNeuralNetwork.cpp
index 63ee27d..d4720ef 100644
--- a/src/trainNeuralNetwork.cpp
+++ b/src/trainNeuralNetwork.cpp
@@ -6,17 +6,16 @@
 #include <vector>
 #include <algorithm>
 
-#include <boost/unordered_map.hpp> 
+#include <boost/unordered_map.hpp>
 #include <boost/functional.hpp>
 #include <boost/lexical_cast.hpp>
 #include <boost/random/mersenne_twister.hpp>
 #include <boost/algorithm/string/join.hpp>
-# include <boost/interprocess/managed_shared_memory.hpp>
-# include <boost/interprocess/allocators/allocator.hpp>
-# include <boost/interprocess/managed_mapped_file.hpp>
+#include <boost/interprocess/managed_shared_memory.hpp>
+#include <boost/interprocess/allocators/allocator.hpp>
+#include <boost/interprocess/managed_mapped_file.hpp>
 #include <boost/interprocess/containers/vector.hpp>
 
-//#include <../3rdparty/Eigen/Dense>
 #include <Eigen/Dense>
 #include <Eigen/Sparse>
 #include "maybe_omp.h"
@@ -29,7 +28,6 @@
 #include "graphClasses.h"
 #include "util.h"
 #include "multinomial.h"
-//#include "gradientCheck.h"
 
 //#define EIGEN_DONT_PARALLELIZE
 
@@ -65,7 +63,7 @@ void compute_validation_perplexity(int ngram_size, int output_vocab_size, int va
         int validation_minibatch_start_index = validation_minibatch_size * validation_batch;
         int current_minibatch_size = min(validation_minibatch_size,
                                           validation_data_size - validation_minibatch_start_index);
-        minibatch.leftCols(current_minibatch_size) = validation_data.middleCols(validation_minibatch_start_index, 
+        minibatch.leftCols(current_minibatch_size) = validation_data.middleCols(validation_minibatch_start_index,
                                                                                 current_minibatch_size);
         prop_validation.fProp(minibatch.topRows(ngram_size-1));
 
@@ -80,7 +78,7 @@ void compute_validation_perplexity(int ngram_size, int output_vocab_size, int va
         // And softmax and loss. Be careful of short minibatch
         double minibatch_log_likelihood;
         start_timer(5);
-        SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size), 
+        SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size),
                                 minibatch.row(ngram_size-1),
                                 output_probs,
                                 minibatch_log_likelihood);
@@ -93,7 +91,7 @@ void compute_validation_perplexity(int ngram_size, int output_vocab_size, int va
 
     // If the validation perplexity decreases, halve the learning rate.
     if (current_validation_ll != 0.0 && log_likelihood < current_validation_ll && myParam.parameter_update != "ADA")
-    { 
+    {
         current_learning_rate /= 2;
     }
     current_validation_ll = log_likelihood;
@@ -101,7 +99,7 @@ void compute_validation_perplexity(int ngram_size, int output_vocab_size, int va
 
 
 int main(int argc, char** argv)
-{ 
+{
     ios::sync_with_stdio(false);
     bool use_mmap_file, randomize;
     param myParam;
@@ -183,7 +181,7 @@ int main(int argc, char** argv)
       myParam.input_words_file = input_words_file.getValue();
       myParam.output_words_file = output_words_file.getValue();
       if (words_file.getValue() != "")
-	      myParam.input_words_file = myParam.output_words_file = words_file.getValue();
+        myParam.input_words_file = myParam.output_words_file = words_file.getValue();
 
       myParam.model_prefix = model_prefix.getValue();
 
@@ -192,7 +190,7 @@ int main(int argc, char** argv)
       myParam.input_vocab_size = input_vocab_size.getValue();
       myParam.output_vocab_size = output_vocab_size.getValue();
       if (vocab_size.getValue() > 0) {
-	      myParam.input_vocab_size = myParam.output_vocab_size = vocab_size.getValue();
+        myParam.input_vocab_size = myParam.output_vocab_size = vocab_size.getValue();
       }
       myParam.num_hidden = num_hidden.getValue();
       myParam.activation_function = activation_function.getValue();
@@ -205,7 +203,7 @@ int main(int argc, char** argv)
       myParam.input_embedding_dimension = input_embedding_dimension.getValue();
       myParam.output_embedding_dimension = output_embedding_dimension.getValue();
       if (embedding_dimension.getValue() >= 0) {
-	      myParam.input_embedding_dimension = myParam.output_embedding_dimension = embedding_dimension.getValue();
+        myParam.input_embedding_dimension = myParam.output_embedding_dimension = embedding_dimension.getValue();
       }
 
       myParam.minibatch_size = minibatch_size.getValue();
@@ -243,33 +241,33 @@ int main(int argc, char** argv)
 
       if (embedding_dimension.getValue() >= 0)
       {
-	      cerr << embedding_dimension.getDescription() << sep << embedding_dimension.getValue() << endl;
+        cerr << embedding_dimension.getDescription() << sep << embedding_dimension.getValue() << endl;
       }
       else
       {
-	      cerr << input_embedding_dimension.getDescription() << sep << input_embedding_dimension.getValue() << endl;
-	      cerr << output_embedding_dimension.getDescription() << sep << output_embedding_dimension.getValue() << endl;
+        cerr << input_embedding_dimension.getDescription() << sep << input_embedding_dimension.getValue() << endl;
+        cerr << output_embedding_dimension.getDescription() << sep << output_embedding_dimension.getValue() << endl;
       }
       cerr << share_embeddings.getDescription() << sep << share_embeddings.getValue() << endl;
       if (share_embeddings.getValue() && input_embedding_dimension.getValue() != output_embedding_dimension.getValue())
       {
-	      cerr << "error: sharing input and output embeddings requires that input and output embeddings have same dimension" << endl;
-	      exit(1);
+        cerr << "error: sharing input and output embeddings requires that input and output embeddings have same dimension" << endl;
+        exit(1);
       }
 
       cerr << num_hidden.getDescription() << sep << num_hidden.getValue() << endl;
 
       if (string_to_activation_function(activation_function.getValue()) == InvalidFunction)
       {
-	      cerr << "error: invalid activation function: " << activation_function.getValue() << endl;
-	      exit(1);
+        cerr << "error: invalid activation function: " << activation_function.getValue() << endl;
+        exit(1);
       }
       cerr << activation_function.getDescription() << sep << activation_function.getValue() << endl;
 
       if (string_to_loss_function(loss_function.getValue()) == InvalidLoss)
       {
-	      cerr << "error: invalid loss function: " << loss_function.getValue() << endl;
-	      exit(1);
+        cerr << "error: invalid loss function: " << loss_function.getValue() << endl;
+        exit(1);
       }
       cerr << loss_function.getDescription() << sep << loss_function.getValue() << endl;
 
@@ -279,7 +277,7 @@ int main(int argc, char** argv)
       cerr << num_epochs.getDescription() << sep << num_epochs.getValue() << endl;
       cerr << minibatch_size.getDescription() << sep << minibatch_size.getValue() << endl;
       if (myParam.validation_file != "") {
-	     cerr << validation_minibatch_size.getDescription() << sep << validation_minibatch_size.getValue() << endl;
+       cerr << validation_minibatch_size.getDescription() << sep << validation_minibatch_size.getValue() << endl;
       }
       cerr << learning_rate.getDescription() << sep << learning_rate.getValue() << endl;
       cerr << L2_reg.getDescription() << sep << L2_reg.getValue() << endl;
@@ -288,7 +286,7 @@ int main(int argc, char** argv)
 
       cerr << normalization.getDescription() << sep << normalization.getValue() << endl;
       if (myParam.normalization){
-	      cerr << normalization_init.getDescription() << sep << normalization_init.getValue() << endl;
+        cerr << normalization_init.getDescription() << sep << normalization_init.getValue() << endl;
       }
 
       cerr << use_momentum.getDescription() << sep << use_momentum.getValue() << endl;
@@ -302,7 +300,7 @@ int main(int argc, char** argv)
 
       if (unigram_probs_file.getValue() != "")
       {
-	      cerr << "Note: --unigram_probs_file is deprecated and ignored." << endl;
+        cerr << "Note: --unigram_probs_file is deprecated and ignored." << endl;
       }
     }
     catch (TCLAP::ArgException &e)
@@ -337,7 +335,7 @@ int main(int argc, char** argv)
       training_data_flat_mmap = mmap_file.find<vec>("vector").first;
       cerr<<"Size of mmaped vector is "<<training_data_flat_mmap->size()<<endl;
       training_data_size = training_data_flat_mmap->size()/myParam.ngram_size;
-      //randomly shuffle the data for better learning. The shuffling will 
+      //randomly shuffle the data for better learning. The shuffling will
       //be different for a standard stl vector
       // Randomly shuffle training data to improve learning
       if (randomize == true) {
@@ -413,10 +411,10 @@ int main(int argc, char** argv)
     //cerr<<"Num tokens "<<num_tokens<<endl;
     //data_size_t training_data_size = num_tokens / myParam.ngram_size;
     cerr << "Number of training instances: "<< training_data_size << endl;
-    
+
     Matrix<int,Dynamic,Dynamic> training_data;
     //(training_data_flat.data(), myParam.ngram_size, training_data_size);
-    
+
     #ifdef MAP
     cerr<<"Setting up eigen map"<<endl;
     if (use_mmap_file == false) {
@@ -425,11 +423,11 @@ int main(int argc, char** argv)
       training_data = Map< Matrix<int,Dynamic,Dynamic> >(training_data_flat_mmap->data().get(), myParam.ngram_size, training_data_size);
     }
     cerr<<"Created eigen map"<<endl;
-    #else 
+    #else
     if (use_mmap_file == false) {
       training_data = Map< Matrix<int,Dynamic,Dynamic> >(training_data_flat.data(), myParam.ngram_size, training_data_size);
     }
-    #endif 
+    #endif
     // If neither --input_vocab_size nor --input_words_file is given, set input_vocab_size to the maximum word index
     if (myParam.input_vocab_size == 0 and myParam.input_words_file == "")
     {
@@ -454,7 +452,7 @@ int main(int argc, char** argv)
     // Read validation data
     vector<int> validation_data_flat;
     int validation_data_size = 0;
-    
+
     if (myParam.validation_file != "")
     {
       readDataFile(myParam.validation_file, myParam.ngram_size, validation_data_flat);
@@ -470,16 +468,16 @@ int main(int argc, char** argv)
     if (myParam.input_words_file != "")
     {
         readWordsFile(myParam.input_words_file, input_words);
-	if (myParam.input_vocab_size == 0)
-	    myParam.input_vocab_size = input_words.size();
+  if (myParam.input_vocab_size == 0)
+      myParam.input_vocab_size = input_words.size();
     }
 
     vector<string> output_words;
     if (myParam.output_words_file != "")
     {
         readWordsFile(myParam.output_words_file, output_words);
-	if (myParam.output_vocab_size == 0)
-	    myParam.output_vocab_size = output_words.size();
+  if (myParam.output_vocab_size == 0)
+      myParam.output_vocab_size = output_words.size();
     }
 
     ///// Construct unigram model and sampler that will be used for NCE
@@ -491,17 +489,17 @@ int main(int argc, char** argv)
         if (use_mmap_file == false) {
           output_word = training_data(myParam.ngram_size-1, train_id);
         } else {
-	      //cerr<<"mmap word is "<<training_data_flat_mmap->at((train_id+1)*myParam.ngram_size - 1)<<endl;
+        //cerr<<"mmap word is "<<training_data_flat_mmap->at((train_id+1)*myParam.ngram_size - 1)<<endl;
           output_word = training_data_flat_mmap->at((train_id+1)*myParam.ngram_size - 1);
         }
-		//cerr<<"output word is "<<output_word<<endl;
-	    unigram_counts[output_word] += 1;
+    //cerr<<"output word is "<<output_word<<endl;
+      unigram_counts[output_word] += 1;
     }
     multinomial<data_size_t> unigram (unigram_counts);
 
     ///// Create and initialize the neural network and associated propagators.
     model nn;
-    // IF THE MODEL FILE HAS BEEN DEFINED, THEN 
+    // IF THE MODEL FILE HAS BEEN DEFINED, THEN
     // LOAD THE NEURAL NETWORK MODEL
     if (myParam.model_file != ""){
       nn.read(myParam.model_file);
@@ -529,7 +527,7 @@ int main(int argc, char** argv)
     SoftmaxNCELoss<multinomial<data_size_t> > softmax_loss(unigram);
     // normalization parameters
     vector_map c_h, c_h_running_gradient;
-    
+
     ///////////////////////TRAINING THE NEURAL NETWORK////////////////////////////////////
     /////////////////////////////////////////////////////////////////////////////////////
 
@@ -540,8 +538,8 @@ int main(int argc, char** argv)
     if (validation_data_size > 0)
     {
         num_validation_batches = (validation_data_size-1)/myParam.validation_minibatch_size+1;
-	cerr<<"Number of validation minibatches: "<<num_validation_batches<<endl;
-    } 
+  cerr<<"Number of validation minibatches: "<<num_validation_batches<<endl;
+    }
 
     double current_momentum = myParam.initial_momentum;
     double momentum_delta = (myParam.final_momentum - myParam.initial_momentum)/(myParam.num_epochs-1);
@@ -568,36 +566,36 @@ int main(int argc, char** argv)
     }
 
     for (int epoch=0; epoch<myParam.num_epochs; epoch++)
-    { 
+    {
         cerr << "Epoch " << epoch+1 << endl;
         cerr << "Current learning rate: " << current_learning_rate << endl;
 
-        if (myParam.use_momentum) 
-	    cerr << "Current momentum: " << current_momentum << endl;
-	else
+        if (myParam.use_momentum)
+      cerr << "Current momentum: " << current_momentum << endl;
+  else
             current_momentum = -1;
 
-	cerr << "Training minibatches: ";
+  cerr << "Training minibatches: ";
 
-	double log_likelihood = 0.0;
+  double log_likelihood = 0.0;
 
-	int num_samples = 0;
-	if (loss_function == LogLoss)
-	    num_samples = output_vocab_size;
-	else if (loss_function == NCELoss)
-	    num_samples = 1+num_noise_samples;
+  int num_samples = 0;
+  if (loss_function == LogLoss)
+      num_samples = output_vocab_size;
+  else if (loss_function == NCELoss)
+      num_samples = 1+num_noise_samples;
 
-	Matrix<double,Dynamic,Dynamic> minibatch_weights(num_samples, minibatch_size);
-	Matrix<int,Dynamic,Dynamic> minibatch_samples(num_samples, minibatch_size);
-	Matrix<double,Dynamic,Dynamic> scores(num_samples, minibatch_size);
-	Matrix<double,Dynamic,Dynamic> probs(num_samples, minibatch_size);
+  Matrix<double,Dynamic,Dynamic> minibatch_weights(num_samples, minibatch_size);
+  Matrix<int,Dynamic,Dynamic> minibatch_samples(num_samples, minibatch_size);
+  Matrix<double,Dynamic,Dynamic> scores(num_samples, minibatch_size);
+  Matrix<double,Dynamic,Dynamic> probs(num_samples, minibatch_size);
 
         for(data_size_t batch=0;batch<num_batches;batch++)
         {
             if (batch > 0 && batch % 10000 == 0)
             {
-	        cerr << batch <<"...";
-            } 
+          cerr << batch <<"...";
+            }
 
             if (batch > 0 && batch % 500000 == 0)
             {
@@ -605,31 +603,31 @@ int main(int argc, char** argv)
                 compute_validation_perplexity(ngram_size, output_vocab_size, validation_minibatch_size, validation_data_size, num_validation_batches, myParam, prop_validation, validation_data, current_learning_rate, current_validation_ll);
                 cerr << "Current learning rate: " << current_learning_rate << endl;
             }
-   
+
             data_size_t minibatch_start_index = minibatch_size * batch;
 
       int current_minibatch_size = min(static_cast<data_size_t>(minibatch_size), training_data_size - minibatch_start_index);
       #ifdef MAP
-	    Matrix<int,Dynamic,Dynamic> minibatch = training_data.middleCols(minibatch_start_index, current_minibatch_size);
-      #else 
+      Matrix<int,Dynamic,Dynamic> minibatch = training_data.middleCols(minibatch_start_index, current_minibatch_size);
+      #else
       //ALTERNATIVE OPTION IF YOU'RE NOT USING eigen map interface on the mmapped file
-	    Matrix<int,Dynamic,Dynamic> minibatch;// = training_data.middleCols(minibatch_start_index, current_minibatch_size);
-		//cerr<<"Minibatch start index "<<minibatch_start_index<<endl;
-		//cerr<<"Minibatch size "<<current_minibatch_size<<endl;
+      Matrix<int,Dynamic,Dynamic> minibatch;// = training_data.middleCols(minibatch_start_index, current_minibatch_size);
+    //cerr<<"Minibatch start index "<<minibatch_start_index<<endl;
+    //cerr<<"Minibatch size "<<current_minibatch_size<<endl;
             if (use_mmap_file == true) {
             minibatch.setZero(ngram_size,current_minibatch_size);
             //now reading the ngrams from the mmaped file
               for (int k=0; k<ngram_size; k++){
                 for (data_size_t index = 0 ; index<current_minibatch_size; index++) {
-				  data_size_t current_index = index + minibatch_start_index;
-				  //cerr<<"the value in the mmap file "<<index<<" "<<k<<" is "<<training_data_flat_mmap->at(current_index*ngram_size+k)<<endl;
+          data_size_t current_index = index + minibatch_start_index;
+          //cerr<<"the value in the mmap file "<<index<<" "<<k<<" is "<<training_data_flat_mmap->at(current_index*ngram_size+k)<<endl;
                   minibatch(k,index) = training_data_flat_mmap->at(current_index*ngram_size+k);
                 }
               }
             } else {
               minibatch = training_data.middleCols(minibatch_start_index, current_minibatch_size);
             }
-      #endif 
+      #endif
             double adjusted_learning_rate = current_learning_rate/minibatch_size;
             //cerr<<"Adjusted learning rate: "<<adjusted_learning_rate<<endl;
 
@@ -648,20 +646,20 @@ int main(int argc, char** argv)
 
             prop.fProp(minibatch.topRows(ngram_size-1));
 
-	    if (loss_function == NCELoss)
-	    {
-	      ///// Noise-contrastive estimation
+      if (loss_function == NCELoss)
+      {
+        ///// Noise-contrastive estimation
 
-	      // Generate noise samples. Gather positive and negative samples into matrix.
+        // Generate noise samples. Gather positive and negative samples into matrix.
 
-	      start_timer(3);
+        start_timer(3);
 
         minibatch_samples.block(0, 0, 1, current_minibatch_size) = minibatch.bottomRows(1);
-        
+
         for (int sample_id = 1; sample_id < num_noise_samples+1; sample_id++)
             for (int train_id = 0; train_id < current_minibatch_size; train_id++)
                 minibatch_samples(sample_id, train_id) = unigram.sample(rng);
-          
+
         stop_timer(3);
 
         // Final forward propagation step (sparse)
@@ -686,7 +684,7 @@ int main(int argc, char** argv)
 
         double minibatch_log_likelihood;
         start_timer(5);
-        softmax_loss.fProp(scores.leftCols(current_minibatch_size), 
+        softmax_loss.fProp(scores.leftCols(current_minibatch_size),
                minibatch_samples,
                probs, minibatch_log_likelihood);
         stop_timer(5);
@@ -697,9 +695,9 @@ int main(int argc, char** argv)
         start_timer(6);
         softmax_loss.bProp(probs, minibatch_weights);
         stop_timer(6);
-        
+
         // Update the normalization parameters
-        
+
         if (myParam.normalization)
         {
           for (int train_id = 0;train_id < current_minibatch_size;train_id++)
@@ -711,19 +709,19 @@ int main(int argc, char** argv)
 
         // Be careful of short minibatch
         prop.bProp(minibatch.topRows(ngram_size-1),
-             minibatch_samples.leftCols(current_minibatch_size), 
+             minibatch_samples.leftCols(current_minibatch_size),
              minibatch_weights.leftCols(current_minibatch_size),
-             adjusted_learning_rate, 
+             adjusted_learning_rate,
              current_momentum,
              myParam.L2_reg,
              myParam.parameter_update,
              myParam.conditioning_constant,
              myParam.decay);
-	    }
-	    else if (loss_function == LogLoss)
-	    {
-	      ///// Standard log-likelihood
-	      start_timer(4);
+      }
+      else if (loss_function == LogLoss)
+      {
+        ///// Standard log-likelihood
+        start_timer(4);
         if (prop.skip_hidden)
             prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores);
         else
@@ -732,21 +730,21 @@ int main(int argc, char** argv)
 
         double minibatch_log_likelihood;
         start_timer(5);
-        SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size), 
-                   minibatch.row(ngram_size-1), 
-                   probs, 
+        SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size),
+                   minibatch.row(ngram_size-1),
+                   probs,
                    minibatch_log_likelihood);
         stop_timer(5);
         log_likelihood += minibatch_log_likelihood;
 
         ///// Backward propagation
-        
+
         start_timer(6);
-        SoftmaxLogLoss().bProp(minibatch.row(ngram_size-1).leftCols(current_minibatch_size), 
-                   probs.leftCols(current_minibatch_size), 
+        SoftmaxLogLoss().bProp(minibatch.row(ngram_size-1).leftCols(current_minibatch_size),
+                   probs.leftCols(current_minibatch_size),
                    minibatch_weights);
         stop_timer(6);
-        
+
         prop.bProp(minibatch.topRows(ngram_size-1).leftCols(current_minibatch_size),
              minibatch_weights,
              adjusted_learning_rate,
@@ -757,33 +755,33 @@ int main(int argc, char** argv)
              myParam.decay);
           }
       }
-	cerr << "done." << endl;
+  cerr << "done." << endl;
 
-	if (loss_function == LogLoss)
-	{
-	    cerr << "Training log-likelihood: " << log_likelihood << endl;
+  if (loss_function == LogLoss)
+  {
+      cerr << "Training log-likelihood: " << log_likelihood << endl;
             cerr << "         perplexity:     "<< exp(-log_likelihood/training_data_size) << endl;
-	}
-	else if (loss_function == NCELoss)
-	    cerr << "Training NCE log-likelihood: " << log_likelihood << endl;
+  }
+  else if (loss_function == NCELoss)
+      cerr << "Training NCE log-likelihood: " << log_likelihood << endl;
 
         current_momentum += momentum_delta;
 
-	#ifdef USE_CHRONO
-	cerr << "Propagation times:";
-	for (int i=0; i<timer.size(); i++)
-	  cerr << " " << timer.get(i);
-	cerr << endl;
-	#endif
-
-	if (myParam.model_prefix != "")
-	{
-	    cerr << "Writing model" << endl;
-	    if (myParam.input_words_file != "")
-	        nn.write(myParam.model_prefix + "." + lexical_cast<string>(epoch+1), input_words, output_words);
-	    else
-	        nn.write(myParam.model_prefix + "." + lexical_cast<string>(epoch+1));
-	}
+  #ifdef USE_CHRONO
+  cerr << "Propagation times:";
+  for (int i=0; i<timer.size(); i++)
+    cerr << " " << timer.get(i);
+  cerr << endl;
+  #endif
+
+  if (myParam.model_prefix != "")
+  {
+      cerr << "Writing model" << endl;
+      if (myParam.input_words_file != "")
+          nn.write(myParam.model_prefix + "." + lexical_cast<string>(epoch+1), input_words, output_words);
+      else
+          nn.write(myParam.model_prefix + "." + lexical_cast<string>(epoch+1));
+  }
 
         if (epoch % 1 == 0 && validation_data_size > 0)
         {
@@ -793,4 +791,3 @@ int main(int argc, char** argv)
     }
     return 0;
 }
-
diff --git a/src/util.h b/src/util.h
index 3b5e6aa..6cbde9d 100644
--- a/src/util.h
+++ b/src/util.h
@@ -15,7 +15,6 @@
 #include <boost/chrono.hpp>
 #endif
 
-//#include <../3rdparty/Eigen/Dense>
 #include <Eigen/Dense>
 
 #include "maybe_omp.h"
@@ -23,15 +22,15 @@
 // Make matrices hashable
 
 namespace Eigen {
-    template <typename Derived>
-    size_t hash_value(const DenseBase<Derived> &m)
-    {
-        size_t h=0;
-	for (int i=0; i<m.rows(); i++)
-	    for (int j=0; j<m.cols(); j++)
-	        boost::hash_combine(h, m(i,j));
-	return h;
-    }
+template <typename Derived>
+size_t hash_value(const DenseBase<Derived> &m)
+{
+  size_t h=0;
+  for (int i=0; i<m.rows(); i++)
+    for (int j=0; j<m.cols(); j++)
+      boost::hash_combine(h, m(i,j));
+  return h;
+}
 }
 
 namespace nplm
@@ -73,9 +72,9 @@ void readSentFile(const std::string &file, T &sentences)
 }
 
 inline void intgerize(std::vector<std::string> &ngram,std::vector<int> &int_ngram){
-        int ngram_size = ngram.size();
-        for (int i=0;i<ngram_size;i++)
-        int_ngram.push_back(boost::lexical_cast<int>(ngram[i]));
+  int ngram_size = ngram.size();
+  for (int i=0;i<ngram_size;i++)
+    int_ngram.push_back(boost::lexical_cast<int>(ngram[i]));
 }
 
 // Functions that take non-const matrices as arguments
@@ -85,186 +84,186 @@ inline void intgerize(std::vector<std::string> &ngram,std::vector<int> &int_ngra
 
 template <typename Derived>
 void initMatrix(boost::random::mt19937 &engine,
-		const Eigen::MatrixBase<Derived> &p_const,
-		bool init_normal, double range)
+                const Eigen::MatrixBase<Derived> &p_const,
+                bool init_normal, double range)
 {
-    UNCONST(Derived, p_const, p);
-    if (init_normal == 0)
-     // initialize with uniform distribution in [-range, range]
+  UNCONST(Derived, p_const, p);
+  if (init_normal == 0)
+    // initialize with uniform distribution in [-range, range]
+  {
+    boost::random::uniform_real_distribution<> unif_real(-range, range);
+    for (int i = 0; i < p.rows(); i++)
     {
-        boost::random::uniform_real_distribution<> unif_real(-range, range); 
-        for (int i = 0; i < p.rows(); i++)
-        {
-            for (int j = 0; j< p.cols(); j++)
-            {
-                p(i,j) = unif_real(engine);    
-            }
-        }
-
+      for (int j = 0; j< p.cols(); j++)
+      {
+        p(i,j) = unif_real(engine);
+      }
     }
-    else 
-      // initialize with gaussian distribution with mean 0 and stdev range
+
+  }
+  else
+    // initialize with gaussian distribution with mean 0 and stdev range
+  {
+    boost::random::normal_distribution<double> unif_normal(0., range);
+    for (int i = 0; i < p.rows(); i++)
     {
-        boost::random::normal_distribution<double> unif_normal(0., range);
-        for (int i = 0; i < p.rows(); i++)
-        {
-            for (int j = 0; j < p.cols(); j++)
-            {
-                p(i,j) = unif_normal(engine);    
-            }
-        }
+      for (int j = 0; j < p.cols(); j++)
+      {
+        p(i,j) = unif_normal(engine);
+      }
     }
+  }
 }
 
 template <typename Derived>
 void initBias(boost::random::mt19937 &engine,
-		const Eigen::MatrixBase<Derived> &p_const,
-		bool init_normal, double range)
+              const Eigen::MatrixBase<Derived> &p_const,
+              bool init_normal, double range)
 {
-    UNCONST(Derived, p_const, p);
-    if (init_normal == 0)
-     // initialize with uniform distribution in [-range, range]
+  UNCONST(Derived, p_const, p);
+  if (init_normal == 0)
+    // initialize with uniform distribution in [-range, range]
+  {
+    boost::random::uniform_real_distribution<> unif_real(-range, range);
+    for (int i = 0; i < p.size(); i++)
     {
-        boost::random::uniform_real_distribution<> unif_real(-range, range); 
-        for (int i = 0; i < p.size(); i++)
-        {
-            p(i) = unif_real(engine);    
-        }
-
+      p(i) = unif_real(engine);
     }
-    else 
-      // initialize with gaussian distribution with mean 0 and stdev range
+
+  }
+  else
+    // initialize with gaussian distribution with mean 0 and stdev range
+  {
+    boost::random::normal_distribution<double> unif_normal(0., range);
+    for (int i = 0; i < p.size(); i++)
     {
-        boost::random::normal_distribution<double> unif_normal(0., range);
-        for (int i = 0; i < p.size(); i++)
-        {
-            p(i) = unif_normal(engine);    
-        }
+      p(i) = unif_normal(engine);
     }
+  }
 }
 
 
 template <typename Derived>
 void readMatrix(std::ifstream &TRAININ, Eigen::MatrixBase<Derived> &param_const)
 {
-    UNCONST(Derived, param_const, param);
+  UNCONST(Derived, param_const, param);
+
+  int i = 0;
+  std::string line;
+  std::vector<std::string> fields;
+
+  while (std::getline(TRAININ, line) && line != "")
+  {
+    splitBySpace(line, fields);
+    if (fields.size() != param.cols())
+    {
+      std::ostringstream err;
+      err << "error: wrong number of columns (expected " << param.cols() << ", found " << fields.size() << ")";
+      throw std::runtime_error(err.str());
+    }
 
-    int i = 0;
-    std::string line;
-    std::vector<std::string> fields;
-    
-    while (std::getline(TRAININ, line) && line != "")
+    if (i >= param.rows())
     {
-        splitBySpace(line, fields);
-	if (fields.size() != param.cols())
-	{
-	    std::ostringstream err;
-	    err << "error: wrong number of columns (expected " << param.cols() << ", found " << fields.size() << ")";
-	    throw std::runtime_error(err.str());
-	}
-	
-	if (i >= param.rows())
-	{
-	    std::ostringstream err;
-	    err << "error: wrong number of rows (expected " << param.rows() << ", found " << i << ")";
-	    throw std::runtime_error(err.str());
-	}
-	
-	for (int j=0; j<fields.size(); j++)
-	{
-	    param(i,j) = boost::lexical_cast<typename Derived::Scalar>(fields[j]);
-	}
-	i++;
+      std::ostringstream err;
+      err << "error: wrong number of rows (expected " << param.rows() << ", found " << i << ")";
+      throw std::runtime_error(err.str());
     }
-    
-    if (i != param.rows())
+
+    for (int j=0; j<fields.size(); j++)
     {
-        std::ostringstream err;
-	err << "error: wrong number of rows (expected " << param.rows() << ", found more)";
-	throw std::runtime_error(err.str());
+      param(i,j) = boost::lexical_cast<typename Derived::Scalar>(fields[j]);
     }
+    i++;
+  }
+
+  if (i != param.rows())
+  {
+    std::ostringstream err;
+    err << "error: wrong number of rows (expected " << param.rows() << ", found more)";
+    throw std::runtime_error(err.str());
+  }
 }
 
 template <typename Derived>
 void readMatrix(const std::string &param_file, const Eigen::MatrixBase<Derived> &param_const)
 {
-    UNCONST(Derived, param_const, param);
-    std::cerr << "Reading data from file: " << param_file << std::endl;
-    
-    std::ifstream TRAININ(param_file.c_str());
-    if (!TRAININ)
-    {
-        std::cerr << "Error: can't read training data from file " << param_file << std::endl;
-	exit(-1);
-    }
-    readMatrix(TRAININ, param);
-    TRAININ.close();
+  UNCONST(Derived, param_const, param);
+  std::cerr << "Reading data from file: " << param_file << std::endl;
+
+  std::ifstream TRAININ(param_file.c_str());
+  if (!TRAININ)
+  {
+    std::cerr << "Error: can't read training data from file " << param_file << std::endl;
+    exit(-1);
+  }
+  readMatrix(TRAININ, param);
+  TRAININ.close();
 }
 
 template <typename Derived>
 void writeMatrix(const Eigen::MatrixBase<Derived> &param, const std::string &filename)
 {
-    std::cerr << "Writing parameters to " << filename << std::endl;
+  std::cerr << "Writing parameters to " << filename << std::endl;
 
-    std::ofstream OUT;
-    OUT.precision(16);
-    OUT.open(filename.c_str());
-    if (! OUT)
-    {
-      std::cerr << "Error: can't write to file " << filename<< std::endl;
-      exit(-1);
-    }
-    writeMatrix(param, OUT);
-    OUT.close();
+  std::ofstream OUT;
+  OUT.precision(16);
+  OUT.open(filename.c_str());
+  if (! OUT)
+  {
+    std::cerr << "Error: can't write to file " << filename<< std::endl;
+    exit(-1);
+  }
+  writeMatrix(param, OUT);
+  OUT.close();
 }
 
 template <typename Derived>
 void writeMatrix(const Eigen::MatrixBase<Derived> &param, std::ofstream &OUT)
 {
-    for (int row = 0;row < param.rows();row++)
+  for (int row = 0;row < param.rows();row++)
+  {
+    int col;
+    for (col = 0;col < param.cols()-1;col++)
     {
-        int col;
-        for (col = 0;col < param.cols()-1;col++)
-        {
-            OUT<<param(row,col)<<"\t";
-        }
-        //dont want an extra tab at the end
-        OUT<<param(row,col)<<std::endl;
+      OUT<<param(row,col)<<"\t";
     }
+    //dont want an extra tab at the end
+    OUT<<param(row,col)<<std::endl;
+  }
 }
 
 template <typename Derived>
 double logsum(const Eigen::MatrixBase<Derived> &v)
 {
-    int mi; 
-    double m = v.maxCoeff(&mi);
-    double logz = 0.0;
-    for (int i=0; i<v.rows(); i++)
-        if (i != mi)
-	    logz += std::exp(v(i) - m);
-    logz = log1p(logz) + m;
-    return logz;
+  int mi;
+  double m = v.maxCoeff(&mi);
+  double logz = 0.0;
+  for (int i=0; i<v.rows(); i++)
+    if (i != mi)
+      logz += std::exp(v(i) - m);
+  logz = log1p(logz) + m;
+  return logz;
 }
 
 double logadd(double x, double y);
 
 #ifdef USE_CHRONO
-class Timer 
+class Timer
 {
-    typedef boost::chrono::high_resolution_clock clock_type;
-    typedef clock_type::time_point time_type;
-    typedef clock_type::duration duration_type;
-    std::vector<time_type> m_start;
-    std::vector<duration_type> m_total;
-public:
-    Timer() { }
-    Timer(int n) { resize(n); }
-    void resize(int n) { m_start.resize(n); m_total.resize(n); }
-    int size() const { return m_start.size(); }
-    void start(int i);
-    void stop(int i);
-    void reset(int i);
-    double get(int i) const;
+  typedef boost::chrono::high_resolution_clock clock_type;
+  typedef clock_type::time_point time_type;
+  typedef clock_type::duration duration_type;
+  std::vector<time_type> m_start;
+  std::vector<duration_type> m_total;
+ public:
+  Timer() { }
+  Timer(int n) { resize(n); }
+  void resize(int n) { m_start.resize(n); m_total.resize(n); }
+  int size() const { return m_start.size(); }
+  void start(int i);
+  void stop(int i);
+  void reset(int i);
+  double get(int i) const;
 };
 
 extern Timer timer;
author	graehl <graehl@gmail.com>	2015-06-25 09:22:21 +0300
committer	graehl <graehl@gmail.com>	2015-06-25 09:25:32 +0300
commit	37e397f526fc207dea498356e890ad085a733ae8 (patch)
tree	cfea74b92cc4d38aaff06a26c76fdba7594abd69
parent	50308d573b90ff2814bd346210fc6929bd9b40af (diff)