Copy nplm-0.1 after removing some executable bits

author: Kenneth Heafield <github@kheafield.com> 2013-10-29 22:00:37 +0400
committer: Kenneth Heafield <github@kheafield.com> 2013-10-29 22:00:37 +0400
commit: 78eecfdd7ef4cc0aef575c828c6fef747c63da19 (patch)
tree: cbd1e84c871306a35e1352286f7749ccac4f60bc /src
parent: e4138ba17732e70bfe9ad8e806173c083a9ddd0e (diff)
25 files changed, 4390 insertions, 0 deletions
diff --git a/src/Activation_function.h b/src/Activation_function.h
new file mode 100644
index 0000000..eacba14
--- /dev/null
+++ b/src/Activation_function.h
@@ -0,0 +1,119 @@
+#ifndef ACTIVATION_FUNCTION_H
+#define ACTIVATION_FUNCTION_H
+
+#include <cmath>
+#include <string>
+#include <Eigen/Dense>
+
+#include "util.h"
+
+namespace nplm
+{
+
+// is this cheating?
+using Eigen::Matrix;
+using Eigen::MatrixBase;
+
+enum activation_function_type { Tanh, HardTanh, Rectifier, Identity, InvalidFunction };
+
+/// Map a configuration name ("identity", "rectifier", "tanh", "hardtanh")
+/// to its activation_function_type. Any other string yields InvalidFunction.
+inline activation_function_type string_to_activation_function (const std::string &s)
+{
+    activation_function_type parsed = InvalidFunction;
+
+    if (s == "identity")
+        parsed = Identity;
+    else if (s == "rectifier")
+        parsed = Rectifier;
+    else if (s == "tanh")
+        parsed = Tanh;
+    else if (s == "hardtanh")
+        parsed = HardTanh;
+
+    return parsed;
+}
+
+/// Return the configuration name of an activation function.
+///
+/// The four real functions map to "identity", "rectifier", "tanh" and
+/// "hardtanh". InvalidFunction (or any out-of-range value) maps to
+/// "invalid", which string_to_activation_function parses back to
+/// InvalidFunction. Previously control fell off the end of the function
+/// for such values, which is undefined behaviour.
+inline std::string activation_function_to_string (activation_function_type f)
+{
+    switch (f)
+    {
+    case Identity:  return "identity";
+    case Rectifier: return "rectifier";
+    case Tanh:      return "tanh";
+    case HardTanh:  return "hardtanh";
+    default:        break;
+    }
+    return "invalid";
+}
+
+/// Hard tanh: clamps x to the interval [-1, 1].
+struct hardtanh_functor {
+  double operator() (double x) const
+  {
+    return (x < -1.) ? -1. : ((x > 1.) ? 1. : x);
+  }
+};
+
+/// Derivative of hard tanh with respect to its input x:
+/// 1 strictly inside (-1, 1), 0 elsewhere (including the end points).
+struct dhardtanh_functor {
+  double operator() (double x) const
+  {
+    const bool strictly_inside = (x > -1.) && (x < 1.);
+    return strictly_inside ? 1. : 0.;
+  }
+};
+
+/// Hyperbolic tangent of x.
+struct tanh_functor {
+  double operator() (double x) const
+  {
+    const double y = std::tanh(x);
+    return y;
+  }
+};
+
+/// Computes 1 - x^2. This equals the derivative of tanh when x is the
+/// *output* of tanh (y = tanh(u)  =>  dy/du = 1 - y^2).
+struct dtanh_functor {
+  double operator() (double x) const
+  {
+    const double squared = x*x;
+    return 1-squared;
+  }
+};
+
+/// Rectifier (ReLU): max(x, 0).
+struct rectifier_functor {
+  double operator() (double x) const
+  {
+    const double zero = 0.;
+    return std::max(x, zero);
+  }
+};
+
+/// Derivative of the rectifier with respect to its input x:
+/// 1 for x > 0, otherwise 0.
+struct drectifier_functor {
+  double operator() (double x) const
+  {
+    if (x > 0.)
+      return 1.;
+    return 0.;
+  }
+};
+
+/// Element-wise activation layer.
+///
+/// The layer has no trainable parameters; it applies the selected
+/// activation function to every entry of its input (fProp) and multiplies
+/// incoming gradients by the function's derivative (bProp). Input and
+/// output have the same number of rows (`size`).
+class Activation_function
+{
+    private:
+        int size;
+        activation_function_type f;
+
+    public:
+        Activation_function() : size(0), f(Rectifier) { }
+
+        void resize(int size) { this->size = size; }
+        void set_activation_function(activation_function_type f) { this->f = f; }
+
+        // Nothing to initialize: the layer has no parameters. The signature
+        // matches the other layers' initialize() so callers can treat all
+        // layers uniformly.
+        template <typename Engine>
+        void initialize(Engine &engine, bool init_normal, double init_range) { }
+
+        int n_inputs () const { return size; }
+        int n_outputs () const { return size; }
+
+        /// Forward pass: output = f(input), element-wise.
+        /// `output` is written through UNCONST even though it is passed as
+        /// a const reference. For InvalidFunction, output is left untouched.
+        template <typename DerivedIn, typename DerivedOut>
+        void fProp(const MatrixBase<DerivedIn> &input, const MatrixBase<DerivedOut> &output) const
+        {
+            UNCONST(DerivedOut, output, my_output);
+
+            switch (f)
+            {
+            case Identity: my_output = input; break;
+            case Rectifier: my_output = input.unaryExpr(rectifier_functor()); break;
+            case Tanh: my_output = input.unaryExpr(tanh_functor()); break;
+            case HardTanh: my_output = input.unaryExpr(hardtanh_functor()); break;
+            case InvalidFunction: break;
+            }
+        }
+
+        /// Backward pass: output = f'(.) * input, element-wise, where
+        ///   input   is the gradient arriving from the layer above,
+        ///   finput  is the value that was fed to fProp,
+        ///   foutput is the value that fProp produced.
+        /// For InvalidFunction, output is left untouched.
+        template <typename DerivedGOut, typename DerivedGIn, typename DerivedIn, typename DerivedOut>
+        void bProp(const MatrixBase<DerivedGOut> &input, MatrixBase<DerivedGIn> &output,
+                   const MatrixBase<DerivedIn> &finput, const MatrixBase<DerivedOut> &foutput) const
+        {
+            UNCONST(DerivedGIn, output, my_output);
+
+            switch (f)
+            {
+            case Identity: my_output = input; break;
+            case Rectifier: my_output = finput.array().unaryExpr(drectifier_functor()) * input.array(); break;
+            // The derivative functors must be used here, not the forward
+            // functions. dtanh_functor computes 1 - y^2 and is therefore
+            // applied to the forward *output* y = tanh(x).
+            case Tanh: my_output = foutput.array().unaryExpr(dtanh_functor()) * input.array(); break;
+            // dhardtanh_functor is a function of the forward *input* x.
+            case HardTanh: my_output = finput.array().unaryExpr(dhardtanh_functor()) * input.array(); break;
+            case InvalidFunction: break;
+            }
+        }
+};
+
+} // namespace nplm
+
+#endif
diff --git a/src/Makefile b/src/Makefile
new file mode 100644
index 0000000..3b34fe9
--- /dev/null
+++ b/src/Makefile
@@ -0,0 +1,175 @@
+### Compilation options.
+
+# C++ compiler. Tested with g++ and Intel icpc.
+# The OpenMP and MKL sections further down pick their flags by looking for
+# the substrings "g++" or "icpc" in $(CXX).
+CXX=g++
+#CXX=icpc
+
+# Compiler options. Note that -DEIGEN_NO_DEBUG is essential for good performance!
+#CFLAGS=-g
+CFLAGS=-O3 -DEIGEN_NO_DEBUG -DNDEBUG 
+
+# Architecture. Set to x86_64 or i686 to override.
+# (Only consulted when choosing the MKL library directory.)
+ARCH:=$(shell uname -m)
+# Operating system. Set to override (the only option that makes any difference is Darwin).
+OS:=$(shell uname -s)
+
+# To build static binaries, uncomment the line below:
+#STATIC=1
+
+### Required libraries. You must install these prior to building.
+
+# Set this to the root directory of Boost (should have a subdirectory named boost):
+# NOTE(review): the active default looks site-specific; adjust for your system.
+BOOST=/usr/usc/boost/1.51.0
+#BOOST=/usr
+#BOOST=/opt/local
+# Where to find Boost header files
+BOOST_INC=$(BOOST)/include
+
+# Set this to the root directory of Eigen (should have a subdirectory named Eigen):
+EIGEN=../3rdparty/eigen
+
+### Optional libraries.
+
+# To disable multithreading, comment out the line below:
+OMP=1
+
+# To use the MKL library, uncomment the line below and set it to the MKL root:
+# NOTE: MKL is set as shipped, so the MKL flags are enabled unless this line
+# is commented out.
+MKL=/usr/usc/intel/12.1.1/mkl
+
+# For Python bindings, set the following and run 'make python/nplm.so'.
+PYTHON_VERSION=2.7
+#PYTHON_ROOT=/opt/local/Library/Frameworks/Python.framework/Versions/$(PYTHON_VERSION)
+PYTHON_ROOT=/home/nlg-01/chiangd/pkg64/python
+CYTHON=$(PYTHON_ROOT)/bin/cython
+
+##### End of configurable options #####
+
+# used for profiling
+#USE_CHRONO=1
+
+# Header directory of the bundled TCLAP library (added to the include path below).
+TCLAP=../3rdparty/tclap/include
+
+# Currently, this is needed only if USE_CHRONO is defined:
+# Where to find Boost libraries
+BOOST_LIB=$(BOOST)/lib
+# On some systems, a suffix is appended for the multithreaded version.
+BOOST_LIB_SUFFIX=
+#BOOST_LIB_SUFFIX=-mt
+
+# Boost is header-only here unless USE_CHRONO is set, in which case
+# boost_system and boost_chrono are linked and the library path/rpath added.
+BOOST_CFLAGS=-I$(BOOST_INC)
+BOOST_LDFLAGS=
+ifdef USE_CHRONO
+  BOOST_CFLAGS+=-DUSE_CHRONO
+  BOOST_LDLIBS+=-lboost_system$(BOOST_LIB_SUFFIX) -lboost_chrono$(BOOST_LIB_SUFFIX)
+endif
+ifdef BOOST_LDLIBS
+  BOOST_LDFLAGS+=-L$(BOOST_LIB) -Wl,-rpath -Wl,$(BOOST_LIB)
+endif
+
+# OpenMP flags depend on the compiler; if $(CXX) matches neither name,
+# no OpenMP flags are set even when OMP is defined.
+ifdef OMP
+  ifneq (,$(findstring g++,$(CXX)))
+    OMP_CFLAGS=-fopenmp
+    OMP_LDFLAGS=-fopenmp
+  endif
+  ifneq (,$(findstring icpc,$(CXX)))
+    OMP_CFLAGS=-openmp
+    OMP_LDFLAGS=-openmp
+  endif
+endif
+
+# MKL: the interface library and library directory are chosen by ARCH, the
+# threading library by compiler; everything is wrapped in a linker group.
+ifdef MKL
+  MKL_CFLAGS=-I$(MKL)/include -DEIGEN_USE_MKL_ALL
+  MKL_LDLIBS=-Wl,--start-group
+  ifeq ($(ARCH),x86_64)
+    MKL_LDFLAGS=-L$(MKL)/lib/intel64 -Wl,-rpath -Wl,$(MKL)/lib/intel64
+    MKL_LDLIBS+=-lmkl_intel_lp64
+  endif
+  ifeq ($(ARCH),i686)
+    MKL_LDFLAGS=-L$(MKL)/lib/ia32 -Wl,-rpath -Wl,$(MKL)/lib/ia32
+    MKL_LDLIBS+=-lmkl_intel
+  endif
+
+  ifneq (,$(findstring g++,$(CXX)))
+    MKL_LDLIBS+=-lmkl_gnu_thread
+  endif
+  ifneq (,$(findstring icpc,$(CXX)))
+    MKL_LDLIBS+=-lmkl_intel_thread
+  endif
+
+  #MKL_LDLIBS=-lmkl_rt
+  MKL_LDLIBS+=-lmkl_core -Wl,--end-group
+endif
+
+ifdef STATIC
+  LDFLAGS+=-static
+endif
+
+# Final flag sets used by the rules below. User CFLAGS/LDFLAGS come last.
+ALL_CFLAGS=$(OMP_CFLAGS) $(MKL_CFLAGS) $(BOOST_CFLAGS) -I$(TCLAP) -I$(EIGEN) $(CFLAGS)
+ALL_LDFLAGS=$(OMP_LDFLAGS) $(MKL_LDFLAGS) $(BOOST_LDFLAGS) $(LDFLAGS)
+ALL_LDLIBS=$(MKL_LDLIBS) $(BOOST_LDLIBS)
+
+# Extra flags used only for the Python extension (python/nplm.o, python/nplm.so).
+PYTHON_CFLAGS+=-I$(PYTHON_ROOT)/include/python$(PYTHON_VERSION)
+ifeq ($(OS),Darwin)
+  # avoid having to link in libpython
+  PYTHON_LDFLAGS+=-undefined dynamic_lookup
+endif
+
+# Some other programs
+
+AR=ar
+RANLIB=ranlib
+
+# Rules
+
+# BINS: executables; LIBS: static and shared library; OBJS: objects linked
+# into every binary and library.
+BINS=trainNeuralNetwork testNeuralNetwork prepareNeuralLM testNeuralLM prepareNeuralTM
+LIBS=neuralLM.a neuralLM.so
+OBJS=util.o model.o
+
+all: $(BINS) $(LIBS)
+
+# These targets name actions, not files. Declaring them phony keeps a stray
+# file called "all", "clean" or "install" from masking them.
+.PHONY: all clean install
+
+# The generated Python files are listed explicitly instead of using
+# {cpp,so} brace expansion: recipes run under /bin/sh by default, and a
+# POSIX sh does not expand braces, so the files would silently be left behind.
+clean:
+	rm -f *.o shared/*.o python/*.o $(BINS) $(LIBS) python/nplm.cpp python/nplm.so
+
+# Copy the binaries to ../bin and the libraries to ../lib.
+install: all
+	mkdir -p ../bin
+	cp $(BINS) ../bin
+	mkdir -p ../lib
+	cp $(LIBS) ../lib
+
+# Regular object files.
+%.o: %.cpp
+	$(CXX) -c $(ALL_CFLAGS) $< -o $@
+
+# Position-independent object files for the shared libraries.
+shared/%.o: %.cpp
+	$(CXX) -c -fPIC $(ALL_CFLAGS) $< -o $@
+
+# Every executable is linked from its own object file plus the common objects.
+$(BINS): %: %.o $(OBJS)
+	$(CXX) $(ALL_LDFLAGS) $^ $(ALL_LDLIBS) -o $@
+
+# Static library: rebuilt from scratch each time.
+neuralLM.a: neuralLM.o $(OBJS)
+	rm -f $@
+	$(AR) rv $@ $^
+	$(RANLIB) $@
+
+# Shared library, built from the -fPIC objects.
+neuralLM.so: $(addprefix shared/,neuralLM.o $(OBJS))
+	$(CXX) -shared $(ALL_LDFLAGS) $^ $(ALL_LDLIBS) -o $@
+
+# Python bindings: Cython source -> C++ -> object -> extension module.
+python/nplm.cpp: python/nplm.pyx
+	$(CYTHON) --cplus $<
+
+python/nplm.o: python/nplm.cpp
+	$(CXX) -c -fPIC -I. $(ALL_CFLAGS) $(PYTHON_CFLAGS) $< -o $@
+
+python/nplm.so: python/nplm.o $(addprefix shared/,neuralLM.o $(OBJS))
+	$(CXX) -shared $(ALL_LDFLAGS) $(PYTHON_LDFLAGS) $^ $(ALL_LDLIBS) $(PYTHON_LDLIBS) -o $@
diff --git a/src/SoftmaxLoss.h b/src/SoftmaxLoss.h
new file mode 100644
index 0000000..77d94ca
--- /dev/null
+++ b/src/SoftmaxLoss.h
@@ -0,0 +1,136 @@
+#ifndef SOFTMAXLOSS_H
+#define SOFTMAXLOSS_H
+
+#include <Eigen/Dense>
+#include "multinomial.h"
+#include "util.h"
+
+namespace nplm
+{
+
+// is this cheating?
+using Eigen::Matrix;
+using Eigen::MatrixBase;
+using Eigen::Dynamic;
+
+///// Softmax layer plus log-loss function.
+
+enum loss_function_type { LogLoss, NCELoss, InvalidLoss };
+
+/// Map a configuration name ("log" or "nce") to its loss_function_type.
+/// Any other string yields InvalidLoss.
+inline loss_function_type string_to_loss_function (const std::string &s)
+{
+    loss_function_type parsed = InvalidLoss;
+
+    if (s == "log")
+        parsed = LogLoss;
+    else if (s == "nce")
+        parsed = NCELoss;
+
+    return parsed;
+}
+
+/// Return the configuration name of a loss function.
+///
+/// LogLoss maps to "log" and NCELoss to "nce". InvalidLoss (or any
+/// out-of-range value) maps to "invalid", which string_to_loss_function
+/// parses back to InvalidLoss. Previously control fell off the end of the
+/// function for such values, which is undefined behaviour.
+inline std::string loss_function_to_string (loss_function_type f)
+{
+    switch (f)
+    {
+    case LogLoss: return "log";
+    case NCELoss: return "nce";
+    default:      break;
+    }
+    return "invalid";
+}
+
+/// Note: Outputs log-probabilities.
+
+/// Softmax layer combined with the log-loss.
+struct SoftmaxLogLoss
+{
+    /// Forward pass. Each column of `input` is one training instance.
+    /// Writes the column-wise normalized log-probabilities into the output
+    /// matrix (passed const, written through UNCONST) and stores in `loss`
+    /// the sum over columns of the log-probability of the word given by
+    /// output_words for that column.
+    template <typename DerivedI, typename DerivedW, typename DerivedO>
+    void fProp(const MatrixBase<DerivedI> &input, const MatrixBase<DerivedW> &output_words, const MatrixBase<DerivedO> &output_const, double &loss)
+    {
+        UNCONST(DerivedO, output_const, output);
+
+        double total = 0.0;
+
+        #pragma omp parallel for reduction(+:total)
+        for (int col = 0; col < input.cols(); col++)
+        {
+            // Subtracting log(sum(exp(column))) normalizes in log space.
+            const double log_z = logsum(input.col(col));
+            output.col(col).array() = input.col(col).array() - log_z;
+            total += output(output_words(col), col);
+        }
+
+        loss = total;
+    }
+
+    /// Backward pass. `output` holds the log-probabilities produced by
+    /// fProp. For each column the gradient written is
+    /// (one-hot of the target word) - exp(output column).
+    template <typename DerivedW, typename DerivedO, typename DerivedI>
+    void bProp(const MatrixBase<DerivedW> &output_words, const MatrixBase<DerivedO> &output, const MatrixBase<DerivedI> &grad_input_const)
+    {
+        UNCONST(DerivedI, grad_input_const, grad_input);
+
+        grad_input.setZero();
+
+        #pragma omp parallel for
+        for (int col = 0; col < output.cols(); col++)
+        {
+            grad_input(output_words(col), col) += 1.;
+            grad_input.col(col) -= output.col(col).array().exp().matrix();
+        }
+    }
+};
+
+///// Softmax layer plus NCE loss function.
+
+///// Note: Outputs probabilities.
+
+///// Note: Unlike SoftmaxLogLoss, does not compute *or* apply precomputed
+///// normalizations. Currently the caller is expected to do normalization.
+
+/// Softmax layer combined with the noise-contrastive estimation loss.
+/// `Multinomial` must provide logprob(int), the log-probability of a word
+/// under the noise distribution.
+template <typename Multinomial>
+class SoftmaxNCELoss
+{
+    // Noise distribution. Held by reference, so it must outlive this object.
+    const Multinomial &unigram;
+
+public:
+    SoftmaxNCELoss(const Multinomial &unigram) 
+      : unigram(unigram)
+    {
+    }
+
+    /// Forward pass. Column c of `scores` / `minibatch_samples` belongs to
+    /// training instance c. Row 0 is scored as the observed word and all
+    /// further rows as noise samples. The output matrix (passed const,
+    /// written through UNCONST) receives, per sample, the probability that
+    /// the sample came from the model rather than from the noise
+    /// distribution; `loss` receives the summed log-likelihood.
+    template <typename DerivedI, typename DerivedW, typename DerivedO>
+    void fProp(const MatrixBase<DerivedI> &scores, 
+               const MatrixBase<DerivedW> &minibatch_samples,
+               const MatrixBase<DerivedO> &output_const, double &loss)
+    {
+        UNCONST(DerivedO, output_const, output);
+
+        const int noise_count = minibatch_samples.rows()-1;
+        const double log_noise_count = std::log(noise_count);
+        double total = 0.0;
+
+        #pragma omp parallel for reduction(+:total) schedule(static)
+        for (int col = 0; col < scores.cols(); col++)
+        {
+            for (int row = 0; row < minibatch_samples.rows(); row++)
+            {
+                const int word = minibatch_samples(row, col);
+                // To avoid zero or infinite probabilities, everything is
+                // normalized in log space before exp is taken, even though
+                // that is a little slower.
+                const double model_score = scores(row, col);
+                const double noise_score = log_noise_count + unigram.logprob(word);
+                const double z = logadd(model_score, noise_score);
+                const double logprob = model_score - z;
+                const double logprob_noise = noise_score - z;
+
+                output(row, col) = std::exp(logprob);
+
+                if (row == 0)
+                    total += logprob;
+                else
+                    total += logprob_noise;
+            }
+        }
+
+        loss = total;
+    }
+
+    /// Backward pass. `probs` holds the probabilities produced by fProp.
+    /// The gradient written is -probs, with 1 added in row 0.
+    template <typename DerivedO, typename DerivedI>
+    void bProp(const MatrixBase<DerivedO> &probs, const MatrixBase<DerivedI> &output_const)
+    {
+        UNCONST(DerivedI, output_const, output);
+
+        #pragma omp parallel for schedule(static)
+        for (int col = 0; col < probs.cols(); col++)
+        {
+            output.col(col) = -probs.col(col);
+            output(0, col) += 1.0;
+        }
+    }
+};
+
+} // namespace nplm
+
+#endif
diff --git a/src/USCMatrix.h b/src/USCMatrix.h
new file mode 100644
index 0000000..caa9553
--- /dev/null
+++ b/src/USCMatrix.h
@@ -0,0 +1,192 @@
+#ifndef USCMATRIX_H
+#define USCMATRIX_H
+
+#include <Eigen/Dense>
+#include "maybe_omp.h"
+#include "util.h"
+
+namespace nplm
+{
+
+// is this cheating?
+using Eigen::Matrix;
+using Eigen::MatrixBase;
+using Eigen::Dynamic;
+
+// USC = Uniform Sparse Columns. A USCMatrix is a sparse matrix in which
+// each column has exactly k nonzero entries. This allows for a
+// simpler and faster compressed representation.
+
+// A USCMatrix can be converted into CSC format fairly easily, by
+// adding a third array [0, k, 2k, ..., nk]. However, the indices will
+// not be unique.
+
+// We use:
+//       dense2 = dense1^T * sparse (output bProp, input fProp)
+//       dense1 = sparse * dense2^T (output computeGradient, input computeGradient)
+// where:
+//       sparse is vocab_size x minibatch_size
+//       dense1 is vocab_size x embedding_dimension
+//       dense2 is embedding_dimension x minibatch_size
+
+/// Sparse matrix with the same number of stored entries in every column.
+/// Column c stores its entries in column c of `indexes` (row numbers) and
+/// `values` (the corresponding values). The column count is implied by
+/// `indexes`; the row count is kept separately in `m_rows`.
+template <typename Scalar, typename Index=int> // should be EIGEN_DEFAULT_DENSE_INDEX_TYPE but int is smaller
+class USCMatrix
+{
+
+public:
+    Matrix<Index,Dynamic,Dynamic> indexes;
+    Matrix<Scalar,Dynamic,Dynamic> values;
+    int m_rows;
+
+    /// Empty matrix with zero rows.
+    USCMatrix() : m_rows(0) { }
+
+    /// Build from existing index and value matrices (both are copied).
+    template <typename Indexes, typename Values>
+    USCMatrix(Index rows, const MatrixBase<Indexes> &indexes, const MatrixBase<Values> &values) 
+    : indexes(indexes), values(values), m_rows(rows) 
+    { }
+
+    /// Allocate storage for `nnz` entries in each of `cols` columns.
+    /// Every index is set to -1; the values are left uninitialized.
+    USCMatrix(Index rows, Index nnz, Index cols) 
+    : indexes(nnz, cols), values(nnz, cols), m_rows(rows)
+    { 
+        this->indexes.fill(-1); 
+    }
+
+    Index rows() const { return m_rows; }
+    Index cols() const { return indexes.cols(); }
+
+    /// Change the dimensions. Neither indexes nor values are reinitialized.
+    void resize(Index rows, Index nnz, Index cols) {
+        m_rows = rows;
+        indexes.resize(nnz, cols);
+        values.resize(nnz, cols);
+    }
+};
+
+// Dense matrix - sparse matrix product
+// a is presumably very wide
+/// c += alpha * a * b, with a dense, b a USCMatrix, and c dense.
+/// c is passed const but written through UNCONST. The result is
+/// accumulated into c, so the caller must clear c first if a plain
+/// product is wanted. Columns of c are independent, so the loop over
+/// columns runs in parallel.
+template <typename DerivedA, typename ScalarB, typename Index, typename DerivedC>
+void uscgemm(double alpha, const MatrixBase<DerivedA> &a, 
+             const USCMatrix<ScalarB,Index> &b,
+             const MatrixBase<DerivedC> &c_const)
+{
+    UNCONST(DerivedC, c_const, c);
+    eigen_assert(a.rows() == c.rows());
+    eigen_assert(a.cols() == b.rows());
+    eigen_assert(b.cols() == c.cols());
+
+    #pragma omp parallel for
+    for (Index col=0; col<b.cols(); col++)
+    {
+        for (Index entry=0; entry<b.indexes.rows(); entry++)
+        {
+            const Index src = b.indexes(entry,col);
+            eigen_assert(src >= 0);
+            eigen_assert(src < a.cols());
+            c.col(col) += alpha * a.col(src) * b.values(entry,col);
+        }
+    }
+}
+
+// sparse matrix - dense matrix product
+/// c += alpha * a * b, with a a USCMatrix, b dense, and c dense.
+/// c is passed const but written through UNCONST. The result is
+/// accumulated into c.
+///
+/// Different columns of a may hit the same row of c, so the rows of c are
+/// partitioned into blocks (row i belongs to block i % i_blocks) and each
+/// parallel iteration only updates the rows of its own block.
+template <typename ScalarA, typename Index, typename DerivedB, typename DerivedC>
+void uscgemm(double alpha, 
+             const USCMatrix<ScalarA,Index> &a,
+             const MatrixBase<DerivedB> &b, 
+             const MatrixBase<DerivedC> &c_const)
+{
+    UNCONST(DerivedC, c_const, c);
+    eigen_assert(a.rows() == c.rows());
+    eigen_assert(a.cols() == b.rows());
+    eigen_assert(b.cols() == c.cols());
+
+    // This needs to be tuned for each system, unfortunately,
+    // and seems to vary a lot. A lot.
+    // NOTE(review): omp_get_num_threads() is called here outside any
+    // parallel region, where OpenMP defines it to return 1, so this
+    // presumably always evaluates to 16. Confirm whether
+    // omp_get_max_threads() was intended.
+    int i_blocks = omp_get_num_threads()*16;
+
+    // Assume only one block in k direction.
+    // We don't need to explicitly block in the j direction.
+    #pragma omp parallel for
+    for (Index block=0; block<i_blocks; block++)
+    {
+        for (Index col=0; col<a.cols(); col++)
+        {
+            for (Index entry=0; entry<a.indexes.rows(); entry++)
+            {
+                const Index row = a.indexes(entry,col);
+                eigen_assert(row >= 0);
+                eigen_assert(row < c.rows());
+                if (row % i_blocks == block)
+                    c.row(row) += alpha * a.values(entry,col) * b.row(col);
+            }
+        }
+    }
+
+    /*
+    If c.cols() is really large, then theoretically it seems like we should do:
+
+    parallel for blocks in i direction
+        for blocks in j direction
+            pack block of a into smaller sparse matrix
+            for blocks in k direction
+                for k
+                    for i (sparse)
+                        for j
+                            c(i,k) += a(i,j) * b(j,k)
+
+    However, the copying of blocks of a doesn't seem practical for any realistic
+    sizes of c.cols().
+    */
+}
+
+// Dense matrix - dense matrix product, but masked by a sparse matrix,
+// that is, compute a*b only for those positions in c.indexes, and put
+// them in c.values.
+
+// a is presumably a very tall matrix. Row-major order is preferred.
+// For b, column-major is preferred.
+
+/// For every position listed in c.indexes, add alpha * (row of a) * (column
+/// of b) to the matching entry of c.values. Entries are accumulated, not
+/// overwritten. Each column of c is handled by one parallel iteration.
+template <typename DerivedA, typename DerivedB, typename ScalarC, typename Index>
+void uscgemm_masked(double alpha,
+                    const MatrixBase<DerivedA> &a,
+                    const MatrixBase<DerivedB> &b,
+                    USCMatrix<ScalarC,Index> &c)
+{
+    eigen_assert(a.rows() == c.rows());
+    eigen_assert(a.cols() == b.rows());
+    eigen_assert(b.cols() == c.cols());
+
+    #pragma omp parallel for
+    for (Index col=0; col<b.cols(); col++)
+    {
+        for (Index entry=0; entry<c.indexes.rows(); entry++)
+        {
+            const Index row = c.indexes(entry, col);
+            eigen_assert(row >= 0);
+            eigen_assert(row < a.rows());
+            c.values(entry, col) += alpha * a.row(row) * b.col(col);
+        }
+    }
+}
+
+// sparse matrix - dense vector product
+/// c += alpha * a * b, with a a USCMatrix and b, c dense column vectors.
+/// c is passed const but written through UNCONST. The result is
+/// accumulated into c. Runs serially.
+template <typename ScalarA, typename Index, typename DerivedB, typename DerivedC>
+void uscgemv(double alpha, 
+             const USCMatrix<ScalarA,Index> &a,
+             const MatrixBase<DerivedB> &b,
+             const MatrixBase<DerivedC> &c_const)
+{
+    UNCONST(DerivedC, c_const, c);
+    eigen_assert(a.rows() == c.rows());
+    eigen_assert(a.cols() == b.rows());
+    eigen_assert(b.cols() == 1 && c.cols() == 1);
+
+    for (Index col=0; col<a.cols(); col++)
+    {
+        for (Index entry=0; entry<a.indexes.rows(); entry++)
+        {
+            const Index row = a.indexes(entry,col);
+            eigen_assert(row >= 0);
+            eigen_assert(row < c.rows());
+            c(row) += alpha * a.values(entry,col) * b(col);
+        }
+    }
+}
+
+}
+
+#endif
diff --git a/src/graphClasses.h b/src/graphClasses.h
new file mode 100644
index 0000000..da5f1af
--- /dev/null
+++ b/src/graphClasses.h
@@ -0,0 +1,60 @@
+//creating the structure of the nn in a graph that will help in performing backpropagation and forward propagation
+#pragma once
+
+#include <cstdlib>
+#include "neuralClasses.h"
+#include <Eigen/Dense>
+
+namespace nplm
+{
+
+/// One node of the network graph: a pointer to a layer of type X together
+/// with the buffers holding that layer's forward and backward results for
+/// one minibatch. The layer is never deleted by the node.
+template <class X>
+class Node {
+    public:
+        X * param; // the layer this node wraps
+        Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> fProp_matrix;
+        Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> bProp_matrix;
+        int minibatch_size;
+
+    public:
+        /// Node without a layer; param must be set before resize() is called.
+        Node() : param(NULL), minibatch_size(0) { }
+
+        /// Wrap `input_param` and allocate buffers for `minibatch_size` columns.
+        Node(X *input_param, int minibatch_size)
+          : param(input_param),
+            minibatch_size(minibatch_size)
+        {
+            resize(minibatch_size);
+        }
+
+        /// Re-allocate and zero the buffers for a new minibatch size.
+        /// A layer reporting -1 outputs (or inputs) gets no forward (or
+        /// backward) buffer; that buffer is left as it was.
+        void resize(int minibatch_size)
+        {
+            this->minibatch_size = minibatch_size;
+
+            const int out_rows = param->n_outputs();
+            if (out_rows != -1)
+                fProp_matrix.setZero(out_rows, minibatch_size);
+
+            const int in_rows = param->n_inputs();
+            if (in_rows != -1)
+                bProp_matrix.setZero(in_rows, minibatch_size);
+        }
+
+        /// Re-allocate for the current minibatch size, e.g. after the
+        /// wrapped layer changed its dimensions.
+        void resize() { resize(minibatch_size); }
+
+        // Forward propagation is not wrapped here: callers invoke fProp on
+        // the wrapped layer directly.
+
+};
+
+} // namespace nplm
diff --git a/src/maybe_omp.h b/src/maybe_omp.h
new file mode 100644
index 0000000..562dea6
--- /dev/null
+++ b/src/maybe_omp.h
@@ -0,0 +1,13 @@
+#ifndef MAYBE_OMP
+#define MAYBE_OMP
+
+// Include OpenMP when the compiler enables it; otherwise provide
+// single-threaded stand-ins for the few runtime routines that are used.
+#ifdef _OPENMP
+  #include <omp.h>
+#else
+  // The stand-ins mirror the real signatures. omp_get_thread_num() takes no
+  // argument; the previous one-parameter macro made the usual call
+  // `omp_get_thread_num()` an empty-macro-argument invocation, which is
+  // undefined in C++98/03 preprocessing.
+  #define omp_get_thread_num() 0
+  #define omp_set_num_threads(n)
+  #define omp_get_num_threads() 1
+  #define omp_get_max_threads() 1
+#endif
+
+#endif
diff --git a/src/model.cpp b/src/model.cpp
new file mode 100644
index 0000000..3611975
--- /dev/null
+++ b/src/model.cpp
@@ -0,0 +1,246 @@
+#include <cstdlib>
+#include <iostream>
+#include <boost/lexical_cast.hpp>
+
+#include "model.h"
+#include "param.h"
+
+using namespace std;
+using namespace boost;
+using namespace boost::random;
+
+namespace nplm
+{
+
+    // Record the new dimensions and resize every layer to match.
+    // The context consists of ngram_size-1 words. Any previous
+    // premultiplication is no longer valid afterwards.
+    void model::resize(int ngram_size,
+        int input_vocab_size,
+        int output_vocab_size,
+        int input_embedding_dimension,
+        int num_hidden,
+        int output_embedding_dimension)
+{
+    this->ngram_size = ngram_size;
+    this->input_vocab_size = input_vocab_size;
+    this->output_vocab_size = output_vocab_size;
+    this->input_embedding_dimension = input_embedding_dimension;
+    this->num_hidden = num_hidden;
+    this->output_embedding_dimension = output_embedding_dimension;
+    premultiplied = false;
+
+    const int context_size = ngram_size-1;
+
+    // The layers are resized in network order, input first and output last.
+    input_layer.resize(input_vocab_size, input_embedding_dimension, context_size);
+    first_hidden_linear.resize(num_hidden, input_embedding_dimension*context_size);
+    first_hidden_activation.resize(num_hidden);
+    second_hidden_linear.resize(output_embedding_dimension, num_hidden);
+    second_hidden_activation.resize(output_embedding_dimension);
+    output_layer.resize(output_vocab_size, output_embedding_dimension);
+}
+  
+// Initialize the parameters of all trainable layers from init_engine.
+// init_normal and init_range are forwarded unchanged to every layer;
+// init_bias is passed to the output layer only. The two activation layers
+// are not touched here.
+// NOTE(review): the layers presumably draw from init_engine in the order
+// of these calls, so reordering them would change the initial weights
+// obtained for a given seed -- confirm before rearranging.
+void model::initialize(mt19937 &init_engine, bool init_normal, double init_range, double init_bias)
+{
+    input_layer.initialize(init_engine, init_normal, init_range);
+    output_layer.initialize(init_engine, init_normal, init_range, init_bias);
+    first_hidden_linear.initialize(init_engine, init_normal, init_range);
+    second_hidden_linear.initialize(init_engine, init_normal, init_range);
+}
+
+// Fold the input embeddings into the first hidden layer's weight matrix.
+// Afterwards first_hidden_linear.U has one block of input_vocab_size
+// columns per context position, and the embedding matrix is shrunk to 1x1.
+void model::premultiply()
+{
+    // Since input and first_hidden_linear are both linear,
+    // we can multiply them into a single linear layer *if* we are not training
+    int context_size = ngram_size-1;
+    // Copy of the original weights, taken before U is resized below.
+    Matrix<double,Dynamic,Dynamic> U = first_hidden_linear.U;
+    first_hidden_linear.U.resize(num_hidden, input_vocab_size * context_size);
+    // Block i of the new U is (block i of the old U) * W^T.
+    for (int i=0; i<context_size; i++)
+        first_hidden_linear.U.middleCols(i*input_vocab_size, input_vocab_size) = U.middleCols(i*input_embedding_dimension, input_embedding_dimension) * input_layer.W->transpose();
+    // NOTE(review): W is reached through a pointer; if the same matrix is
+    // also used by the output layer (shared embeddings), this resize
+    // presumably destroys the output embeddings too -- confirm.
+    input_layer.W->resize(1,1); // try to save some memory
+    premultiplied = true;
+}
+
+// Read "key value" configuration lines from config_file up to the first
+// empty line (or end of file), then resize the model and set its
+// activation function accordingly.
+//
+// Settings that do not appear in the file keep the model's current value.
+// Unknown keys and lines with fewer than two fields produce a warning and
+// are skipped. A version other than 1 or an unknown activation function
+// name is fatal (message on stderr, exit status 1).
+void model::readConfig(ifstream &config_file)
+{
+    string line;
+    vector<string> fields;
+    // Start from the current settings (as was already done for the vocabulary
+    // sizes and the activation function) so that a key missing from the file
+    // does not leave a local uninitialized when resize() is called.
+    int ngram_size = this->ngram_size;
+    int input_embedding_dimension = this->input_embedding_dimension;
+    int num_hidden = this->num_hidden;
+    int output_embedding_dimension = this->output_embedding_dimension;
+    activation_function_type activation_function = this->activation_function;
+    while (getline(config_file, line) && line != "")
+    {
+        splitBySpace(line, fields);
+        // Both fields[0] and fields[1] are read below.
+        if (fields.size() < 2)
+        {
+            cerr << "warning: malformed line in config: " << line << endl;
+            continue;
+        }
+        if (fields[0] == "ngram_size")
+            ngram_size = lexical_cast<int>(fields[1]);
+        else if (fields[0] == "vocab_size")
+            input_vocab_size = output_vocab_size = lexical_cast<int>(fields[1]);
+        else if (fields[0] == "input_vocab_size")
+            input_vocab_size = lexical_cast<int>(fields[1]);
+        else if (fields[0] == "output_vocab_size")
+            output_vocab_size = lexical_cast<int>(fields[1]);
+        else if (fields[0] == "input_embedding_dimension")
+            input_embedding_dimension = lexical_cast<int>(fields[1]);
+        else if (fields[0] == "num_hidden")
+            num_hidden = lexical_cast<int>(fields[1]);
+        else if (fields[0] == "output_embedding_dimension")
+            output_embedding_dimension = lexical_cast<int>(fields[1]);
+        else if (fields[0] == "activation_function")
+        {
+            activation_function = string_to_activation_function(fields[1]);
+            // An invalid function would make the activation layers silently
+            // do nothing, so refuse it here.
+            if (activation_function == InvalidFunction)
+            {
+                cerr << "error: unrecognized activation function: " << fields[1] << endl;
+                exit(1);
+            }
+        }
+        else if (fields[0] == "version")
+        {
+            int version = lexical_cast<int>(fields[1]);
+            if (version != 1)
+            {
+                cerr << "error: file format mismatch (expected 1, found " << version << ")" << endl;
+                exit(1);
+            }
+        }
+        else
+            cerr << "warning: unrecognized field in config: " << fields[0] << endl;
+    }
+    resize(ngram_size,
+        input_vocab_size,
+        output_vocab_size,
+        input_embedding_dimension,
+        num_hidden,
+        output_embedding_dimension);
+    set_activation_function(activation_function);
+}
+
+// Read the configuration from the file called `filename`.
+// Failure to open the file is fatal (message on stderr, exit status 1).
+void model::readConfig(const string &filename)
+{
+    ifstream stream(filename.c_str());
+    if (stream.fail())
+    {
+        cerr << "error: could not open config file " << filename << endl;
+        exit(1);
+    }
+
+    readConfig(stream);
+    stream.close();
+}
+ 
+// Load a model from `filename`, discarding any vocabularies stored in it.
+void model::read(const string &filename)
+{
+    vector<string> discarded_input_vocab;
+    vector<string> discarded_output_vocab;
+    read(filename, discarded_input_vocab, discarded_output_vocab);
+}
+
+// Load a model from `filename`.
+//
+// The file is a sequence of sections, each introduced by a line starting
+// with a backslash. "\vocab" fills both word lists, "\input_vocab" and
+// "\output_vocab" fill one each. Reading stops at "\end" or end of file.
+// Unknown sections produce a warning and are skipped up to the next empty
+// line. Throws runtime_error if the file cannot be opened.
+void model::read(const string &filename, vector<string> &input_words, vector<string> &output_words)
+{
+    ifstream file(filename.c_str());
+    if (!file) throw runtime_error("Could not open file " + filename);
+
+    string line;
+
+    while (getline(file, line))
+    {
+        if (line == "\\config")
+        {
+            readConfig(file);
+        }
+
+        else if (line == "\\vocab")
+        {
+            // A single vocabulary is used for both input and output.
+            input_words.clear();
+            readWordsFile(file, input_words);
+            output_words = input_words;
+        }
+
+        else if (line == "\\input_vocab")
+        {
+            input_words.clear();
+            readWordsFile(file, input_words);
+        }
+
+        else if (line == "\\output_vocab")
+        {
+            output_words.clear();
+            readWordsFile(file, output_words);
+        }
+
+        else if (line == "\\input_embeddings")
+            input_layer.read(file);
+        else if (line == "\\hidden_weights 1")
+            first_hidden_linear.read(file);
+        else if (line == "\\hidden_weights 2")
+            second_hidden_linear.read(file);
+        else if (line == "\\output_weights")
+            output_layer.read_weights(file);
+        else if (line == "\\output_biases")
+            output_layer.read_biases(file);
+        else if (line == "\\end")
+            break;
+        else if (line == "")
+            continue;
+        else
+        {
+            cerr << "warning: unrecognized section: " << line << endl;
+            // skip over section
+            while (getline(file, line) && line != "") { }
+        }
+    }
+    file.close();
+}
+
+    // Write the model together with its input and output vocabularies.
+    void model::write(const string &filename, const vector<string> &input_words, const vector<string> &output_words)
+{ 
+    const vector<string> *input_pwords = &input_words;
+    const vector<string> *output_pwords = &output_words;
+    write(filename, input_pwords, output_pwords);
+}
+
+// Write the model without any vocabulary sections.
+void model::write(const string &filename) 
+{ 
+    const vector<string> *no_vocabulary = NULL;
+    write(filename, no_vocabulary, no_vocabulary);
+}
+
+    // Write the model to `filename`: the configuration, then (for each
+    // non-null pointer) the corresponding vocabulary, then the parameters
+    // of every layer, and finally the "\end" marker.
+    // Throws runtime_error if the file cannot be opened or if writing fails.
+    void model::write(const string &filename, const vector<string> *input_pwords, const vector<string> *output_pwords)
+{
+    ofstream file(filename.c_str());
+    if (!file) throw runtime_error("Could not open file " + filename);
+
+    // '\n' is used instead of endl so that the stream is not flushed after
+    // every line; the bytes written are the same.
+    file << "\\config" << '\n';
+    file << "version 1" << '\n';
+    file << "ngram_size " << ngram_size << '\n';
+    file << "input_vocab_size " << input_vocab_size << '\n';
+    file << "output_vocab_size " << output_vocab_size << '\n';
+    file << "input_embedding_dimension " << input_embedding_dimension << '\n';
+    file << "num_hidden " << num_hidden << '\n';
+    file << "output_embedding_dimension " << output_embedding_dimension << '\n';
+    file << "activation_function " << activation_function_to_string(activation_function) << '\n';
+    file << '\n';
+
+    if (input_pwords)
+    {
+        file << "\\input_vocab" << '\n';
+        writeWordsFile(*input_pwords, file);
+        file << '\n';
+    }
+
+    if (output_pwords)
+    {
+        file << "\\output_vocab" << '\n';
+        writeWordsFile(*output_pwords, file);
+        file << '\n';
+    }
+
+    file << "\\input_embeddings" << '\n';
+    input_layer.write(file);
+    file << '\n';
+
+    file << "\\hidden_weights 1" << '\n';
+    first_hidden_linear.write(file);
+    file << '\n';
+
+    file << "\\hidden_weights 2" << '\n';
+    second_hidden_linear.write(file);
+    file << '\n';
+
+    file << "\\output_weights" << '\n';
+    output_layer.write_weights(file);
+    file << '\n';
+
+    file << "\\output_biases" << '\n';
+    output_layer.write_biases(file);
+    file << '\n';
+
+    file << "\\end" << '\n';
+    file.close();
+
+    // A failed write (e.g. a full disk) would otherwise leave a truncated
+    // model file behind without any indication.
+    if (!file) throw runtime_error("Could not write file " + filename);
+}
+
+
+} // namespace nplm
diff --git a/src/model.h b/src/model.h
new file mode 100644
index 0000000..271b22f
--- /dev/null
+++ b/src/model.h
@@ -0,0 +1,105 @@
+#ifndef MODEL_H
+#define MODEL_H
+
+#include <iostream>
+#include <vector>
+#include <string>
+#include <boost/random/mersenne_twister.hpp>
+
+#include "neuralClasses.h"
+#include "Activation_function.h"
+
+namespace nplm
+{
+
+/// A feed-forward neural language model: input embeddings, two hidden
+/// layers (each a linear layer followed by an activation) and an output
+/// embedding layer, plus the dimensions they were built with.
+///
+/// The embedding layers do not own their weight matrices; they are pointed
+/// (via set_W) at matrices stored in this object, either one matrix each or
+/// a single shared one.
+/// NOTE(review): because of those pointers, a copied model presumably has
+/// layers that still refer to the source object's matrices -- confirm
+/// before copying models.
+class model {
+public:
+    Input_word_embeddings input_layer;
+    Linear_layer first_hidden_linear;
+    Activation_function first_hidden_activation;
+    Linear_layer second_hidden_linear;
+    Activation_function second_hidden_activation;
+    Output_word_embeddings output_layer;
+    Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> output_embedding_matrix,
+      input_embedding_matrix,
+      input_and_output_embedding_matrix;
+    
+    activation_function_type activation_function;
+    int ngram_size, input_vocab_size, output_vocab_size, input_embedding_dimension, num_hidden, output_embedding_dimension;
+    bool premultiplied;
+
+    /// Build a model with the given dimensions. With share_embeddings the
+    /// input and output layers use the same embedding matrix.
+    model(int ngram_size,
+        int input_vocab_size,
+        int output_vocab_size,
+        int input_embedding_dimension,
+        int num_hidden,
+        int output_embedding_dimension,
+        bool share_embeddings) 
+      // Rectifier matches the default of the Activation_function layers.
+      // Without this the member was left uninitialized, and write() would
+      // then pass an indeterminate value to activation_function_to_string.
+      : activation_function(Rectifier),
+        premultiplied(false)
+    {
+        // The embedding matrices are default-constructed (empty) members;
+        // resize() below gives them their dimensions through the layers.
+        if (share_embeddings){
+          input_layer.set_W(&input_and_output_embedding_matrix);
+          output_layer.set_W(&input_and_output_embedding_matrix);
+        }
+        else {
+          input_layer.set_W(&input_embedding_matrix);
+          output_layer.set_W(&output_embedding_matrix);
+        }
+        resize(ngram_size,
+            input_vocab_size,
+            output_vocab_size,
+            input_embedding_dimension,
+            num_hidden,
+            output_embedding_dimension);
+    }
+
+    /// Build an empty model with separate input and output embeddings.
+    /// All dimensions start at zero (ngram_size at 1) until resize() or
+    /// read() sets them. Initializers are listed in declaration order.
+    model() : activation_function(Rectifier),
+            ngram_size(1),
+            input_vocab_size(0),
+            output_vocab_size(0),
+            input_embedding_dimension(0),
+            num_hidden(0),
+            output_embedding_dimension(0),
+            premultiplied(false)
+        {
+          output_layer.set_W(&output_embedding_matrix);
+          input_layer.set_W(&input_embedding_matrix);
+        }
+
+    void resize(int ngram_size,
+        int input_vocab_size,
+        int output_vocab_size,
+        int input_embedding_dimension,
+        int num_hidden,
+        int output_embedding_dimension);
+
+    void initialize(boost::random::mt19937 &init_engine,
+        bool init_normal,
+        double init_range,
+        double init_bias);
+
+    /// Use activation function f in both hidden layers and remember it so
+    /// that it can be written out with the model.
+    void set_activation_function(activation_function_type f)
+    {
+        activation_function = f;
+        first_hidden_activation.set_activation_function(f);
+        second_hidden_activation.set_activation_function(f);
+    }
+
+    void premultiply();
+
+    // Since the vocabulary is not essential to the model,
+    // we need a version with and without a vocabulary.
+    // If the number of "extra" data structures like this grows,
+    // a better solution is needed
+
+    void read(const std::string &filename);
+    void read(const std::string &filename, std::vector<std::string> &input_words, std::vector<std::string> &output_words);
+    void write(const std::string &filename, const std::vector<std::string> &input_words, const std::vector<std::string> &output_words);
+    void write(const std::string &filename);
+
+ private:
+    void readConfig(std::ifstream &config_file);
+    void readConfig(const std::string &filename);
+    // Shared implementation of the two public write() overloads; a null
+    // pointer means "do not write that vocabulary".
+    void write(const std::string &filename, const std::vector<std::string> *input_pwords, const std::vector<std::string> *output_pwords);
+};
+
+} //namespace nplm
+
+#endif
diff --git a/src/multinomial.h b/src/multinomial.h
new file mode 100644
index 0000000..1314fcb
--- /dev/null
+++ b/src/multinomial.h
@@ -0,0 +1,135 @@
+#ifndef MULTINOMIAL_H
+#define MULTINOMIAL_H
+#include <cassert>
+#include <cmath>
+#include <iostream>
+#include <set>
+#include <stdexcept>
+#include <vector>
+
+#include <boost/random/uniform_int_distribution.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+namespace nplm
+{
+
+// Discrete distribution over the outcomes 0..k-1, estimated from counts.
+// sample() picks a column uniformly and then either keeps it or returns its
+// alias, using the tables J and q that setup() derives from the probabilities.
+template <typename Count>
+class multinomial {
+  std::vector<int> J;      // alias outcome of each column (-1 while unassigned)
+  std::vector<double> q;   // a draw p keeps the column itself when q[column] > p
+  boost::random::uniform_int_distribution<Count> unif_int;  // column index in [0, k-1]
+  boost::random::uniform_real_distribution<> unif_real;     // constructed with bounds 0.0 and 1.0
+  std::vector<double> m_prob, m_logprob;  // probability of each outcome and its natural log
+
+public:
+  multinomial() : unif_real(0.0, 1.0) { }
+  multinomial(const std::vector<Count> &counts) : unif_real(0.0, 1.0) { estimate(counts);  }
+
+  // Sets prob(i) = counts[i] / sum(counts) and rebuilds the sampling tables.
+  // Throws std::invalid_argument, leaving the object untouched, when the counts do
+  // not sum to a positive value (this includes an empty vector): dividing by a zero
+  // total would make every probability NaN and let sample() return -1.
+  void estimate(const std::vector<Count>& counts)
+  {
+      const std::size_t k = counts.size();
+      Count n = 0;
+      for (std::size_t i=0; i<k; i++)
+          n += counts[i];
+      if (!(n > 0))
+          throw std::invalid_argument("multinomial: counts must sum to a positive value");
+
+      m_prob.assign(k, 0.0);
+      m_logprob.assign(k, 0.0);
+      for (std::size_t i=0; i<k; i++)
+      {
+          m_prob[i] = static_cast<double>(counts[i]) / n;
+          m_logprob[i] = std::log(m_prob[i]);
+      }
+      setup(m_prob);
+  }
+
+  double prob(int i) const { return m_prob[i]; }
+  double logprob(int i) const { return m_logprob[i]; }
+
+  // Draws one outcome: first a column from unif_int, then a real p from unif_real;
+  // returns the column if q[column] > p and its alias otherwise. Requires estimate().
+  template <typename Engine>
+  int sample(Engine &eng) const
+  {
+      int m = unif_int(eng);
+      double p = unif_real(eng);
+      int s = (q[m] > p) ? m : J[m];
+      assert (s >= 0);
+      return s;
+  }
+
+private:
+  // Builds J and q from probs: q[i] starts as k*probs[i]; each column with q < 1 is
+  // paired with a column with q >= 1, which becomes its alias and gives up the difference.
+  void setup(const std::vector<double>& probs)
+  {
+    const int k = probs.size();
+
+    unif_int = boost::random::uniform_int_distribution<Count>(0, k-1);
+    J.assign(k, -1);   // assign rather than resize: no alias survives from an earlier estimate
+    q.assign(k, 0.0);
+
+    std::set<int> S;   // "small" outcomes (prob < 1/k)
+    std::set<int> L;   // "large" outcomes (prob >= 1/k)
+    const double tol = 1e-3;
+
+    for (int i=0; i<k; i++)
+    {
+        q[i] = k*probs[i];
+        if (q[i] < 1.0)
+            S.insert(i);
+        else
+            L.insert(i);
+    }
+
+    while (!S.empty() && !L.empty())
+    {
+        // take the first element s of S and the first element l of L
+        std::set<int>::iterator s_it = S.begin();
+        std::set<int>::iterator l_it = L.begin();
+        const int s = *s_it;
+        const int l = *l_it;
+
+        // pair up s and (part of) l as its alias
+        J[s] = l;
+        S.erase(s_it);
+        q[l] = q[l] + q[s] - 1.0; // i.e. q[l] - (1.0 - q[s]); the author's note on this form was "more stable?"
+
+        // move l from L to S if necessary
+        if (q[l] < 1.0)
+        {
+            S.insert(l);
+            L.erase(l_it);
+        }
+    }
+
+    // Any remaining elements must have q close to 1, so they are left without an
+    // alias; leftover small ones are set to exactly 1 so that any draw p < 1 keeps them.
+    for (std::set<int>::iterator it = S.begin(); it != S.end(); ++it)
+    {
+        warn_if_not_one(q[*it], tol);
+        q[*it] = 1.0;
+    }
+    for (std::set<int>::iterator it = L.begin(); it != L.end(); ++it)
+        warn_if_not_one(q[*it], tol);
+  }
+
+  // Prints the warning setup() issues when a leftover q differs from 1 by more than tol.
+  static void warn_if_not_one(double value, double tol)
+  {
+      if (std::fabs(value - 1) > tol)
+          std::cerr << "warning: multinomial: probability differs from one by " << std::fabs(value-1) << std::endl;
+  }
+
+};
+
+} // namespace nplm
+
+#endif
diff --git a/src/neuralClasses.h b/src/neuralClasses.h
new file mode 100644
index 0000000..afd91f1
--- /dev/null
+++ b/src/neuralClasses.h
@@ -0,0 +1,520 @@
+#pragma once
+#include <iostream>
+#include <fstream>
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <vector>
+
+#include <boost/unordered_map.hpp> 
+#include <Eigen/Dense>
+#include "maybe_omp.h"
+
+#include "util.h"
+#include "graphClasses.h"
+#include "USCMatrix.h"
+
+// classes for various kinds of layers
+#include "SoftmaxLoss.h"
+#include "Activation_function.h"
+
+//#define EIGEN_DONT_PARALLELIZE
+//#define EIGEN_DEFAULT_TO_ROW_MAJOR
+
+namespace nplm
+{
+
+// is this cheating?
+using Eigen::Matrix;
+using Eigen::MatrixBase;
+using Eigen::Dynamic;
+
+typedef boost::unordered_map<int,bool> int_map;
+
+// Fully connected layer without a bias term: fProp computes U*input, bProp
+// computes U^T*input, and the computeGradient* methods update U in place
+// (all updates are gradient ascent: the step is added to U).
+class Linear_layer
+{
+    private:
+        Matrix<double,Dynamic,Dynamic> U;                  // weights, n_outputs x n_inputs
+        Matrix<double,Dynamic,Dynamic> U_gradient;         // gradient of the current minibatch
+        Matrix<double,Dynamic,Dynamic> U_velocity;         // momentum accumulator
+        Matrix<double,Dynamic,Dynamic> U_running_gradient; // Adagrad: sum of squared gradients
+
+    friend class model;
+
+    public:
+        Linear_layer() { }
+        Linear_layer(int rows, int cols) { resize(rows, cols); }
+
+        // Resizes the weights and all optimizer state to rows x cols and zeroes them.
+        void resize(int rows, int cols) {
+            U.setZero(rows, cols);
+            U_gradient.setZero(rows, cols);
+            U_running_gradient.setZero(rows, cols);
+            U_velocity.setZero(rows, cols);
+        }
+
+        void read(std::ifstream &U_file) { readMatrix(U_file, U); }
+        void write(std::ofstream &U_file) { writeMatrix(U, U_file); }
+
+        // Fills U through initMatrix (declared outside this class) with the given settings.
+        template <typename Engine>
+        void initialize(Engine &engine, bool init_normal, double init_range) {
+            initMatrix(engine, U, init_normal, init_range);
+        }
+
+        int n_inputs () const { return U.cols(); }
+        int n_outputs () const { return U.rows(); }
+
+        // Dense forward pass; only the first input.cols() columns of output are written.
+        template <typename DerivedIn, typename DerivedOut>
+        void fProp(const MatrixBase<DerivedIn> &input, const MatrixBase<DerivedOut> &output) const {
+            UNCONST(DerivedOut, output, my_output);
+            my_output.leftCols(input.cols()).noalias() = U*input;
+        }
+
+        // Sparse input: zeroes all of output, then hands its first input.cols() columns to uscgemm.
+        template <typename ScalarIn, typename DerivedOut>
+        void fProp(const USCMatrix<ScalarIn> &input, const MatrixBase<DerivedOut> &output_const) const {
+            UNCONST(DerivedOut, output_const, output);
+            output.setZero();
+            uscgemm(1.0, U, input, output.leftCols(input.cols()));
+        }
+
+        // Backward pass: output = U^T * input.
+        template <typename DerivedGOut, typename DerivedGIn>
+        void bProp(const MatrixBase<DerivedGOut> &input, MatrixBase<DerivedGIn> &output) const {
+            UNCONST(DerivedGIn, output, my_output);
+            my_output.noalias() = U.transpose()*input;
+        }
+
+        // Gradient step on U. The data gradient is bProp_input*fProp_input^T; L2_reg > 0 adds
+        // the gradient of the penalty -L2_reg*||U||^2, i.e. -2*L2_reg*U (rescaling the data
+        // gradient by 1-2*L2_reg never shrinks U). momentum > 0 selects the velocity update.
+        template <typename DerivedGOut, typename DerivedIn>
+        void computeGradient(const MatrixBase<DerivedGOut> &bProp_input,
+                             const MatrixBase<DerivedIn> &fProp_input,
+                             double learning_rate, double momentum, double L2_reg) {
+            U_gradient.noalias() = bProp_input*fProp_input.transpose();
+
+            // This used to be multithreaded, but there was no measurable difference
+            if (L2_reg > 0.0)
+                U_gradient -= 2*L2_reg*U;
+
+            if (momentum > 0.0) {
+                U_velocity = momentum*U_velocity + U_gradient;
+                U += learning_rate * U_velocity;
+            } else {
+                U += learning_rate * U_gradient;
+            }
+        }
+
+        // Adagrad step: each weight moves by learning_rate * g / sqrt(sum of its squared
+        // gradients so far). momentum is accepted but ignored.
+        template <typename DerivedGOut, typename DerivedIn>
+        void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input,
+                                    const MatrixBase<DerivedIn> &fProp_input,
+                                    double learning_rate, double momentum, double L2_reg) {
+            U_gradient.noalias() = bProp_input*fProp_input.transpose();
+
+            if (L2_reg != 0)
+                U_gradient -= 2*L2_reg*U;
+
+            U_running_gradient.array() += U_gradient.array().square();
+            // A weight whose gradient has been exactly zero so far still has a zero accumulator;
+            // leave it alone instead of computing 0/0, which would store a permanent NaN in U.
+            U.array() += (U_running_gradient.array() > 0.0).select(
+                learning_rate * U_gradient.array() / U_running_gradient.array().sqrt(), 0.0);
+        }
+
+        // Writes the raw minibatch gradient into `gradient` without modifying U.
+        template <typename DerivedGOut, typename DerivedIn, typename DerivedGW>
+        void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input,
+                                  const MatrixBase<DerivedIn> &fProp_input,
+                                  const MatrixBase<DerivedGW> &gradient) const {
+            UNCONST(DerivedGW, gradient, my_gradient);
+            my_gradient.noalias() = bProp_input*fProp_input.transpose();
+        }
+};
+
+// Output layer: scores = W*input + b, with one row of W and one entry of b per output word.
+class Output_word_embeddings
+{
+    private:
+        // Row-major is better for uscgemm. W is a pointer so that the input and
+        // output word embeddings can share one matrix; this class never allocates or
+        // frees it, and set_W() must attach a matrix before the weights are used.
+        Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W;
+        std::vector<double> W_data;                        // not referenced by any method below
+        Matrix<double,Dynamic,1> b;                        // one bias per output word
+        Matrix<double,Dynamic,Dynamic> W_running_gradient; // Adagrad: sum of squared gradients
+        Matrix<double,Dynamic,Dynamic> W_gradient;         // Adagrad: gradient of the current minibatch
+        Matrix<double,Dynamic,1> b_running_gradient;
+        Matrix<double,Dynamic,1> b_gradient;
+
+    public:
+        // W starts out null instead of indeterminate, so a missing set_W() fails predictably.
+        Output_word_embeddings() : W(0) { }
+        Output_word_embeddings(int rows, int cols) : W(0) { resize(rows, cols); } // NOTE(review): no matrix can be attached before this resize -- confirm the constructor is unused
+
+        void resize(int rows, int cols) {
+            assert(W != 0); // no matrix attached yet: call set_W() first
+            W->setZero(rows, cols);
+            b.setZero(rows);
+        }
+        void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) { W = input_W; }
+        void read_weights(std::ifstream &W_file) { readMatrix(W_file, *W); }
+        void write_weights(std::ofstream &W_file) { writeMatrix(*W, W_file); }
+        void read_biases(std::ifstream &b_file) { readMatrix(b_file, b); }
+        void write_biases(std::ofstream &b_file) { writeMatrix(b, b_file); }
+
+        // Fills *W through initMatrix and sets every bias to init_bias.
+        template <typename Engine>
+        void initialize(Engine &engine, bool init_normal, double init_range, double init_bias) {
+            initMatrix(engine, *W, init_normal, init_range);
+            b.fill(init_bias);
+        }
+
+        int n_inputs () const { return W->cols(); }
+        int n_outputs () const { return W->rows(); }
+
+        // Dense forward pass: output = W*input with b added to every column.
+        template <typename DerivedIn, typename DerivedOut>
+        void fProp(const MatrixBase<DerivedIn> &input,
+                   const MatrixBase<DerivedOut> &output) const {
+            UNCONST(DerivedOut, output, my_output);
+            my_output = ((*W) * input).colwise() + b;
+        }
+
+        // Sparse output version: output(s, i) is first set to the bias of word samples(s, i);
+        // uscgemm_masked then works on those entries (presumably adding W*input -- confirm).
+        template <typename DerivedIn, typename DerivedOutI, typename DerivedOutV>
+        void fProp(const MatrixBase<DerivedIn> &input,
+                   const MatrixBase<DerivedOutI> &samples,
+                   const MatrixBase<DerivedOutV> &output) const {
+            UNCONST(DerivedOutV, output, my_output);
+            #pragma omp parallel for
+            for (int instance_id = 0; instance_id < samples.cols(); instance_id++)
+                for (int sample_id = 0; sample_id < samples.rows(); sample_id++)
+                    my_output(sample_id, instance_id) = b(samples(sample_id, instance_id));
+            USCMatrix<double> sparse_output(W->rows(), samples, my_output);
+            uscgemm_masked(1.0, *W, input, sparse_output);
+            my_output = sparse_output.values; // too bad, so much copying
+        }
+
+        // Return single element of output matrix: the score of `word` for column `instance`.
+        template <typename DerivedIn>
+        double fProp(const MatrixBase<DerivedIn> &input, int word, int instance) const {
+            return W->row(word).dot(input.col(instance)) + b(word);
+        }
+
+        // Dense versions (for log-likelihood loss)
+
+        // Backward pass. W is vocab_size x output_embedding_dimension, input_bProp_matrix is
+        // vocab_size x minibatch_size and bProp_matrix is output_embedding_dimension x
+        // minibatch_size; only its first input_bProp_matrix.cols() columns are written.
+        template <typename DerivedGOut, typename DerivedGIn>
+        void bProp(const MatrixBase<DerivedGOut> &input_bProp_matrix,
+                   const MatrixBase<DerivedGIn> &bProp_matrix) const {
+            UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix);
+            my_bProp_matrix.leftCols(input_bProp_matrix.cols()).noalias() =
+                W->transpose() * input_bProp_matrix;
+        }
+
+        // Gradient step on W and b. W is vocab_size x output_embedding_dimension, b is
+        // vocab_size x 1, predicted_embeddings is output_embedding_dimension x minibatch_size
+        // and bProp_input is vocab_size x minibatch_size. momentum is accepted but not used.
+        template <typename DerivedIn, typename DerivedGOut>
+        void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings,
+                             const MatrixBase<DerivedGOut> &bProp_input,
+                             double learning_rate,
+                             double momentum) { //not sure if we want to use momentum here
+            W->noalias() += learning_rate * bProp_input * predicted_embeddings.transpose();
+            b += learning_rate * bProp_input.rowwise().sum();
+        }
+
+        // Sparse versions
+
+        // Backward pass for sampled outputs: zeroes bProp_matrix, then passes W^T, the sparse
+        // (samples, weights) matrix and the first samples.cols() output columns to uscgemm.
+        template <typename DerivedGOutI, typename DerivedGOutV, typename DerivedGIn>
+        void bProp(const MatrixBase<DerivedGOutI> &samples,
+                   const MatrixBase<DerivedGOutV> &weights,
+                   const MatrixBase<DerivedGIn> &bProp_matrix) const {
+            UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix);
+            my_bProp_matrix.setZero();
+            uscgemm(1.0,
+                W->transpose(),
+                USCMatrix<double>(W->rows(), samples, weights),
+                my_bProp_matrix.leftCols(samples.cols())); // narrow bProp_matrix for possible short minibatch
+        }
+
+        // Sparse update: the (samples, weights) gradient is applied straight to *W and b by
+        // uscgemm/uscgemv with learning_rate as the scale. momentum is accepted but not used.
+        template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV>
+        void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings,
+                             const MatrixBase<DerivedGOutI> &samples,
+                             const MatrixBase<DerivedGOutV> &weights,
+                             double learning_rate, double momentum) { //not sure if we want to use momentum here
+            USCMatrix<double> gradient_output(W->rows(), samples, weights);
+            uscgemm(learning_rate,
+                gradient_output,
+                predicted_embeddings.leftCols(gradient_output.cols()).transpose(),
+                *W); // narrow predicted_embeddings for possible short minibatch
+            uscgemv(learning_rate,
+                gradient_output,
+                Matrix<double,Dynamic,1>::Ones(gradient_output.cols()),
+                b);
+        }
+
+        // Adagrad update restricted to the rows of W and entries of b whose word id occurs
+        // in samples; the accumulators are (re)sized lazily. momentum is accepted but not used.
+        template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV>
+        void computeGradientAdagrad(const MatrixBase<DerivedIn> &predicted_embeddings,
+                                    const MatrixBase<DerivedGOutI> &samples,
+                                    const MatrixBase<DerivedGOutV> &weights,
+                                    double learning_rate, double momentum) { //not sure if we want to use momentum here
+            W_gradient.setZero(W->rows(), W->cols());
+            b_gradient.setZero(b.size());
+            if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols())
+                W_running_gradient.setZero(W->rows(), W->cols());
+            if (b_running_gradient.size() != b.size())
+                b_running_gradient.setZero(b.size());
+
+            USCMatrix<double> gradient_output(W->rows(), samples, weights);
+            uscgemm(learning_rate,
+                gradient_output,
+                predicted_embeddings.leftCols(samples.cols()).transpose(),
+                W_gradient);
+            uscgemv(learning_rate, gradient_output,
+                Matrix<double,Dynamic,1>::Ones(weights.cols()),
+                b_gradient);
+
+            int_map update_map; //stores all the parameters that have been updated
+            for (int sample_id=0; sample_id<samples.rows(); sample_id++)
+                for (int train_id=0; train_id<samples.cols(); train_id++)
+                    update_map[samples(sample_id, train_id)] = 1;
+
+            // Convert to std::vector for parallelization
+            std::vector<int> update_items;
+            for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
+                update_items.push_back(it->first);
+            int num_items = update_items.size();
+
+            #pragma omp parallel for
+            for (int item_id=0; item_id<num_items; item_id++)
+            {
+                int update_item = update_items[item_id];
+                W_running_gradient.row(update_item).array() += W_gradient.row(update_item).array().square();
+                b_running_gradient(update_item) += b_gradient(update_item) * b_gradient(update_item);
+                // Entries whose accumulator is still zero have seen only zero gradients;
+                // skip them rather than divide 0 by 0 and write NaN into the parameters.
+                W->row(update_item).array() += (W_running_gradient.row(update_item).array() > 0.0).select(
+                    learning_rate * W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt(), 0.0);
+                if (b_running_gradient(update_item) > 0.0)
+                    b(update_item) += learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item));
+            }
+        }
+
+        // Gradient check helper: zeroes gradient_W and gradient_b and passes them to
+        // uscgemm/uscgemv with scale 1.0; the layer's own parameters are not modified.
+        template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV, typename DerivedGW, typename DerivedGb>
+        void computeGradientCheck(const MatrixBase<DerivedIn> &predicted_embeddings,
+                                  const MatrixBase<DerivedGOutI> &samples,
+                                  const MatrixBase<DerivedGOutV> &weights,
+                                  const MatrixBase<DerivedGW> &gradient_W,
+                                  const MatrixBase<DerivedGb> &gradient_b) const {
+            UNCONST(DerivedGW, gradient_W, my_gradient_W);
+            UNCONST(DerivedGb, gradient_b, my_gradient_b);
+            my_gradient_W.setZero();
+            my_gradient_b.setZero();
+            USCMatrix<double> gradient_output(W->rows(), samples, weights);
+            uscgemm(1.0,
+                gradient_output,
+                predicted_embeddings.leftCols(samples.cols()).transpose(),
+                my_gradient_W);
+            uscgemv(1.0, gradient_output,
+                Matrix<double,Dynamic,1>::Ones(weights.cols()), my_gradient_b);
+        }
+};
+
+// Input layer: looks up one row of W per context word and stacks the context_size
+// embeddings of each instance into one output column.
+class Input_word_embeddings
+{
+    private:
+        // Attached through set_W() and never allocated or freed by this class.
+        Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W;
+        int context_size, vocab_size;
+        Matrix<double,Dynamic,Dynamic> W_running_gradient; // Adagrad: sum of squared gradients
+        Matrix<double,Dynamic,Dynamic> W_gradient;         // Adagrad: gradient of the current minibatch
+
+        friend class model;
+
+    public:
+        // W starts out null instead of indeterminate, so a missing set_W() fails predictably.
+        Input_word_embeddings() : W(0), context_size(0), vocab_size(0) { }
+        Input_word_embeddings(int rows, int cols, int context) : W(0), context_size(0), vocab_size(0) { resize(rows, cols, context); }
+
+        void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) { W = input_W; }
+
+        // Records context and rows as context_size and vocab_size and zeroes *W at rows x cols.
+        void resize(int rows, int cols, int context) {
+            assert(W != 0); // no matrix attached yet: call set_W() first
+            context_size = context;
+            vocab_size = rows;
+            W->setZero(rows, cols);
+        }
+
+        void read(std::ifstream &W_file) { readMatrix(W_file, *W); }
+        void write(std::ofstream &W_file) { writeMatrix(*W, W_file); }
+
+        // Fills *W through initMatrix (declared outside this class) with the given settings.
+        template <typename Engine>
+        void initialize(Engine &engine, bool init_normal, double init_range) {
+            initMatrix(engine, *W, init_normal, init_range);
+        }
+
+        // n_inputs() always reports -1; n_outputs() is the height of one stacked output column.
+        int n_inputs() const { return -1; }
+        int n_outputs() const { return W->cols() * context_size; }
+
+        // set output_id's embedding to the weighted average of all other embeddings,
+        // with dist.prob(i) as the weight of row i
+        template <typename Dist>
+        void average(const Dist &dist, int output_id) {
+            W->row(output_id).setZero();
+            for (int i=0; i < W->rows(); i++)
+                if (i != output_id)
+                    W->row(output_id) += dist.prob(i) * W->row(i);
+        }
+
+        // Forward pass. Row `ngram` of input is handed to USCMatrix as the word ids of
+        // context position `ngram`, one column per instance.
+        // W      is vocab_size                       x embedding_dimension
+        // output is context_size*embedding_dimension x minibatch_size
+        //
+        // Dense equivalent, with input as stacked one-hot blocks of vocab_size rows:
+        //   for (int ngram=0; ngram<context_size; ngram++)
+        //       output.middleRows(ngram*embedding_dimension, embedding_dimension) =
+        //           W.transpose() * input.middleRows(ngram*vocab_size, vocab_size);
+        template <typename DerivedIn, typename DerivedOut>
+        void fProp(const MatrixBase<DerivedIn> &input,
+                   const MatrixBase<DerivedOut> &output) const {
+            int embedding_dimension = W->cols();
+
+            UNCONST(DerivedOut, output, my_output);
+            my_output.setZero();
+            for (int ngram=0; ngram<context_size; ngram++)
+            {
+                // input might be narrower than expected due to a short minibatch,
+                // so narrow output to match
+                uscgemm(1.0,
+                    W->transpose(),
+                    USCMatrix<double>(W->rows(),input.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input.cols())),
+                    my_output.block(ngram*embedding_dimension, 0, embedding_dimension, input.cols()));
+            }
+        }
+
+        // When model is premultiplied, this layer doesn't get used,
+        // but this method is used to get the input into a sparse matrix.
+        // Hopefully this can get eliminated someday
+        template <typename DerivedIn, typename ScalarOut>
+        void munge(const MatrixBase<DerivedIn> &input, USCMatrix<ScalarOut> &output) const {
+            output.resize(vocab_size*context_size, context_size, input.cols());
+            // shift the ids of context position i into the i-th block of vocab_size rows
+            for (int i=0; i < context_size; i++)
+                output.indexes.row(i).array() = input.row(i).array() + i*vocab_size;
+            output.values.fill(1.0);
+        }
+
+        // Sparse gradient step on the rows of W selected by input_words.
+        // momentum and L2_reg are accepted but not used.
+        // W           is vocab_size                       x embedding_dimension
+        // bProp_input is context_size*embedding_dimension x minibatch_size
+        //
+        // Dense equivalent, with input_words as stacked one-hot blocks:
+        //   for (int ngram=0; ngram<context_size; ngram++)
+        //       W += learning_rate * input_words.middleRows(ngram*vocab_size, vocab_size) *
+        //            bProp_input.middleRows(ngram*embedding_dimension, embedding_dimension).transpose()
+        template <typename DerivedGOut, typename DerivedIn>
+        void computeGradient(const MatrixBase<DerivedGOut> &bProp_input,
+                             const MatrixBase<DerivedIn> &input_words,
+                             double learning_rate, double momentum, double L2_reg) {
+            int embedding_dimension = W->cols();
+
+            for (int ngram=0; ngram<context_size; ngram++)
+            {
+                uscgemm(learning_rate,
+                    USCMatrix<double>(W->rows(), input_words.middleRows(ngram, 1), Matrix<double,1,Dynamic>::Ones(input_words.cols())),
+                    bProp_input.block(ngram*embedding_dimension,0,embedding_dimension,input_words.cols()).transpose(),
+                    *W);
+            }
+        }
+
+        // Adagrad update of every row of W whose word id occurs in input_words.
+        // momentum and L2_reg are accepted but not used.
+        template <typename DerivedGOut, typename DerivedIn>
+        void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input,
+                                    const MatrixBase<DerivedIn> &input_words,
+                                    double learning_rate, double momentum, double L2_reg) {
+            int embedding_dimension = W->cols();
+
+            W_gradient.setZero(W->rows(), W->cols());
+            if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols())
+                W_running_gradient.setZero(W->rows(), W->cols());
+
+            for (int ngram=0; ngram<context_size; ngram++)
+            {
+                uscgemm(learning_rate,
+                    USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
+                    bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
+                    W_gradient);
+            }
+
+            // Collect every word id that received gradient. All context_size rows are
+            // scanned: a single linear index into input_words only reaches its first
+            // cols() entries, which would leave most context words without an update.
+            int_map update_map; //stores all the parameters that have been updated
+            for (int ngram=0; ngram<context_size; ngram++)
+                for (int train_id=0; train_id<input_words.cols(); train_id++)
+                    update_map[input_words(ngram, train_id)] = 1;
+
+            // Convert to std::vector for parallelization
+            std::vector<int> update_items;
+            for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
+                update_items.push_back(it->first);
+            int num_items = update_items.size();
+
+            #pragma omp parallel for
+            for (int item_id=0; item_id<num_items; item_id++)
+            {
+                int update_item = update_items[item_id];
+                W_running_gradient.row(update_item).array() += W_gradient.row(update_item).array().square();
+                // entries whose accumulator is still zero are skipped instead of dividing 0 by 0 (NaN)
+                W->row(update_item).array() += (W_running_gradient.row(update_item).array() > 0.0).select(
+                    learning_rate * W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt(), 0.0);
+            }
+        }
+
+        // Gradient check helper: zeroes `gradient`, then passes it to uscgemm with scale 1.0
+        // for every context position; W is not modified and x, minibatch_size are not used.
+        template <typename DerivedGOut, typename DerivedIn, typename DerivedGW>
+        void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input,
+                                  const MatrixBase<DerivedIn> &input_words,
+                                  int x, int minibatch_size,
+                                  const MatrixBase<DerivedGW> &gradient) const {
+            UNCONST(DerivedGW, gradient, my_gradient);
+            int embedding_dimension = W->cols();
+            my_gradient.setZero();
+            for (int ngram=0; ngram<context_size; ngram++)
+                uscgemm(1.0,
+                    USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
+                    bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
+                    my_gradient);
+        }
+};
+
+} // namespace nplm
diff --git a/src/neuralLM.cpp b/src/neuralLM.cpp
new file mode 100644
index 0000000..19d84e8
--- /dev/null
+++ b/src/neuralLM.cpp
@@ -0,0 +1 @@
+#include "neuralLM.h"
diff --git a/src/neuralLM.h b/src/neuralLM.h
new file mode 100644
index 0000000..0c54bfd
--- /dev/null
+++ b/src/neuralLM.h
@@ -0,0 +1,350 @@
+#ifndef NEURALLM_H
+#define NEURALLM_H
+
+#include <vector>
+#include <iostream>
+#include <fstream>
+#include <memory>
+#include <stdexcept>
+#include <cctype>
+#include <cstdlib>
+#include <boost/lexical_cast.hpp>
+
+#include <Eigen/Dense>
+
+#include "param.h"
+#include "util.h"
+#include "model.h"
+#include "propagator.h"
+#include "neuralClasses.h"
+#include "vocabulary.h"
+
+namespace nplm
+{
+
+// Scoring front end around a trained network: owns the vocabularies, the model and
+// its propagator, and can memoize single n-gram scores in a hashed cache.
+class neuralLM 
+{
+    bool normalization;   // when true, scores are normalized over the whole output vocabulary
+    char map_digits;      // when nonzero, replaces every digit of a word before vocabulary lookup
+
+    vocabulary input_vocab, output_vocab;
+    model nn;
+    propagator prop;
+
+    int ngram_size;
+    int width;            // passed to prop.resize() by set_width()
+
+    double weight;        // factor applied to every returned score; 1/log(base) after set_log_base()
+
+private:
+    std::size_t cache_size;                        // number of cache slots; 0 disables caching
+    Eigen::Matrix<int,Dynamic,Dynamic> cache_keys; // one n-gram per column, filled with -1 when cleared
+    std::vector<double> cache_values;
+    int cache_lookups, cache_hits;
+
+    Eigen::Matrix<int,Eigen::Dynamic,1> ngram; // buffer for lookup_ngram
+    int start, null;      // ids of "<s>" and "<null>" in input_vocab; -1 until set_input_vocabulary()
+
+    // Looks word up in vocab after replacing each of its digits by map_digits (when
+    // map_digits is nonzero). Characters go to isdigit as unsigned char: passing a
+    // negative plain char, i.e. any non-ASCII byte, is undefined behavior.
+    int lookup_mapped(const vocabulary &vocab, const std::string &word) const
+    {
+        if (map_digits)
+            for (std::string::size_type i=0; i<word.length(); i++)
+                if (isdigit(static_cast<unsigned char>(word[i])))
+                {
+                    // copy only once a digit is found; earlier characters need no change
+                    std::string mapped_word(word);
+                    for (; i<word.length(); i++)
+                        if (isdigit(static_cast<unsigned char>(word[i])))
+                            mapped_word[i] = map_digits;
+                    return vocab.lookup_word(mapped_word);
+                }
+        return vocab.lookup_word(word);
+    }
+
+public:
+    // Initializers follow declaration order; cache counters and start/null now get defined values.
+    neuralLM() 
+      : normalization(false),
+        map_digits(0),
+        prop(nn, 1),
+        ngram_size(1),
+        width(1),
+        weight(1.),
+        cache_size(0),
+        cache_lookups(0),
+        cache_hits(0),
+        start(-1),
+        null(-1)
+    { 
+    }
+
+    void set_normalization(bool value) { normalization = value; }
+    void set_log_base(double value) { weight = 1./std::log(value); }
+    void set_map_digits(char value) { map_digits = value; }
+
+    // This must be called if the underlying model is resized.
+    void resize() {
+        ngram_size = nn.ngram_size;
+        ngram.setZero(ngram_size);
+        if (cache_size)
+        {
+            cache_keys.resize(ngram_size, cache_size);
+            cache_keys.fill(-1);
+        }
+        prop.resize();
+    }
+
+    void set_width(int width)
+    {
+        this->width = width;
+        prop.resize(width);
+    }
+
+    void set_input_vocabulary(const vocabulary &vocab)
+    {
+        this->input_vocab = vocab;
+        start = input_vocab.lookup_word("<s>");
+        null = input_vocab.lookup_word("<null>");
+    }
+
+    void set_output_vocabulary(const vocabulary &vocab)
+    {
+        this->output_vocab = vocab;
+    }
+
+    const vocabulary &get_vocabulary() const { return this->input_vocab; }
+
+    int lookup_input_word(const std::string &word) const { return lookup_mapped(input_vocab, word); }
+
+    int lookup_word(const std::string &word) const
+    {
+        return lookup_input_word(word);
+    }
+
+    int lookup_output_word(const std::string &word) const { return lookup_mapped(output_vocab, word); }
+
+    // Scores one n-gram (a single column of ngram_size word ids), using the cache when one is
+    // configured. Returns weight times the normalized log-probability or the raw output score.
+    template <typename Derived>
+    double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram)
+    {
+        assert (ngram.rows() == ngram_size);
+        assert (ngram.cols() == 1);
+
+        std::size_t hash;
+        if (cache_size)
+        {
+            // First look in cache
+            hash = Eigen::hash_value(ngram) % cache_size; // defined in util.h
+            cache_lookups++;
+            if (cache_keys.col(hash) == ngram)
+            {
+                cache_hits++;
+                return cache_values[hash];
+            }
+        }
+
+        // Make sure that we're single threaded. Multithreading doesn't help,
+        // and in some cases can hurt quite a lot
+        int save_threads = omp_get_max_threads();
+        omp_set_num_threads(1);
+        int save_eigen_threads = Eigen::nbThreads();
+        Eigen::setNbThreads(1);
+        #ifdef __INTEL_MKL__
+        int save_mkl_threads = mkl_get_max_threads();
+        mkl_set_num_threads(1);
+        #endif
+
+        prop.fProp(ngram.col(0));
+
+        int output = ngram(ngram_size-1, 0);
+        double log_prob;
+
+        start_timer(3);
+        if (normalization)
+        {
+            Eigen::Matrix<double,Eigen::Dynamic,1> scores(output_vocab.size());
+            prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
+            double logz = logsum(scores.col(0));
+            log_prob = weight * (scores(output, 0) - logz);
+        }
+        else
+        {
+            log_prob = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, 0);
+        }
+        stop_timer(3);
+
+        if (cache_size)
+        {
+            // Update cache
+            cache_keys.col(hash) = ngram;
+            cache_values[hash] = log_prob;
+        }
+
+        #ifdef __INTEL_MKL__
+        mkl_set_num_threads(save_mkl_threads);
+        #endif
+        Eigen::setNbThreads(save_eigen_threads);
+        omp_set_num_threads(save_threads);
+
+        return log_prob;
+    }
+
+    // Look up many n-grams in parallel: one n-gram per column of ngram (at most width
+    // columns); the score of column j is written to log_probs(0, j).
+    template <typename DerivedA, typename DerivedB>
+    void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const)
+    {
+        UNCONST(DerivedB, log_probs_const, log_probs);
+        assert (ngram.rows() == ngram_size);
+        assert (ngram.cols() <= width);
+
+        prop.fProp(ngram);
+
+        if (normalization)
+        {
+            Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> scores(output_vocab.size(), ngram.cols());
+            prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
+
+            // And softmax and loss
+            Matrix<double,Dynamic,Dynamic> output_probs(nn.output_vocab_size, ngram.cols());
+            double minibatch_log_likelihood;
+            SoftmaxLogLoss().fProp(scores.leftCols(ngram.cols()), ngram.row(nn.ngram_size-1), output_probs, minibatch_log_likelihood);
+            for (int j=0; j<ngram.cols(); j++)
+            {
+                int output = ngram(ngram_size-1, j);
+                log_probs(0, j) = weight * output_probs(output, j);
+            }
+        }
+        else
+        {
+            for (int j=0; j<ngram.cols(); j++)
+            {
+                int output = ngram(ngram_size-1, j);
+                log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, j);
+            }
+        }
+    }
+
+    // Scores the n-gram formed by the last ngram_size of the n ids in ngram_a. A shorter
+    // history is padded on the left with start if ngram_a begins with start, else with
+    // null; n == 0 pads every position with null instead of reading ngram_a[0].
+    double lookup_ngram(const int *ngram_a, int n)
+    {
+        const int pad = (n > 0 && ngram_a[0] == start) ? start : null;
+        for (int i=0; i<ngram_size; i++)
+        {
+            const int source = i-ngram_size+n;
+            ngram(i) = (source < 0) ? pad : ngram_a[source];
+        }
+        return lookup_ngram(ngram);
+    }
+
+    double lookup_ngram(const std::vector<int> &ngram_v)
+    {
+        return lookup_ngram(ngram_v.data(), ngram_v.size());
+    }
+
+    int get_order() const { return ngram_size; }
+
+    void read(const std::string &filename)
+    {
+        std::vector<std::string> input_words;
+        std::vector<std::string> output_words;
+        nn.read(filename, input_words, output_words);
+        set_input_vocabulary(vocabulary(input_words));
+        set_output_vocabulary(vocabulary(output_words));
+        resize();
+        // this is faster but takes more memory
+        //nn.premultiply();
+    }
+
+    // (Re)allocates a cache of cache_size slots, empties it and resets the hit statistics.
+    void set_cache(std::size_t cache_size)
+    {
+        this->cache_size = cache_size;
+        cache_keys.resize(ngram_size, cache_size);
+        cache_keys.fill(-1); // clears cache
+        cache_values.resize(cache_size);
+        cache_lookups = cache_hits = 0;
+    }
+
+    // Fraction of cache lookups that hit; 0 (rather than 0/0) when no lookup has been made.
+    double cache_hit_rate()
+    {
+        return cache_lookups ? static_cast<double>(cache_hits)/cache_lookups : 0.0;
+    }
+
+};
+
+// Writes (ngram_size-1) copies of start, then input, then one stop into output.
+// input is only read, so it is taken by const reference; const vectors and
+// temporaries are accepted. ngram_size must be at least 1.
+template <typename T>
+void addStartStop(const std::vector<T> &input, std::vector<T> &output, int ngram_size, const T &start, const T &stop)
+{
+    output.assign(input.size()+ngram_size, start);
+    std::copy(input.begin(), input.end(), output.begin()+ngram_size-1);
+    output.back() = stop;
+}
+
+// Collects every window of ngram_size consecutive items of input, in order, into output
+// (left empty when ngram_size < 1 or input holds fewer than ngram_size items).
+template <typename T>
+void makeNgrams(const std::vector<T> &input, std::vector<std::vector<T> > &output, int ngram_size)
+{
+  output.clear();
+  if (ngram_size < 1) return;
+  for (typename std::vector<T>::const_iterator first = input.begin(); input.end() - first >= ngram_size; ++first)
+      output.push_back(std::vector<T>(first, first + ngram_size));
+}
+
+// Turns one line of tokens into integer n-grams, replacing the contents of ngrams.
+//   numberize:      map each token through vocab; otherwise parse it as an integer.
+//   ngramize:       slide a window of ngram_size over the ids; otherwise the line must be one n-gram.
+//   add_start_stop: with ngramize, pad with "<s>" ids in front and one "</s>" id at the end.
+// Exits the process with status 1 when ngramize is false and the field count is wrong.
+inline void preprocessWords(const std::vector<std::string> &words, std::vector< std::vector<int> > &ngrams,
+                            int ngram_size, const vocabulary &vocab,
+                            bool numberize, bool add_start_stop, bool ngramize)
+{
+  const int start = vocab.lookup_word("<s>");
+  const int stop = vocab.lookup_word("</s>");
+
+  // convert words to ints
+  std::vector<int> nums;
+  for (std::vector<std::string>::const_iterator w = words.begin(); w != words.end(); ++w)
+    nums.push_back(numberize ? vocab.lookup_word(*w) : boost::lexical_cast<int>(*w));
+
+  ngrams.clear();
+  if (!ngramize)
+  {
+    // the line is taken as a single, already complete n-gram
+    if (nums.size() != ngram_size)
+    {
+      std::cerr << "error: wrong number of fields in line" << std::endl;
+      std::exit(1);
+    }
+    ngrams.push_back(nums);
+    return;
+  }
+
+  // convert sequence to n-grams
+  if (add_start_stop)
+  {
+    std::vector<int> snums;
+    addStartStop<int>(nums, snums, ngram_size, start, stop);
+    makeNgrams(snums, ngrams, ngram_size);
+  }
+  else
+    makeNgrams(nums, ngrams, ngram_size);
+}
+
+} // namespace nplm
+
+#endif
diff --git a/src/param.h b/src/param.h
new file mode 100644
index 0000000..b303514
--- /dev/null
+++ b/src/param.h
@@ -0,0 +1,58 @@
+#pragma once
+
+#include <string>
+
+namespace nplm
+{
+
+// Plain aggregate of configuration values.  The struct declares no
+// constructor and no default values, so every field is indeterminate until
+// it is assigned by the code that creates the object.
+// NOTE(review): the per-group notes below are inferred from field names
+// only; confirm against the code that fills and reads this struct.
+struct param 
+{
+    // presumably paths of the data sets
+    std::string train_file;
+    std::string validation_file;
+    std::string test_file;
+
+    // presumably path of a model file
+    std::string model_file;
+
+    // presumably paths of vocabulary-related files and an output prefix
+    std::string unigram_probs_file;
+    std::string words_file;
+    std::string input_words_file;
+    std::string output_words_file;
+    std::string model_prefix;
+
+    // presumably network dimensions and function names
+    int ngram_size;
+    int vocab_size;
+    int input_vocab_size;
+    int output_vocab_size;
+    int num_hidden;
+    int embedding_dimension;
+    int input_embedding_dimension;
+    int output_embedding_dimension;
+    std::string activation_function;
+    std::string loss_function;
+
+    // presumably training-schedule settings
+    int minibatch_size;
+    int validation_minibatch_size;
+    int num_epochs;
+    double learning_rate;
+
+    // presumably parameter-initialization settings
+    bool init_normal;
+    double init_range;
+
+    int num_noise_samples;
+
+    // presumably momentum settings
+    bool use_momentum;
+    double initial_momentum;
+    double final_momentum;
+
+    double L2_reg;
+
+    bool normalization;
+    double normalization_init;
+
+    int num_threads;
+  
+    bool share_embeddings;
+
+};
+
+} // namespace nplm
diff --git a/src/prepareNeuralLM.cpp b/src/prepareNeuralLM.cpp
new file mode 100644
index 0000000..94482d0
--- /dev/null
+++ b/src/prepareNeuralLM.cpp
@@ -0,0 +1,246 @@
+#include <iostream>
+#include <vector>
+#include <queue>
+#include <boost/unordered_map.hpp>
+#include <tclap/CmdLine.h>
+#include <boost/algorithm/string/join.hpp>
+
+#include "neuralLM.h"
+#include "util.h"
+
+using namespace std;
+using namespace TCLAP;
+using namespace boost;
+using namespace nplm;
+
+// Converts every sentence of data into integer n-grams with
+// preprocessWords() and writes them to filename, one n-gram per line.
+// Each of the ngram_size fields is followed by a single space, and the
+// stream is flushed after every line.  Exits with status 1 if the file
+// cannot be opened.
+void writeNgrams(const vector<vector<string> > &data, 
+		 int ngram_size, const vocabulary &vocab, 
+		 bool numberize, bool add_start_stop, bool ngramize, 
+		 const string &filename)
+{
+    ofstream out(filename.c_str());
+    if (!out)
+    {
+        cerr << "error: could not open " << filename << endl;
+        exit(1);
+    }
+
+    // reused for every sentence; preprocessWords() clears it first
+    vector<vector<int> > ngrams;
+    for (vector<vector<string> >::const_iterator sent = data.begin(); sent != data.end(); ++sent)
+    {
+        preprocessWords(*sent, ngrams, ngram_size, vocab, numberize, add_start_stop, ngramize);
+
+        // write out n-grams
+        for (vector<vector<int> >::const_iterator ng = ngrams.begin(); ng != ngrams.end(); ++ng)
+        {
+            for (int pos = 0; pos < ngram_size; ++pos)
+                out << (*ng)[pos] << " ";
+            out << endl;
+        }
+    }
+    out.close();
+}
+
+// Prepares n-gram training (and optionally validation) data for a neural
+// language model.
+//
+// Steps, in order:
+//   1. parse the command line with TCLAP and echo every option to stderr;
+//   2. read the training text; read --validation_text if given, otherwise
+//      hold out the last --validation_size lines of the training data;
+//   3. build the vocabulary, either from --words_file or from the most
+//      frequent words of the training data;
+//   4. optionally write the vocabulary, training and validation files.
+//
+// Exits with status 1 on a TCLAP argument error or when the requested
+// validation size exceeds the number of training lines.
+int main(int argc, char *argv[])
+{
+    int ngram_size, vocab_size, validation_size;
+    bool numberize, ngramize, add_start_stop;
+    string train_text, train_file, validation_text, validation_file, words_file, write_words_file;
+
+    try
+    {
+        CmdLine cmd("Prepares training data for training a language model.", ' ', "0.1");
+
+        // The options are printed in reverse order
+
+        ValueArg<bool> arg_ngramize("", "ngramize", "If true, convert lines to ngrams. Default: true.", false, true, "bool", cmd);
+        ValueArg<bool> arg_numberize("", "numberize", "If true, convert words to numbers. Default: true.", false, true, "bool", cmd);
+        ValueArg<bool> arg_add_start_stop("", "add_start_stop", "If true, prepend <s> and append </s>. Default: true.", false, true, "bool", cmd);
+
+        ValueArg<int> arg_vocab_size("", "vocab_size", "Vocabulary size.", false, -1, "int", cmd);
+        ValueArg<string> arg_words_file("", "words_file", "File specifying words that should be included in vocabulary; all other words will be replaced by <unk>.", false, "", "string", cmd);
+        ValueArg<int> arg_ngram_size("", "ngram_size", "Size of n-grams.", true, -1, "int", cmd);
+        ValueArg<string> arg_write_words_file("", "write_words_file", "Output vocabulary.", false, "", "string", cmd);
+        ValueArg<int> arg_validation_size("", "validation_size", "How many lines from training data to hold out for validation. Default: 0.", false, 0, "int", cmd);
+        ValueArg<string> arg_validation_file("", "validation_file", "Output validation data (numberized n-grams).", false, "", "string", cmd);
+        ValueArg<string> arg_validation_text("", "validation_text", "Input validation data (tokenized). Overrides --validation_size. Default: none.", false, "", "string", cmd);
+        ValueArg<string> arg_train_file("", "train_file", "Output training data (numberized n-grams).", false, "", "string", cmd);
+        ValueArg<string> arg_train_text("", "train_text", "Input training data (tokenized).", true, "", "string", cmd);
+
+        cmd.parse(argc, argv);
+
+        train_text = arg_train_text.getValue();
+        train_file = arg_train_file.getValue();
+        validation_text = arg_validation_text.getValue();
+        validation_file = arg_validation_file.getValue();
+        validation_size = arg_validation_size.getValue();
+        write_words_file = arg_write_words_file.getValue();
+        ngram_size = arg_ngram_size.getValue();
+        vocab_size = arg_vocab_size.getValue();
+        words_file = arg_words_file.getValue();
+        numberize = arg_numberize.getValue();
+        ngramize = arg_ngramize.getValue();
+        add_start_stop = arg_add_start_stop.getValue();
+
+        // check command line arguments
+
+        // Notes:
+        // - either --words_file or --vocab_size is required.
+        // - if --words_file is set,
+        // - if --vocab_size is not set, it is inferred from the length of the file
+        // - if --vocab_size is set, it is an error if the vocab file has a different number of lines
+        // - if --numberize 0 is set and --words_file f is not set, then the output model file will not have a vocabulary, and a warning should be printed.
+
+        // Notes:
+        // - if --ngramize 0 is set, then
+        // - if --ngram_size is not set, it is inferred from the training file (different from current)
+        // - if --ngram_size is set, it is an error if the training file has a different n-gram size
+        // - if neither --validation_file or --validation_size is set, validation will not be performed.
+        // - if --numberize 0 is set, then --validation_size cannot be used.
+
+        cerr << "Command line: " << endl;
+        cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl;
+
+        const string sep(" Value: ");
+        cerr << arg_train_text.getDescription() << sep << arg_train_text.getValue() << endl;
+        cerr << arg_train_file.getDescription() << sep << arg_train_file.getValue() << endl;
+        cerr << arg_validation_text.getDescription() << sep << arg_validation_text.getValue() << endl;
+        cerr << arg_validation_file.getDescription() << sep << arg_validation_file.getValue() << endl;
+        cerr << arg_validation_size.getDescription() << sep << arg_validation_size.getValue() << endl;
+        cerr << arg_write_words_file.getDescription() << sep << arg_write_words_file.getValue() << endl;
+        cerr << arg_ngram_size.getDescription() << sep << arg_ngram_size.getValue() << endl;
+        cerr << arg_vocab_size.getDescription() << sep << arg_vocab_size.getValue() << endl;
+        cerr << arg_words_file.getDescription() << sep << arg_words_file.getValue() << endl;
+        cerr << arg_numberize.getDescription() << sep << arg_numberize.getValue() << endl;
+        cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl;
+        cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl;
+    }
+    catch (TCLAP::ArgException &e)
+    {
+        cerr << "error: " << e.error() <<  " for arg " << e.argId() << endl;
+        exit(1);
+    }
+
+    // VLF: why is this true?
+    // DC: it's because the vocabulary has to be constructed from the training data only.
+    // If the vocabulary is preset, we can't create the validation data.
+    // - if --numberize 0 is set, then --validation_size cannot be used.
+    // if (!numberize && (validation_size > 0)) {
+    //     cerr <<  "Warning: without setting --numberize to 1, --validation_size cannot be used." << endl;
+    // }
+
+    // Read in training data and validation data
+    vector<vector<string> > train_data;
+    readSentFile(train_text, train_data);
+    for (int i=0; i<train_data.size(); i++) {
+        // if data is already ngramized, set/check ngram_size
+        if (!ngramize) {
+            if (ngram_size > 0) {
+                // a mismatch is only reported here; execution continues
+                if (ngram_size != train_data[i].size()) {
+                    cerr << "Error: size of training ngrams does not match specified value of --ngram_size!" << endl;
+                }
+            }
+            // else if --ngram_size has not been specified, set it now
+            else {
+                ngram_size=train_data[i].size();
+            }
+        }
+    }
+
+    vector<vector<string> > validation_data;
+    if (validation_text != "") {
+        readSentFile(validation_text, validation_data);
+        for (int i=0; i<validation_data.size(); i++) {
+            // if data is already ngramized, set/check ngram_size
+            if (!ngramize) {
+                // if --ngram_size has been specified, check that the line does not conflict with it
+                if (ngram_size > 0) {
+                    // a mismatch is only reported here; execution continues
+                    if (ngram_size != validation_data[i].size()) {
+                        cerr << "Error: size of validation ngrams does not match specified value of --ngram_size!" << endl;
+                    }
+                }
+                // else if --ngram_size has not been specified, set it now
+                else {
+                    ngram_size=validation_data[i].size();
+                }
+            }
+        }
+    }
+    else if (validation_size > 0)
+    {
+        // Create validation data from the last validation_size training lines
+        if (validation_size > train_data.size())
+        {
+            cerr << "error: requested validation size is greater than training data size" << endl;
+            exit(1);
+        }
+        validation_data.insert(validation_data.end(), train_data.end()-validation_size, train_data.end());
+        train_data.resize(train_data.size() - validation_size);
+    }
+
+    // Construct vocabulary
+    vocabulary vocab;
+
+    // read vocabulary from file
+    if (words_file != "") {
+        vector<string> words;
+        readWordsFile(words_file,words);
+        for(vector<string>::iterator it = words.begin(); it != words.end(); ++it) {
+            vocab.insert_word(*it);
+        }
+
+        // was vocab_size set? if so, verify that it does not conflict with size of vocabulary read from file
+        if (vocab_size > 0) {
+            // a mismatch is only reported; execution continues
+            if (vocab.size() != vocab_size) {
+                cerr << "Error: size of vocabulary file " << vocab.size() << " != --vocab_size " << vocab_size << endl;
+            }
+        }
+        // else, set it to the size of vocabulary read from file
+        else {
+            vocab_size = vocab.size();
+        }
+
+    }
+
+    // construct vocabulary to contain top <vocab_size> most frequent words; all other words replaced by <unk>
+    else {
+        vocab.insert_word("<s>");
+        vocab.insert_word("</s>");
+        vocab.insert_word("<null>");
+
+        // warn user that if --numberize is not set, there will be no vocabulary!
+        if (!numberize) {
+            // The inner quotes must be escaped: an unescaped "" would end the
+            // literal and start a new one, dropping the quotes from the output.
+            cerr << "Warning: with --numberize 0 and --words_file == \"\", there will be no vocabulary!" << endl;
+        }
+        // Qualified explicitly: both std and boost are imported wholesale in
+        // this file, and the container comes from <boost/unordered_map.hpp>.
+        boost::unordered_map<string,int> count;
+        for (int i=0; i<train_data.size(); i++) {
+            for (int j=0; j<train_data[i].size(); j++) {
+                count[train_data[i][j]] += 1;
+            }
+        }
+
+        vocab.insert_most_frequent(count, vocab_size);
+        if (vocab.size() < vocab_size) {
+            cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl;
+        }
+    }
+
+    // write vocabulary to file
+    if (write_words_file != "") {
+        cerr << "Writing vocabulary to " << write_words_file << endl;
+        writeWordsFile(vocab.words(), write_words_file);
+    }
+
+    // Write out numberized n-grams
+    if (train_file != "")
+    {
+        cerr << "Writing training data to " << train_file << endl;
+        writeNgrams(train_data, ngram_size, vocab, numberize, add_start_stop, ngramize, train_file);
+    }
+    if (validation_file != "")
+    {
+        cerr << "Writing validation data to " << validation_file << endl;
+        writeNgrams(validation_data, ngram_size, vocab, numberize, add_start_stop, ngramize, validation_file);
+    }
+}
diff --git a/src/prepareNeuralTM.cpp b/src/prepareNeuralTM.cpp
new file mode 100644
index 0000000..8d7cbf8
--- /dev/null
+++ b/src/prepareNeuralTM.cpp
@@ -0,0 +1,396 @@
+#include <iostream>
+#include <vector>
+#include <queue>
+#include <boost/unordered_map.hpp>
+#include <tclap/CmdLine.h>
+#include <boost/algorithm/string/join.hpp>
+
+using namespace std;
+using namespace TCLAP;
+
+#include "neuralLM.h" // for vocabulary
+#include "util.h"
+
+using namespace boost;
+using namespace nplm;
+
+// Writes the cross product of input and output n-grams to file: one line
+// per (input n-gram, output n-gram) pair, with all fields separated by
+// single spaces and no trailing space.  Every input n-gram is expected to
+// hold at least one element.
+template <typename T>
+static void writeNgramCrossProduct(ostream &file, const vector<vector<T> > &input_ngrams, const vector<vector<T> > &output_ngrams)
+{
+    typedef typename vector<T>::size_type size_type;
+
+    for (size_type j=0; j < input_ngrams.size(); j++) {
+        for (size_type k=0; k < output_ngrams.size(); k++) {
+            for (size_type j_prime=0; j_prime < input_ngrams[j].size(); j_prime++) {
+                if (j_prime > 0) {
+                    file << " ";
+                }
+                file << input_ngrams[j][j_prime];
+            }
+            for (size_type k_prime=0; k_prime < output_ngrams[k].size(); k_prime++) {
+                file << " " << output_ngrams[k][k_prime];
+            }
+            // '\n' rather than endl: same bytes, without a flush per line
+            file << '\n';
+        }
+    }
+}
+
+// Writes training examples pairing input context n-grams with output words.
+//
+// For every line i, input_data[i] is cut into windows of (ngram_size-1)
+// tokens and output_data[i] into single tokens; each combination of one
+// input window and one output token becomes one line of filename.
+//
+// numberize - true: tokens are written as ids from input_vocab/output_vocab;
+//             false: tokens are written as strings, and any token whose
+//             lookup result equals that of "<unk>" is written as "<unk>".
+// ngramize  - accepted for interface compatibility; not used by this function.
+//
+// Exits with status 1 if the file cannot be opened or if input_data and
+// output_data differ in number of lines.
+void writeNgrams(const vector<vector<string> > &input_data, const vector<vector<string> > &output_data, int ngram_size, const vocabulary &input_vocab, const vocabulary &output_vocab, bool numberize, bool ngramize, const string &filename)
+{
+    ofstream file(filename.c_str());
+    if (!file)
+    {
+        cerr << "error: could not open " << filename << endl;
+        exit(1);
+    }
+
+    // check that input and output data have the same number of sentences
+    if (input_data.size() != output_data.size()) {
+        cerr << "Error: input and output data files have different number of lines" << endl;
+        exit(1);
+    }
+
+    // for each input and output line
+    int lines=input_data.size();
+    if (numberize) {
+        for (int i=0; i<lines; i++) {
+            // convert each line to a set of ngrams
+            vector<vector<int> > input_ngrams;
+            vector<int> input_nums;
+            for (int j=0; j<input_data[i].size(); j++) {
+                input_nums.push_back(input_vocab.lookup_word(input_data[i][j]));
+            }
+            makeNgrams(input_nums, input_ngrams, ngram_size-1);
+
+            vector<vector<int> > output_ngrams;
+            vector<int> output_nums;
+            for (int j=0; j<output_data[i].size(); j++) {
+                output_nums.push_back(output_vocab.lookup_word(output_data[i][j]));
+            }
+            makeNgrams(output_nums, output_ngrams, 1);
+
+            // print out cross product of input and output ngrams
+            writeNgramCrossProduct(file, input_ngrams, output_ngrams);
+        }
+    }
+
+    else {
+        // The id of "<unk>" does not change between words, so look it up once
+        // per vocabulary instead of once per token.
+        const int input_unk = input_vocab.lookup_word("<unk>");
+        const int output_unk = output_vocab.lookup_word("<unk>");
+
+        for (int i=0; i<lines; i++) {
+            // convert each line to a set of ngrams
+            vector<vector<string> > input_ngrams;
+            vector<string> input_words;
+            for (int j=0; j<input_data[i].size(); j++) {
+                // if word is unknown
+                if (input_vocab.lookup_word(input_data[i][j]) == input_unk) {
+                    input_words.push_back("<unk>");
+                }
+                // if word is known
+                else {
+                    input_words.push_back(input_data[i][j]);
+                }
+            }
+            makeNgrams(input_words, input_ngrams, ngram_size-1);
+
+            vector<vector<string> > output_ngrams;
+            vector<string> output_words;
+            for (int j=0; j<output_data[i].size(); j++) {
+                // if word is unknown
+                if (output_vocab.lookup_word(output_data[i][j]) == output_unk) {
+                    output_words.push_back("<unk>");
+                }
+                // if word is known
+                else {
+                    output_words.push_back(output_data[i][j]);
+                }
+            }
+            makeNgrams(output_words, output_ngrams, 1);
+
+            // print out cross product of input and output ngrams
+            writeNgramCrossProduct(file, input_ngrams, output_ngrams);
+        }
+    }
+    file.close();
+}
+    
+// Prepares training (and optionally validation) data that pairs input
+// context n-grams with output words.
+//
+// Steps, in order:
+//   1. parse the command line with TCLAP, require a vocabulary source for
+//      both sides, and echo every option to stderr;
+//   2. read the input and output training text, optionally padding each
+//      line with addStartStop(); read the validation text if given,
+//      otherwise hold out the last --validation_size training lines;
+//   3. build the input and output vocabularies, each either from a words
+//      file or from the most frequent words of the training data;
+//   4. optionally write the vocabularies, training and validation files.
+//
+// Exits with status 1 on a TCLAP argument error, when neither a words file
+// nor a vocabulary size is given for one side, or when the requested
+// validation size exceeds the number of training lines.
+int main(int argc, char *argv[])
+{
+    int ngram_size, input_vocab_size, output_vocab_size, validation_size;
+    bool add_start_stop, numberize, ngramize;
+    string input_train_text, output_train_text, train_file, input_validation_text, output_validation_text, validation_file, write_input_words_file, write_output_words_file, input_words_file, output_words_file;
+
+    try
+    {
+        CmdLine cmd("Prepares training data for training a language model.", ' ', "0.1");
+
+        // The options are printed in reverse order
+
+        ValueArg<bool> arg_ngramize("", "ngramize", "If true, convert lines to ngrams. Default: true.", false, true, "bool", cmd);
+        ValueArg<bool> arg_numberize("", "numberize", "If true, convert words to numbers. Default: true.", false, true, "bool", cmd);
+        ValueArg<bool> arg_add_start_stop("", "add_start_stop", "If true, prepend (ngram_size-1) start symbols and postpend 1 stop symbol. Default: true.", false, true, "bool", cmd);
+        ValueArg<int> arg_input_vocab_size("", "input_vocab_size", "Vocabulary size.", false, -1, "int", cmd);
+        ValueArg<int> arg_output_vocab_size("", "output_vocab_size", "Vocabulary size.", false, -1, "int", cmd);
+        ValueArg<string> arg_input_words_file("", "input_words_file", "File specifying words that should be included in vocabulary; all other words will be replaced by <unk>.", false, "", "string", cmd);
+        ValueArg<string> arg_output_words_file("", "output_words_file", "File specifying words that should be included in vocabulary; all other words will be replaced by <unk>.", false, "", "string", cmd);
+        ValueArg<int> arg_ngram_size("", "ngram_size", "Size of n-grams.", true, -1, "int", cmd);
+        ValueArg<string> arg_write_input_words_file("", "write_input_words_file", "Output vocabulary.", false, "", "string", cmd);
+        ValueArg<string> arg_write_output_words_file("", "write_output_words_file", "Output vocabulary.", false, "", "string", cmd);
+        ValueArg<int> arg_validation_size("", "validation_size", "How many lines from training data to hold out for validation. Default: 0.", false, 0, "int", cmd);
+        ValueArg<string> arg_validation_file("", "validation_file", "Output validation data (numberized n-grams).", false, "", "string", cmd);
+        ValueArg<string> arg_input_validation_text("", "input_validation_text", "Input validation data (tokenized). Overrides --validation_size. Default: none.", false, "", "string", cmd);
+        ValueArg<string> arg_output_validation_text("", "output_validation_text", "Input validation data (tokenized). Overrides --validation_size. Default: none.", false, "", "string", cmd);
+        ValueArg<string> arg_train_file("", "train_file", "Output training data (numberized n-grams).", false, "", "string", cmd);
+        ValueArg<string> arg_input_train_text("", "input_train_text", "Input training data (tokenized).", true, "", "string", cmd);
+        ValueArg<string> arg_output_train_text("", "output_train_text", "Input training data (tokenized).", true, "", "string", cmd);
+
+        cmd.parse(argc, argv);
+
+        input_train_text = arg_input_train_text.getValue();
+        output_train_text = arg_output_train_text.getValue();
+        train_file = arg_train_file.getValue();
+        validation_file = arg_validation_file.getValue();
+        input_validation_text = arg_input_validation_text.getValue();
+        output_validation_text = arg_output_validation_text.getValue();
+        validation_size = arg_validation_size.getValue();
+        write_input_words_file = arg_write_input_words_file.getValue();
+        write_output_words_file = arg_write_output_words_file.getValue();
+        ngram_size = arg_ngram_size.getValue();
+        input_vocab_size = arg_input_vocab_size.getValue();
+        output_vocab_size = arg_output_vocab_size.getValue();
+        input_words_file = arg_input_words_file.getValue();
+        output_words_file = arg_output_words_file.getValue();
+        numberize = arg_numberize.getValue();
+        ngramize = arg_ngramize.getValue();
+        add_start_stop = arg_add_start_stop.getValue();
+
+        // check command line arguments
+
+        // Notes:
+        // - either --words_file or --vocab_size is required.
+        // - if --words_file is set,
+        // - if --vocab_size is not set, it is inferred from the length of the file
+        // - if --vocab_size is set, it is an error if the vocab file has a different number of lines
+        // - if --numberize 0 is set and --use_vocab f is not set, then the output model file will not have a vocabulary, and a warning should be printed.
+        if ((input_words_file == "") && (input_vocab_size == -1)) {
+            cerr << "Error: either --input_words_file or --input_vocab_size is required." << endl;
+            exit(1);
+        }
+        if ((output_words_file == "") && (output_vocab_size == -1)) {
+            cerr << "Error: either --output_words_file or --output_vocab_size is required." << endl;
+            exit(1);
+        }
+
+        // Notes:
+        // - if --ngramize 0 is set, then
+        // - if --ngram_size is not set, it is inferred from the training file (different from current)
+        // - if --ngram_size is set, it is an error if the training file has a different n-gram size
+        // - if neither --validation_file or --validation_size is set, validation will not be performed.
+        // - if --numberize 0 is set, then --validation_size cannot be used.
+
+        cerr << "Command line: " << endl;
+        cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl;
+
+        const string sep(" Value: ");
+        cerr << arg_input_train_text.getDescription() << sep << arg_input_train_text.getValue() << endl;
+        cerr << arg_output_train_text.getDescription() << sep << arg_output_train_text.getValue() << endl;
+        cerr << arg_train_file.getDescription() << sep << arg_train_file.getValue() << endl;
+        cerr << arg_input_validation_text.getDescription() << sep << arg_input_validation_text.getValue() << endl;
+        cerr << arg_output_validation_text.getDescription() << sep << arg_output_validation_text.getValue() << endl;
+        cerr << arg_validation_file.getDescription() << sep << arg_validation_file.getValue() << endl;
+        cerr << arg_validation_size.getDescription() << sep << arg_validation_size.getValue() << endl;
+        cerr << arg_write_input_words_file.getDescription() << sep << arg_write_input_words_file.getValue() << endl;
+        cerr << arg_write_output_words_file.getDescription() << sep << arg_write_output_words_file.getValue() << endl;
+        cerr << arg_ngram_size.getDescription() << sep << arg_ngram_size.getValue() << endl;
+        cerr << arg_input_vocab_size.getDescription() << sep << arg_input_vocab_size.getValue() << endl;
+        cerr << arg_output_vocab_size.getDescription() << sep << arg_output_vocab_size.getValue() << endl;
+        cerr << arg_input_words_file.getDescription() << sep << arg_input_words_file.getValue() << endl;
+        cerr << arg_output_words_file.getDescription() << sep << arg_output_words_file.getValue() << endl;
+        cerr << arg_numberize.getDescription() << sep << arg_numberize.getValue() << endl;
+        cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl;
+        cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl;
+    }
+    catch (TCLAP::ArgException &e)
+    {
+        cerr << "error: " << e.error() <<  " for arg " << e.argId() << endl;
+        exit(1);
+    }
+
+    // Read in input training data and validation data
+    vector<vector<string> > input_train_data;
+    readSentFile(input_train_text, input_train_data);
+    if (add_start_stop) {
+        // input lines are padded for the full n-gram order
+        for (int i=0; i<input_train_data.size(); i++) {
+            vector<string> input_train_data_start_stop;
+            addStartStop<string>(input_train_data[i], input_train_data_start_stop, ngram_size, "<s>", "</s>");
+            input_train_data[i]=input_train_data_start_stop;
+        }
+    }
+
+    vector<vector<string> > input_validation_data;
+    if (input_validation_text != "") {
+        readSentFile(input_validation_text, input_validation_data);
+        if (add_start_stop) {
+            for (int i=0; i<input_validation_data.size(); i++) {
+                vector<string> input_validation_data_start_stop;
+                addStartStop<string>(input_validation_data[i], input_validation_data_start_stop, ngram_size, "<s>", "</s>");
+                input_validation_data[i]=input_validation_data_start_stop;
+            }
+        }
+    }
+    else if (validation_size > 0)
+    {
+        // hold out the last validation_size (already padded) training lines
+        if (validation_size > input_train_data.size())
+        {
+            cerr << "error: requested input_validation size is greater than training data size" << endl;
+            exit(1);
+        }
+        input_validation_data.insert(input_validation_data.end(), input_train_data.end()-validation_size, input_train_data.end());
+        input_train_data.resize(input_train_data.size() - validation_size);
+    }
+
+    // Read in output training data and validation data
+    vector<vector<string> > output_train_data;
+    readSentFile(output_train_text, output_train_data);
+    if (add_start_stop) {
+        // output lines are padded with order 1, i.e. only a stop symbol is appended
+        for (int i=0; i<output_train_data.size(); i++) {
+            vector<string> output_train_data_start_stop;
+            addStartStop<string>(output_train_data[i], output_train_data_start_stop, 1, "<s>", "</s>");
+            output_train_data[i]=output_train_data_start_stop;
+        }
+    }
+
+    vector<vector<string> > output_validation_data;
+    if (output_validation_text != "") {
+        readSentFile(output_validation_text, output_validation_data);
+        if (add_start_stop) {
+            for (int i=0; i<output_validation_data.size(); i++) {
+                vector<string> output_validation_data_start_stop;
+                addStartStop<string>(output_validation_data[i], output_validation_data_start_stop, 1, "<s>", "</s>");
+                output_validation_data[i]=output_validation_data_start_stop;
+            }
+        }
+    }
+    else if (validation_size > 0)
+    {
+        if (validation_size > output_train_data.size())
+        {
+            cerr << "error: requested output_validation size is greater than training data size" << endl;
+            exit(1);
+        }
+        output_validation_data.insert(output_validation_data.end(), output_train_data.end()-validation_size, output_train_data.end());
+        output_train_data.resize(output_train_data.size() - validation_size);
+    }
+
+    // Construct input vocabulary
+    vocabulary input_vocab;
+    input_vocab.insert_word("<s>");
+    input_vocab.insert_word("</s>");
+    input_vocab.insert_word("<null>");
+
+    // read input vocabulary from file
+    if (input_words_file != "") {
+        vector<string> words;
+        readWordsFile(input_words_file,words);
+        for(vector<string>::iterator it = words.begin(); it != words.end(); ++it) {
+            input_vocab.insert_word(*it);
+        }
+        // was input_vocab_size set? if so, verify that it does not conflict with size of vocabulary read from file
+        if (input_vocab_size > 0) {
+            // a mismatch is only reported; execution continues
+            if (input_vocab.size() != input_vocab_size) {
+                cerr << "Error: size of input_vocabulary file " << input_vocab.size() << " != --input_vocab_size " << input_vocab_size << endl;
+            }
+        }
+        // else, set it to the size of vocabulary read from file
+        else {
+            input_vocab_size = input_vocab.size();
+        }
+    }
+
+    // or construct input vocabulary to contain top <input_vocab_size> most frequent words; all other words replaced by <unk>
+    else {
+        // Qualified explicitly: both std and boost are imported wholesale in
+        // this file, and the container comes from <boost/unordered_map.hpp>.
+        boost::unordered_map<string,int> count;
+        for (int i=0; i<input_train_data.size(); i++) {
+            for (int j=0; j<input_train_data[i].size(); j++) {
+                count[input_train_data[i][j]] += 1;
+            }
+        }
+
+        input_vocab.insert_most_frequent(count, input_vocab_size);
+        if (input_vocab.size() < input_vocab_size) {
+            cerr << "warning: fewer than " << input_vocab_size << " types in training data; the unknown word will not be learned" << endl;
+        }
+    }
+
+    // Construct output vocabulary
+    vocabulary output_vocab;
+    output_vocab.insert_word("<s>");
+    output_vocab.insert_word("</s>");
+    output_vocab.insert_word("<null>");
+
+    // read output vocabulary from file
+    if (output_words_file != "") {
+        vector<string> words;
+        readWordsFile(output_words_file,words);
+        for(vector<string>::iterator it = words.begin(); it != words.end(); ++it) {
+            output_vocab.insert_word(*it);
+        }
+        // was output_vocab_size set? if so, verify that it does not conflict with size of vocabulary read from file
+        if (output_vocab_size > 0) {
+            // a mismatch is only reported; execution continues
+            if (output_vocab.size() != output_vocab_size) {
+                cerr << "Error: size of output_vocabulary file " << output_vocab.size() << " != --output_vocab_size " << output_vocab_size << endl;
+            }
+        }
+        // else, set it to the size of vocabulary read from file
+        else {
+            output_vocab_size = output_vocab.size();
+        }
+    }
+
+    // or construct output vocabulary to contain top <output_vocab_size> most frequent words; all other words replaced by <unk>
+    else {
+        boost::unordered_map<string,int> count;
+        for (int i=0; i<output_train_data.size(); i++) {
+            for (int j=0; j<output_train_data[i].size(); j++) {
+                count[output_train_data[i][j]] += 1;
+            }
+        }
+
+        output_vocab.insert_most_frequent(count, output_vocab_size);
+        if (output_vocab.size() < output_vocab_size) {
+            cerr << "warning: fewer than " << output_vocab_size << " types in training data; the unknown word will not be learned" << endl;
+        }
+    }
+
+    // write input vocabulary to file
+    if (write_input_words_file != "") {
+        cerr << "Writing vocabulary to " << write_input_words_file << endl;
+        writeWordsFile(input_vocab.words(), write_input_words_file);
+    }
+
+    // write output vocabulary to file
+    if (write_output_words_file != "") {
+        cerr << "Writing vocabulary to " << write_output_words_file << endl;
+        writeWordsFile(output_vocab.words(), write_output_words_file);
+    }
+
+    // Write out input and output numberized n-grams
+    if (train_file != "")
+    {
+        cerr << "Writing training data to " << train_file << endl;
+        writeNgrams(input_train_data, output_train_data, ngram_size, input_vocab, output_vocab, numberize, ngramize, train_file);
+
+    }
+    if (validation_file != "")
+    {
+        cerr << "Writing validation data to " << validation_file << endl;
+        writeNgrams(input_validation_data, output_validation_data, ngram_size, input_vocab, output_vocab, numberize, ngramize, validation_file);
+    }
+}
diff --git a/src/propagator.h b/src/propagator.h
new file mode 100644
index 0000000..b79e84a
--- /dev/null
+++ b/src/propagator.h
@@ -0,0 +1,194 @@
+#ifndef NETWORK_H
+#define NETWORK_H
+
+#include "neuralClasses.h"
+#include "util.h"
+
+namespace nplm
+{
+
+// is this cheating?
+using Eigen::Matrix;
+using Eigen::MatrixBase;
+using Eigen::Dynamic;
+
+class propagator {
+    // Runs forward and backward propagation through the layers of a model.
+    // Each Node pairs a layer's parameters (param) with fProp/bProp buffers.
+    int minibatch_size;
+    model *pnn;  // not owned; null for a default-constructed propagator
+
+public:
+    Node<Input_word_embeddings> input_layer_node;
+    Node<Linear_layer> first_hidden_linear_node;
+    Node<Activation_function> first_hidden_activation_node;
+    Node<Linear_layer> second_hidden_linear_node;
+    Node<Activation_function> second_hidden_activation_node;
+    Node<Output_word_embeddings> output_layer_node;
+
+    // Leaves the propagator detached from any model (pnn is null).
+    propagator () : minibatch_size(0), pnn(0) { }
+
+    // Binds each node to its layer in nn; initializers are listed in member declaration order.
+    propagator (model &nn, int minibatch_size)
+      : minibatch_size(minibatch_size),
+        pnn(&nn),
+        input_layer_node(&nn.input_layer, minibatch_size),
+        first_hidden_linear_node(&nn.first_hidden_linear, minibatch_size),
+        first_hidden_activation_node(&nn.first_hidden_activation, minibatch_size),
+        second_hidden_linear_node(&nn.second_hidden_linear, minibatch_size),
+        second_hidden_activation_node(&nn.second_hidden_activation, minibatch_size),
+        output_layer_node(&nn.output_layer, minibatch_size)
+    { }
+
+    // This must be called if the underlying model is resized.
+    void resize(int minibatch_size) {
+      this->minibatch_size = minibatch_size;
+      input_layer_node.resize(minibatch_size);
+      first_hidden_linear_node.resize(minibatch_size);
+      first_hidden_activation_node.resize(minibatch_size);
+      second_hidden_linear_node.resize(minibatch_size);
+      second_hidden_activation_node.resize(minibatch_size);
+      output_layer_node.resize(minibatch_size);
+    }
+
+    // Resizes every node again using the stored minibatch size.
+    void resize() { resize(minibatch_size); }
+
+    // Forward pass through both hidden layers for one minibatch (data).
+    // Timers 0, 1 and 2 cover the input-embedding, first-hidden and second-hidden stages.
+    template <typename Derived>
+    void fProp(const MatrixBase<Derived> &data)
+    {
+        if (!pnn->premultiplied)
+        {
+            start_timer(0);
+            input_layer_node.param->fProp(data, input_layer_node.fProp_matrix);
+            stop_timer(0);
+
+            start_timer(1);
+            first_hidden_linear_node.param->fProp(input_layer_node.fProp_matrix,
+                                                  first_hidden_linear_node.fProp_matrix);
+        }
+        else
+        {
+            // Premultiplied model: munge() fills sparse_data from data and the first linear layer reads it directly.
+            USCMatrix<double> sparse_data;
+            input_layer_node.param->munge(data, sparse_data);
+
+            start_timer(1);
+            first_hidden_linear_node.param->fProp(sparse_data,
+                                                  first_hidden_linear_node.fProp_matrix);
+        }
+        first_hidden_activation_node.param->fProp(first_hidden_linear_node.fProp_matrix,
+                                                  first_hidden_activation_node.fProp_matrix);
+        stop_timer(1);
+
+        start_timer(2);
+        second_hidden_linear_node.param->fProp(first_hidden_activation_node.fProp_matrix,
+                                               second_hidden_linear_node.fProp_matrix);
+        second_hidden_activation_node.param->fProp(second_hidden_linear_node.fProp_matrix,
+                                                   second_hidden_activation_node.fProp_matrix);
+        stop_timer(2);
+
+        // The propagation stops here because the last layer is very expensive.
+    }
+
+    // Dense version (for standard log-likelihood)
+    // Runs the output layer's bProp and computeGradient on output, then bPropRest for the other layers.
+    template <typename DerivedIn, typename DerivedOut>
+    void bProp(const MatrixBase<DerivedIn> &data,
+               const MatrixBase<DerivedOut> &output,
+               double learning_rate, double momentum, double L2_reg)
+    {
+        // Output embedding layer
+        start_timer(7);
+        output_layer_node.param->bProp(output,
+                                       output_layer_node.bProp_matrix);
+        stop_timer(7);
+
+        start_timer(8);
+        output_layer_node.param->computeGradient(second_hidden_activation_node.fProp_matrix,
+                                                 output,
+                                                 learning_rate, momentum);
+        stop_timer(8);
+
+        bPropRest(data, learning_rate, momentum, L2_reg);
+    }
+
+    // Sparse version (for NCE log-likelihood)
+    // Like the dense overload, but the output layer is given samples and weights instead of a dense output matrix.
+    template <typename DerivedIn, typename DerivedOutI, typename DerivedOutV>
+    void bProp(const MatrixBase<DerivedIn> &data,
+               const MatrixBase<DerivedOutI> &samples, const MatrixBase<DerivedOutV> &weights,
+               double learning_rate, double momentum, double L2_reg)
+    {
+        // Output embedding layer
+        start_timer(7);
+        output_layer_node.param->bProp(samples, weights,
+                                       output_layer_node.bProp_matrix);
+        stop_timer(7);
+
+        start_timer(8);
+        output_layer_node.param->computeGradient(second_hidden_activation_node.fProp_matrix,
+                                                 samples, weights,
+                                                 learning_rate, momentum);
+        stop_timer(8);
+
+        bPropRest(data, learning_rate, momentum, L2_reg);
+    }
+
+private:
+    // Shared tail of both bProp overloads: walks from output_layer_node.bProp_matrix
+    // back through the second and first hidden layers to the input embeddings,
+    // calling each layer's bProp and then its computeGradient (timers 9-13).
+    template <typename DerivedIn>
+    void bPropRest(const MatrixBase<DerivedIn> &data,
+                   double learning_rate, double momentum, double L2_reg)
+    {
+        // Second hidden layer
+        start_timer(9);
+        second_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix,
+                                                   second_hidden_activation_node.bProp_matrix,
+                                                   second_hidden_linear_node.fProp_matrix,
+                                                   second_hidden_activation_node.fProp_matrix);
+
+        second_hidden_linear_node.param->bProp(second_hidden_activation_node.bProp_matrix,
+                                               second_hidden_linear_node.bProp_matrix);
+        stop_timer(9);
+
+        start_timer(10);
+        second_hidden_linear_node.param->computeGradient(second_hidden_activation_node.bProp_matrix,
+                                                         first_hidden_activation_node.fProp_matrix,
+                                                         learning_rate, momentum, L2_reg);
+        stop_timer(10);
+
+        // First hidden layer
+        start_timer(11);
+        first_hidden_activation_node.param->bProp(second_hidden_linear_node.bProp_matrix,
+                                                  first_hidden_activation_node.bProp_matrix,
+                                                  first_hidden_linear_node.fProp_matrix,
+                                                  first_hidden_activation_node.fProp_matrix);
+
+        first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix,
+                                              first_hidden_linear_node.bProp_matrix);
+        stop_timer(11);
+
+        start_timer(12);
+        first_hidden_linear_node.param->computeGradient(first_hidden_activation_node.bProp_matrix,
+                                                        input_layer_node.fProp_matrix,
+                                                        learning_rate, momentum, L2_reg);
+        stop_timer(12);
+
+        // Input word embeddings
+        start_timer(13);
+        input_layer_node.param->computeGradient(first_hidden_linear_node.bProp_matrix,
+                                                data,
+                                                learning_rate, momentum, L2_reg);
+        stop_timer(13);
+    }
+};
+
+} // namespace nplm
+
+#endif
diff --git a/src/python/nplm.pxd b/src/python/nplm.pxd
new file mode 100644
index 0000000..5cbead7
--- /dev/null
+++ b/src/python/nplm.pxd
@@ -0,0 +1,23 @@
+from libcpp.string cimport string
+from libcpp.vector cimport vector
+
+cdef extern from "neuralLM.h":  # declarations for names provided by the C++ header neuralLM.h
+    cdef cppclass c_neuralLM "nplm::neuralLM":  # Cython-side name c_neuralLM for the C++ class nplm::neuralLM
+        c_neuralLM()
+        void set_normalization(bint)
+        void set_map_digits(char)
+        void set_log_base(double)
+        void read(string filename) except +  # `except +`: a C++ exception is turned into a Python exception
+        int get_order()
+        int lookup_word(string)
+        float lookup_ngram(vector[int])  # NOTE(review): declared float here; confirm the C++ return type in neuralLM.h
+        float lookup_ngram(int *, int)  # pointer + length overload, used by NeuralLM.c_lookup_ngram
+        void set_cache(int)
+        double cache_hit_rate()
+
+cdef class NeuralLM:  # extension type wrapping a c_neuralLM object
+    cdef c_neuralLM *thisptr  # pointer to the wrapped C++ object (allocated with `new` in __cinit__)
+    cdef int c_lookup_word(self, char *s)  # low-level entry point, callable from other Cython modules
+    cdef float c_lookup_ngram(self, int *words, int n)  # low-level entry point taking a pointer and a length
+    cdef readonly int order  # readable from Python; assigned in read()
+    
diff --git a/src/python/nplm.pyx b/src/python/nplm.pyx
new file mode 100644
index 0000000..290d56c
--- /dev/null
+++ b/src/python/nplm.pyx
@@ -0,0 +1,52 @@
+# distutils: language = c++
+
+cdef class NeuralLM:
+    """Python wrapper around the C++ class nplm::neuralLM."""
+
+    def __cinit__(self, normalization=False, map_digits=None, cache_size=0):
+        # The log base is fixed to 10. map_digits is applied only when it is a
+        # one-character str, and the cache only when cache_size is non-zero.
+        self.thisptr = new c_neuralLM()
+        self.thisptr.set_normalization(normalization)
+        self.thisptr.set_log_base(10.)
+        if type(map_digits) is str and len(map_digits) == 1:
+            self.thisptr.set_map_digits(map_digits)
+        if cache_size:
+            self.thisptr.set_cache(cache_size)
+
+    def __dealloc__(self):
+        # Free the C++ object allocated in __cinit__; without this every
+        # NeuralLM instance leaked its c_neuralLM. Deleting NULL is a no-op.
+        del self.thisptr
+
+    def read(self, filename):
+        """Load the model from filename and store its order in self.order."""
+        self.thisptr.read(filename)
+        self.order = self.thisptr.get_order()
+
+    def get_order(self):
+        """Return the order reported by the underlying model."""
+        return self.thisptr.get_order()
+
+    def lookup_word(self, s):
+        """Return the integer the underlying model's lookup_word gives for s."""
+        return self.thisptr.lookup_word(s)
+
+    def lookup_ngram(self, words):
+        """Return the model's lookup_ngram value for a non-empty sequence of ints.
+
+        Raises ValueError when words is empty.
+        """
+        if len(words) == 0:
+            raise ValueError("ngram is empty")
+        return self.thisptr.lookup_ngram(words)
+
+    def cache_hit_rate(self):
+        """Return the cache hit rate reported by the underlying model."""
+        return self.thisptr.cache_hit_rate()
+
+    # low-level interface that can be called by other Cython modules
+    cdef int c_lookup_word(self, char *s):
+        cdef string ss
+        ss.assign(s)
+        return self.thisptr.lookup_word(ss)
+
+    cdef float c_lookup_ngram(self, int *words, int n):
+        return self.thisptr.lookup_ngram(words, n)
diff --git a/src/shared/.gitignore b/src/shared/.gitignore
new file mode 100755
index 0000000..e69de29
--- /dev/null
+++ b/src/shared/.gitignore
diff --git a/src/testNeuralLM.cpp b/src/testNeuralLM.cpp
new file mode 100644
index 0000000..9ab3766
--- /dev/null
+++ b/src/testNeuralLM.cpp
@@ -0,0 +1,164 @@
+#include <algorithm>
+#include <fstream>
+
+#include <boost/algorithm/string/join.hpp>
+#include <tclap/CmdLine.h>
+
+#include <Eigen/Core>
+#include <Eigen/Dense>
+
+#include "param.h"
+
+#include "neuralLM.h"
+
+using namespace std;
+using namespace boost;
+using namespace TCLAP;
+using namespace Eigen;
+
+using namespace nplm;
+
+// Scores a test file with a trained neuralLM: writes one log10 probability per
+// sentence to stdout and the total log10-likelihood to stderr.
+int main (int argc, char *argv[])
+{
+    param myParam;
+    bool normalization;
+    bool numberize, ngramize, add_start_stop;
+
+    try {
+      // program options //
+      CmdLine cmd("Tests a two-layer neural probabilistic language model.", ' ' , "0.1");
+
+      ValueArg<int> num_threads("", "num_threads", "Number of threads. Default: maximum.", false, 0, "int", cmd);
+      ValueArg<int> minibatch_size("", "minibatch_size", "Minibatch size. Default: none.", false, 0, "int", cmd);
+      ValueArg<bool> arg_ngramize("", "ngramize", "If true, convert lines to ngrams. Default: true.", false, true, "bool", cmd);
+      ValueArg<bool> arg_numberize("", "numberize", "If true, convert words to numbers. Default: true.", false, true, "bool", cmd);
+      ValueArg<bool> arg_add_start_stop("", "add_start_stop", "If true, prepend <s> and append </s>. Default: true.", false, true, "bool", cmd);
+      ValueArg<bool> arg_normalization("", "normalization", "Normalize probabilities. 1 = yes, 0 = no. Default: 0.", false, 0, "bool", cmd);
+      ValueArg<string> arg_test_file("", "test_file", "Test file (one tokenized sentence per line).", true, "", "string", cmd);
+      ValueArg<string> arg_model_file("", "model_file", "Language model file.", true, "", "string", cmd);
+
+      cmd.parse(argc, argv);
+
+      myParam.model_file = arg_model_file.getValue();
+      myParam.test_file = arg_test_file.getValue();
+
+      normalization = arg_normalization.getValue();
+      numberize = arg_numberize.getValue();
+      ngramize = arg_ngramize.getValue();
+      add_start_stop = arg_add_start_stop.getValue();
+
+      myParam.minibatch_size = minibatch_size.getValue();
+      myParam.num_threads = num_threads.getValue();
+
+      cerr << "Command line: " << endl;
+      cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl;
+
+      const string sep(" Value: ");
+      cerr << arg_test_file.getDescription() << sep << arg_test_file.getValue() << endl;
+      cerr << arg_model_file.getDescription() << sep << arg_model_file.getValue() << endl;
+      cerr << arg_normalization.getDescription() << sep << arg_normalization.getValue() << endl;
+      cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl;
+      cerr << arg_numberize.getDescription() << sep << arg_numberize.getValue() << endl;
+      cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl;
+      cerr << minibatch_size.getDescription() << sep << minibatch_size.getValue() << endl;
+      cerr << num_threads.getDescription() << sep << num_threads.getValue() << endl;
+    }
+    catch (TCLAP::ArgException &e)
+    {
+      cerr << "error: " << e.error() <<  " for arg " << e.argId() << endl;
+      exit(1);
+    }
+
+    myParam.num_threads = setup_threads(myParam.num_threads);
+
+    ///// Create language model
+    neuralLM lm;
+    lm.read(myParam.model_file);
+    lm.set_normalization(normalization);
+    lm.set_log_base(10);
+    lm.set_cache(1048576);
+    int ngram_size = lm.get_order();
+    int minibatch_size = myParam.minibatch_size;
+    if (minibatch_size)
+        lm.set_width(minibatch_size);
+
+    ///// Read test data
+    double log_likelihood = 0.0;
+
+    ifstream test_file(myParam.test_file.c_str());
+    if (!test_file)
+    {
+        cerr << "error: could not open " << myParam.test_file << endl;
+        exit(1);
+    }
+    string line;
+
+    // start[i] is the index in ngrams of sentence i's first n-gram; the last entry is ngrams.size().
+    vector<int> start;
+    vector<vector<int> > ngrams;
+
+    while (getline(test_file, line))
+    {
+        vector<string> words;
+        splitBySpace(line, words);
+
+        vector<vector<int> > sent_ngrams;
+        preprocessWords(words, sent_ngrams, ngram_size, lm.get_vocabulary(), numberize, add_start_stop, ngramize);
+
+        start.push_back(ngrams.size());
+        copy(sent_ngrams.begin(), sent_ngrams.end(), back_inserter(ngrams));
+    }
+    start.push_back(ngrams.size());
+
+    if (minibatch_size == 0)
+    {
+        // Score one n-gram at a time. This is how the LM would be queried from a decoder.
+        for (int sent_id=0; sent_id<start.size()-1; sent_id++)
+        {
+            double sent_log_prob = 0.0;
+            for (int j=start[sent_id]; j<start[sent_id+1]; j++)
+                sent_log_prob += lm.lookup_ngram(ngrams[j]);
+            cout << sent_log_prob << endl;
+            log_likelihood += sent_log_prob;
+        }
+    }
+    else
+    {
+        // Score a whole minibatch at a time.
+        Matrix<double,1,Dynamic> log_probs(ngrams.size());
+        Matrix<int,Dynamic,Dynamic> minibatch(ngram_size, minibatch_size);
+        minibatch.setZero();
+        for (int test_id = 0; test_id < ngrams.size(); test_id += minibatch_size)
+        {
+            int current_minibatch_size = minibatch_size<ngrams.size()-test_id ? minibatch_size : ngrams.size()-test_id;
+            for (int j=0; j<current_minibatch_size; j++)
+            {
+                if (static_cast<int>(ngrams[test_id+j].size()) < ngram_size)  // the Map below reads ngram_size ints
+                {
+                    cerr << "error: n-gram " << test_id+j << " has fewer than " << ngram_size << " words" << endl;
+                    exit(1);
+                }
+                minibatch.col(j) = Map< Matrix<int,Dynamic,1> > (ngrams[test_id+j].data(), ngram_size);
+            }
+            lm.lookup_ngram(minibatch.leftCols(current_minibatch_size), log_probs.middleCols(test_id, current_minibatch_size));
+        }
+        for (int sent_id=0; sent_id<start.size()-1; sent_id++)
+        {
+            double sent_log_prob = 0.0;
+            for (int j=start[sent_id]; j<start[sent_id+1]; j++)
+                sent_log_prob += log_probs[j];
+            cout << sent_log_prob << endl;
+            log_likelihood += sent_log_prob;
+        }
+    }
+
+    cerr << "Test log10-likelihood: " << log_likelihood << endl;
+    #ifdef USE_CHRONO
+    cerr << "Propagation times:";
+    for (int i=0; i<timer.size(); i++)
+      cerr << " " << timer.get(i);
+    cerr << endl;
+    #endif
+}
diff --git a/src/testNeuralNetwork.cpp b/src/testNeuralNetwork.cpp
new file mode 100644
index 0000000..f20fff9
--- /dev/null
+++ b/src/testNeuralNetwork.cpp
@@ -0,0 +1,119 @@
+#include <tclap/CmdLine.h>
+#include <boost/algorithm/string/join.hpp>
+#include <boost/lexical_cast.hpp>
+#include <fstream>
+
+#include "model.h"
+#include "propagator.h"
+#include "neuralClasses.h"
+#include "param.h"
+#include "util.h"
+
+using namespace std;
+using namespace boost;
+using namespace TCLAP;
+using namespace Eigen;
+
+using namespace nplm;
+
+// Computes the log-likelihood of a numberized test file under a trained model,
+// scoring it one minibatch at a time, and reports the total on stderr.
+int main (int argc, char *argv[])
+{
+    param myParam;
+
+    try {
+      // program options //
+      CmdLine cmd("Tests a two-layer neural probabilistic language model.", ' ' , "0.1");
+
+      ValueArg<int> num_threads("", "num_threads", "Number of threads. Default: maximum.", false, 0, "int", cmd);
+      ValueArg<int> minibatch_size("", "minibatch_size", "Minibatch size. Default: 64.", false, 64, "int", cmd);
+      ValueArg<string> arg_test_file("", "test_file", "Test file (one numberized example per line).", true, "", "string", cmd);
+      ValueArg<string> arg_model_file("", "model_file", "Model file.", true, "", "string", cmd);
+
+      cmd.parse(argc, argv);
+
+      myParam.model_file = arg_model_file.getValue();
+      myParam.test_file = arg_test_file.getValue();
+
+      myParam.num_threads  = num_threads.getValue();
+      myParam.minibatch_size = minibatch_size.getValue();
+
+      cerr << "Command line: " << endl;
+      cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl;
+
+      const string sep(" Value: ");
+      cerr << arg_model_file.getDescription() << sep << arg_model_file.getValue() << endl;
+      cerr << arg_test_file.getDescription() << sep << arg_test_file.getValue() << endl;
+      cerr << minibatch_size.getDescription() << sep << minibatch_size.getValue() << endl;
+      cerr << num_threads.getDescription() << sep << num_threads.getValue() << endl;
+    }
+    catch (TCLAP::ArgException &e)
+    {
+      cerr << "error: " << e.error() <<  " for arg " << e.argId() << endl;
+      exit(1);
+    }
+
+    if (myParam.minibatch_size <= 0)  // used below as a divisor and as a matrix dimension
+    {
+        cerr << "error: minibatch_size must be positive" << endl;
+        exit(1);
+    }
+
+    myParam.num_threads = setup_threads(myParam.num_threads);
+
+    ///// Create network and propagator
+    model nn;
+    nn.read(myParam.model_file);
+    propagator prop(nn, myParam.minibatch_size);
+
+    ///// Set param values according to what was read in from model file
+    myParam.ngram_size = nn.ngram_size;
+    myParam.input_vocab_size = nn.input_vocab_size;
+    myParam.output_vocab_size = nn.output_vocab_size;
+    myParam.num_hidden = nn.num_hidden;
+    myParam.input_embedding_dimension = nn.input_embedding_dimension;
+    myParam.output_embedding_dimension = nn.output_embedding_dimension;
+
+    ///// Read test data
+    vector<int> test_data_flat;
+    readDataFile(myParam.test_file, myParam.ngram_size, test_data_flat);
+    int test_data_size = test_data_flat.size() / myParam.ngram_size;
+    cerr << "Number of test instances: " << test_data_size << endl;
+
+    Map< Matrix<int,Dynamic,Dynamic> > test_data(test_data_flat.data(), myParam.ngram_size, test_data_size);
+
+    ///// Score test data
+    // Ceiling division; unlike (n-1)/size + 1 it yields 0 batches for an empty test set.
+    int num_batches = (test_data_size + myParam.minibatch_size - 1) / myParam.minibatch_size;
+    cerr<<"Number of test minibatches: "<<num_batches<<endl;
+
+    double log_likelihood = 0.0;
+    Matrix<double,Dynamic,Dynamic> scores(nn.output_vocab_size, myParam.minibatch_size);
+    Matrix<double,Dynamic,Dynamic> output_probs(nn.output_vocab_size, myParam.minibatch_size);
+
+    for (int batch = 0; batch < num_batches; batch++)
+    {
+        int minibatch_start_index = myParam.minibatch_size * batch;
+        int current_minibatch_size = min(myParam.minibatch_size,
+                                         test_data_size - minibatch_start_index);
+        Matrix<int,Dynamic,Dynamic> minibatch = test_data.middleCols(minibatch_start_index, current_minibatch_size);
+
+        prop.fProp(minibatch.topRows(myParam.ngram_size-1));
+
+        // Do full forward prop through output word embedding layer
+        prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
+
+        // And softmax and loss
+        double minibatch_log_likelihood;
+        SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size),
+                               minibatch.row(myParam.ngram_size-1),
+                               output_probs,
+                               minibatch_log_likelihood);
+        log_likelihood += minibatch_log_likelihood;
+
+        /*for (int i=0; i<current_minibatch_size; i++)
+          cerr << minibatch.block(0,i,myParam.ngram_size,1) << " " << output_probs(minibatch(myParam.ngram_size-1,i),i) << endl;*/
+    }
+    cerr << "Test log-likelihood: " << log_likelihood << endl;
+}
diff --git a/src/trainNeuralNetwork.cpp b/src/trainNeuralNetwork.cpp
new file mode 100644
index 0000000..f45bc71
--- /dev/null
+++ b/src/trainNeuralNetwork.cpp
@@ -0,0 +1,584 @@
+#include <ctime>
+#include <cmath>
+
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <algorithm>
+
+#include <boost/unordered_map.hpp> 
+#include <boost/functional.hpp>
+#include <boost/lexical_cast.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/algorithm/string/join.hpp>
+
+#include <Eigen/Dense>
+#include <Eigen/Sparse>
+#include "maybe_omp.h"
+#include <tclap/CmdLine.h>
+
+#include "model.h"
+#include "propagator.h"
+#include "param.h"
+#include "neuralClasses.h"
+#include "graphClasses.h"
+#include "util.h"
+#include "multinomial.h"
+//#include "gradientCheck.h"
+
+//#define EIGEN_DONT_PARALLELIZE
+
+using namespace std;
+using namespace TCLAP;
+using namespace Eigen;
+using namespace boost;
+using namespace boost::random;
+
+using namespace nplm;
+
+typedef unordered_map<Matrix<int,Dynamic,1>, double> vector_map;
+
+typedef long long int data_size_t; // training data can easily exceed 2G instances
+
+int main(int argc, char** argv)
+{ 
+    param myParam;
+    try {
+      // program options //
+      CmdLine cmd("Trains a two-layer neural probabilistic language model.", ' ' , "0.1");
+
+      // The options are printed in reverse order
+
+      ValueArg<string> unigram_probs_file("", "unigram_probs_file", "Unigram model (deprecated and ignored)." , false, "", "string", cmd);
+
+      ValueArg<int> num_threads("", "num_threads", "Number of threads. Default: maximum.", false, 0, "int", cmd);
+
+      ValueArg<double> final_momentum("", "final_momentum", "Final value of momentum. Default: 0.9.", false, 0.9, "double", cmd);
+      ValueArg<double> initial_momentum("", "initial_momentum", "Initial value of momentum. Default: 0.9.", false, 0.9, "double", cmd);
+      ValueArg<bool> use_momentum("", "use_momentum", "Use momentum (hidden layer weights only). 1 = yes, 0 = no. Default: 0.", false, 0, "bool", cmd);
+
+      ValueArg<double> normalization_init("", "normalization_init", "Initial normalization parameter. Default: 0.", false, 0.0, "double", cmd);
+      ValueArg<bool> normalization("", "normalization", "Learn individual normalization factors during training. 1 = yes, 0 = no. Default: 0.", false, 0, "bool", cmd);
+
+      ValueArg<int> num_noise_samples("", "num_noise_samples", "Number of noise samples for noise-contrastive estimation. Default: 25.", false, 25, "int", cmd);
+
+      ValueArg<double> L2_reg("", "L2_reg", "L2 regularization strength (hidden layer weights only). Default: 0.", false, 0.0, "double", cmd);
+
+      ValueArg<double> learning_rate("", "learning_rate", "Learning rate for stochastic gradient ascent. Default: 0.01.", false, 0.01, "double", cmd);
+
+      ValueArg<int> validation_minibatch_size("", "validation_minibatch_size", "Minibatch size for validation. Default: 64.", false, 64, "int", cmd);
+      ValueArg<int> minibatch_size("", "minibatch_size", "Minibatch size (for training). Default: 64.", false, 64, "int", cmd);
+
+      ValueArg<int> num_epochs("", "num_epochs", "Number of epochs. Default: 10.", false, 10, "int", cmd);
+
+      ValueArg<double> init_range("", "init_range", "Maximum (of uniform) or standard deviation (of normal) for initialization. Default: 0.01", false, 0.01, "double", cmd);
+      ValueArg<bool> init_normal("", "init_normal", "Initialize parameters from a normal distribution. 1 = normal, 0 = uniform. Default: 0.", false, 0, "bool", cmd);
+
+      ValueArg<string> loss_function("", "loss_function", "Loss function (log, nce). Default: nce.", false, "nce", "string", cmd);
+      ValueArg<string> activation_function("", "activation_function", "Activation function (identity, rectifier, tanh, hardtanh). Default: rectifier.", false, "rectifier", "string", cmd);
+      ValueArg<int> num_hidden("", "num_hidden", "Number of hidden nodes. Default: 100.", false, 100, "int", cmd);
+
+      ValueArg<bool> share_embeddings("", "share_embeddings", "Share input and output embeddings. 1 = yes, 0 = no. Default: 0.", false, 0, "bool", cmd);
+      ValueArg<int> output_embedding_dimension("", "output_embedding_dimension", "Number of output embedding dimensions. Default: 50.", false, 50, "int", cmd);
+      ValueArg<int> input_embedding_dimension("", "input_embedding_dimension", "Number of input embedding dimensions. Default: 50.", false, 50, "int", cmd);
+      ValueArg<int> embedding_dimension("", "embedding_dimension", "Number of input and output embedding dimensions. Default: none.", false, -1, "int", cmd);
+
+      ValueArg<int> vocab_size("", "vocab_size", "Vocabulary size. Default: auto.", false, 0, "int", cmd);
+      ValueArg<int> input_vocab_size("", "input_vocab_size", "Vocabulary size. Default: auto.", false, 0, "int", cmd);
+      ValueArg<int> output_vocab_size("", "output_vocab_size", "Vocabulary size. Default: auto.", false, 0, "int", cmd);
+      ValueArg<int> ngram_size("", "ngram_size", "Size of n-grams. Default: auto.", false, 0, "int", cmd);
+
+      ValueArg<string> model_prefix("", "model_prefix", "Prefix for output model files." , false, "", "string", cmd);
+      ValueArg<string> words_file("", "words_file", "Vocabulary." , false, "", "string", cmd);
+      ValueArg<string> input_words_file("", "input_words_file", "Vocabulary." , false, "", "string", cmd);
+      ValueArg<string> output_words_file("", "output_words_file", "Vocabulary." , false, "", "string", cmd);
+      ValueArg<string> validation_file("", "validation_file", "Validation data (one numberized example per line)." , false, "", "string", cmd);
+      ValueArg<string> train_file("", "train_file", "Training data (one numberized example per line)." , true, "", "string", cmd);
+
+      cmd.parse(argc, argv);
+
+      // define program parameters //
+      myParam.train_file = train_file.getValue();
+      myParam.validation_file = validation_file.getValue();
+      myParam.input_words_file = input_words_file.getValue();
+      myParam.output_words_file = output_words_file.getValue();
+      if (words_file.getValue() != "")
+	  myParam.input_words_file = myParam.output_words_file = words_file.getValue();
+
+      myParam.model_prefix = model_prefix.getValue();
+
+      myParam.ngram_size = ngram_size.getValue();
+      myParam.vocab_size = vocab_size.getValue();
+      myParam.input_vocab_size = input_vocab_size.getValue();
+      myParam.output_vocab_size = output_vocab_size.getValue();
+      if (vocab_size.getValue() >= 0)
+	  myParam.input_vocab_size = myParam.output_vocab_size = vocab_size.getValue();
+
+      myParam.num_hidden = num_hidden.getValue();
+      myParam.activation_function = activation_function.getValue();
+      myParam.loss_function = loss_function.getValue();
+
+      myParam.num_threads = num_threads.getValue();
+
+      myParam.num_noise_samples = num_noise_samples.getValue();
+
+      myParam.input_embedding_dimension = input_embedding_dimension.getValue();
+      myParam.output_embedding_dimension = output_embedding_dimension.getValue();
+      if (embedding_dimension.getValue() >= 0)
+	      myParam.input_embedding_dimension = myParam.output_embedding_dimension = embedding_dimension.getValue();
+
+      myParam.minibatch_size = minibatch_size.getValue();
+      myParam.validation_minibatch_size = validation_minibatch_size.getValue();
+      myParam.num_epochs= num_epochs.getValue();
+      myParam.learning_rate = learning_rate.getValue();
+      myParam.use_momentum = use_momentum.getValue();
+      myParam.share_embeddings = share_embeddings.getValue();
+      myParam.normalization = normalization.getValue();
+      myParam.initial_momentum = initial_momentum.getValue();
+      myParam.final_momentum = final_momentum.getValue();
+      myParam.L2_reg = L2_reg.getValue();
+      myParam.init_normal= init_normal.getValue();
+      myParam.init_range = init_range.getValue();
+      myParam.normalization_init = normalization_init.getValue();
+
+      cerr << "Command line: " << endl;
+      cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl;
+
+      const string sep(" Value: ");
+      cerr << train_file.getDescription() << sep << train_file.getValue() << endl;
+      cerr << validation_file.getDescription() << sep << validation_file.getValue() << endl;
+      cerr << input_words_file.getDescription() << sep << input_words_file.getValue() << endl;
+      cerr << output_words_file.getDescription() << sep << output_words_file.getValue() << endl;
+      cerr << model_prefix.getDescription() << sep << model_prefix.getValue() << endl;
+
+      cerr << ngram_size.getDescription() << sep << ngram_size.getValue() << endl;
+      cerr << input_vocab_size.getDescription() << sep << input_vocab_size.getValue() << endl;
+      cerr << output_vocab_size.getDescription() << sep << output_vocab_size.getValue() << endl;
+
+      if (embedding_dimension.getValue() >= 0)
+      {
+	  cerr << embedding_dimension.getDescription() << sep << embedding_dimension.getValue() << endl;
+      }
+      else
+      {
+	  cerr << input_embedding_dimension.getDescription() << sep << input_embedding_dimension.getValue() << endl;
+	  cerr << output_embedding_dimension.getDescription() << sep << output_embedding_dimension.getValue() << endl;
+      }
+      cerr << share_embeddings.getDescription() << sep << share_embeddings.getValue() << endl;
+      if (share_embeddings.getValue() && input_embedding_dimension.getValue() != output_embedding_dimension.getValue())
+      {
+	  cerr << "error: sharing input and output embeddings requires that input and output embeddings have same dimension" << endl;
+	  exit(1);
+      }
+
+      cerr << num_hidden.getDescription() << sep << num_hidden.getValue() << endl;
+
+      if (string_to_activation_function(activation_function.getValue()) == InvalidFunction)
+      {
+	 cerr << "error: invalid activation function: " << activation_function.getValue() << endl;
+	  exit(1);
+      }
+      cerr << activation_function.getDescription() << sep << activation_function.getValue() << endl;
+
+      if (string_to_loss_function(loss_function.getValue()) == InvalidLoss)
+      {
+	 cerr << "error: invalid loss function: " << loss_function.getValue() << endl;
+	  exit(1);
+      }
+      cerr << loss_function.getDescription() << sep << loss_function.getValue() << endl;
+
+      cerr << init_normal.getDescription() << sep << init_normal.getValue() << endl;
+      cerr << init_range.getDescription() << sep << init_range.getValue() << endl;
+
+      cerr << num_epochs.getDescription() << sep << num_epochs.getValue() << endl;
+      cerr << minibatch_size.getDescription() << sep << minibatch_size.getValue() << endl;
+      if (myParam.validation_file != "")
+	  cerr << validation_minibatch_size.getDescription() << sep << validation_minibatch_size.getValue() << endl;
+      cerr << learning_rate.getDescription() << sep << learning_rate.getValue() << endl;
+      cerr << L2_reg.getDescription() << sep << L2_reg.getValue() << endl;
+
+      cerr << num_noise_samples.getDescription() << sep << num_noise_samples.getValue() << endl;
+
+      cerr << normalization.getDescription() << sep << normalization.getValue() << endl;
+      if (myParam.normalization)
+	  cerr << normalization_init.getDescription() << sep << normalization_init.getValue() << endl;
+
+      cerr << use_momentum.getDescription() << sep << use_momentum.getValue() << endl;
+      if (myParam.use_momentum)
+      {
+	  cerr << initial_momentum.getDescription() << sep << initial_momentum.getValue() << endl;
+	  cerr << final_momentum.getDescription() << sep << final_momentum.getValue() << endl;
+      }
+
+      cerr << num_threads.getDescription() << sep << num_threads.getValue() << endl;
+
+      if (unigram_probs_file.getValue() != "")
+      {
+	  cerr << "Note: --unigram_probs_file is deprecated and ignored." << endl;
+      }
+    }
+    catch (TCLAP::ArgException &e)
+    {
+      cerr << "error: " << e.error() <<  " for arg " << e.argId() << endl;
+      exit(1);
+    }
+
+    myParam.num_threads = setup_threads(myParam.num_threads);
+    int save_threads;
+
+    //unsigned seed = std::time(0);
+    unsigned seed = 1234; //for testing only
+    mt19937 rng(seed);
+
+    /////////////////////////READING IN THE TRAINING AND VALIDATION DATA///////////////////
+    /////////////////////////////////////////////////////////////////////////////////////
+
+    // Read training data
+    vector<int> training_data_flat;
+    readDataFile(myParam.train_file, myParam.ngram_size, training_data_flat, myParam.minibatch_size);
+    data_size_t training_data_size = training_data_flat.size() / myParam.ngram_size;
+    cerr << "Number of training instances: "<< training_data_size << endl;
+
+    Map< Matrix<int,Dynamic,Dynamic> > training_data(training_data_flat.data(), myParam.ngram_size, training_data_size);
+
+    // If neither --input_vocab_size nor --input_words_file is given, set input_vocab_size to the maximum word index
+    if (myParam.input_vocab_size == 0 and myParam.input_words_file == "")
+    {
+        myParam.input_vocab_size = training_data.topRows(myParam.ngram_size-1).maxCoeff()+1;
+    }
+
+    // If neither --output_vocab_size nor --output_words_file is given, set output_vocab_size to the maximum word index
+    if (myParam.output_vocab_size == 0 and myParam.words_file == "")
+    {
+        myParam.output_vocab_size = training_data.row(myParam.ngram_size-1).maxCoeff()+1;
+    }
+
+    // Randomly shuffle training data to improve learning
+    for (data_size_t i=training_data_size-1; i>0; i--)
+    {
+        data_size_t j = uniform_int_distribution<data_size_t>(0, i-1)(rng);
+	training_data.col(i).swap(training_data.col(j));
+    }
+
+    // Read validation data
+    vector<int> validation_data_flat;
+    int validation_data_size = 0;
+    
+    if (myParam.validation_file != "")
+    {
+	readDataFile(myParam.validation_file, myParam.ngram_size, validation_data_flat);
+	validation_data_size = validation_data_flat.size() / myParam.ngram_size;
+	cerr << "Number of validation instances: " << validation_data_size << endl;
+    }
+
+    Map< Matrix<int,Dynamic,Dynamic> > validation_data(validation_data_flat.data(), myParam.ngram_size, validation_data_size);
+
+    ///// Read in vocabulary file. We don't actually use it; it just gets reproduced in the output file
+
+    vector<string> input_words;
+    if (myParam.input_words_file != "")
+    {
+        readWordsFile(myParam.input_words_file, input_words);
+	if (myParam.input_vocab_size == 0)
+	    myParam.input_vocab_size = input_words.size();
+    }
+
+    vector<string> output_words;
+    if (myParam.output_words_file != "")
+    {
+        readWordsFile(myParam.output_words_file, output_words);
+	if (myParam.output_vocab_size == 0)
+	    myParam.output_vocab_size = output_words.size();
+    }
+
+    ///// Construct unigram model and sampler that will be used for NCE
+
+    vector<data_size_t> unigram_counts(myParam.output_vocab_size);
+    for (data_size_t train_id=0; train_id < training_data_size; train_id++)
+    {
+        int output_word = training_data(myParam.ngram_size-1, train_id);
+	unigram_counts[output_word] += 1;
+    }
+    multinomial<data_size_t> unigram (unigram_counts);
+
+    ///// Create and initialize the neural network and associated propagators.
+
+    model nn(myParam.ngram_size,
+        myParam.input_vocab_size,
+        myParam.output_vocab_size,
+        myParam.input_embedding_dimension,
+	      myParam.num_hidden,
+        myParam.output_embedding_dimension,
+        myParam.share_embeddings);
+
+    nn.initialize(rng, myParam.init_normal, myParam.init_range, -log(myParam.output_vocab_size));
+    nn.set_activation_function(string_to_activation_function(myParam.activation_function));
+    loss_function_type loss_function = string_to_loss_function(myParam.loss_function);
+
+    propagator prop(nn, myParam.minibatch_size);
+    propagator prop_validation(nn, myParam.validation_minibatch_size);
+    SoftmaxNCELoss<multinomial<data_size_t> > softmax_loss(unigram);
+    // normalization parameters
+    vector_map c_h, c_h_running_gradient;
+    
+    ///////////////////////TRAINING THE NEURAL NETWORK////////////////////////////////////
+    /////////////////////////////////////////////////////////////////////////////////////
+
+    data_size_t num_batches = (training_data_size-1)/myParam.minibatch_size + 1;
+    cerr<<"Number of training minibatches: "<<num_batches<<endl;
+
+    int num_validation_batches = 0;
+    if (validation_data_size > 0)
+    {
+        num_validation_batches = (validation_data_size-1)/myParam.validation_minibatch_size+1;
+	cerr<<"Number of validation minibatches: "<<num_validation_batches<<endl;
+    } 
+
+    double current_momentum = myParam.initial_momentum;
+    double momentum_delta = (myParam.final_momentum - myParam.initial_momentum)/(myParam.num_epochs-1);
+    double current_learning_rate = myParam.learning_rate;
+    double current_validation_ll = 0.0;
+
+    int ngram_size = myParam.ngram_size;
+    int input_vocab_size = myParam.input_vocab_size;
+    int output_vocab_size = myParam.output_vocab_size;
+    int minibatch_size = myParam.minibatch_size;
+    int validation_minibatch_size = myParam.validation_minibatch_size;
+    int num_noise_samples = myParam.num_noise_samples;
+
+    if (myParam.normalization)
+    {
+	for (data_size_t i=0;i<training_data_size;i++)
+	{
+	    Matrix<int,Dynamic,1> context = training_data.block(0,i,ngram_size-1,1);
+	    if (c_h.find(context) == c_h.end())
+	    {
+	        c_h[context] = -myParam.normalization_init;
+	    }
+	}
+    }
+
+    for (int epoch=0; epoch<myParam.num_epochs; epoch++)
+    { 
+        cerr << "Epoch " << epoch+1 << endl;
+        cerr << "Current learning rate: " << current_learning_rate << endl;
+
+        if (myParam.use_momentum) 
+	    cerr << "Current momentum: " << current_momentum << endl;
+	else
+            current_momentum = -1;
+
+	cerr << "Training minibatches: ";
+
+	double log_likelihood = 0.0;
+
+	int num_samples = 0;
+	if (loss_function == LogLoss)
+	    num_samples = output_vocab_size;
+	else if (loss_function == NCELoss)
+	    num_samples = 1+num_noise_samples;
+
+	Matrix<double,Dynamic,Dynamic> minibatch_weights(num_samples, minibatch_size);
+	Matrix<int,Dynamic,Dynamic> minibatch_samples(num_samples, minibatch_size);
+	Matrix<double,Dynamic,Dynamic> scores(num_samples, minibatch_size);
+	Matrix<double,Dynamic,Dynamic> probs(num_samples, minibatch_size);
+
+        for(data_size_t batch=0;batch<num_batches;batch++)
+        {
+            if (batch > 0 && batch % 10000 == 0)
+            {
+	        cerr << batch <<"...";
+            } 
+
+            data_size_t minibatch_start_index = minibatch_size * batch;
+            int current_minibatch_size = min(static_cast<data_size_t>(minibatch_size), training_data_size - minibatch_start_index);
+	    Matrix<int,Dynamic,Dynamic> minibatch = training_data.middleCols(minibatch_start_index, current_minibatch_size);
+
+            double adjusted_learning_rate = current_learning_rate/current_minibatch_size;
+            //cerr<<"Adjusted learning rate: "<<adjusted_learning_rate<<endl;
+
+            /*
+            if (batch == rand() % num_batches)
+            {
+                cerr<<"we are checking the gradient in batch "<<batch<<endl;
+                /////////////////////////CHECKING GRADIENTS////////////////////////////////////////
+                gradientChecking(myParam,minibatch_start_index,current_minibatch_size,word_nodes,context_nodes,hidden_layer_node,hidden_layer_to_output_node,
+                              shuffled_training_data,c_h,unif_real_vector,eng_real_vector,unif_int_vector,eng_int_vector,unigram_probs_vector,
+                              q_vector,J_vector,D_prime);
+            }
+            */
+
+            ///// Forward propagation
+
+            prop.fProp(minibatch.topRows(ngram_size-1));
+
+	    if (loss_function == NCELoss)
+	    {
+	        ///// Noise-contrastive estimation
+
+	        // Generate noise samples. Gather positive and negative samples into matrix.
+
+	        start_timer(3);
+
+		minibatch_samples.block(0, 0, 1, current_minibatch_size) = minibatch.bottomRows(1);
+		
+		for (int sample_id = 1; sample_id < num_noise_samples+1; sample_id++)
+		    for (int train_id = 0; train_id < current_minibatch_size; train_id++)
+		        minibatch_samples(sample_id, train_id) = unigram.sample(rng);
+	    
+		stop_timer(3);
+
+		// Final forward propagation step (sparse)
+		start_timer(4);
+		prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix,
+						    minibatch_samples, scores);
+		stop_timer(4);
+
+		// Apply normalization parameters
+		if (myParam.normalization)
+		{
+		    for (int train_id = 0;train_id < current_minibatch_size;train_id++)
+		    {
+			Matrix<int,Dynamic,1> context = minibatch.block(0, train_id, ngram_size-1, 1);
+			scores.col(train_id).array() += c_h[context];
+		    }
+		}
+
+		double minibatch_log_likelihood;
+		start_timer(5);
+		softmax_loss.fProp(scores.leftCols(current_minibatch_size), 
+				   minibatch_samples,
+				   probs, minibatch_log_likelihood);
+		stop_timer(5);
+		log_likelihood += minibatch_log_likelihood;
+
+		///// Backward propagation
+
+		start_timer(6);
+		softmax_loss.bProp(probs, minibatch_weights);
+		stop_timer(6);
+		
+		// Update the normalization parameters
+		
+		if (myParam.normalization)
+		{
+		    for (int train_id = 0;train_id < current_minibatch_size;train_id++)
+		    {
+			Matrix<int,Dynamic,1> context = minibatch.block(0, train_id, ngram_size-1, 1);
+			c_h[context] += adjusted_learning_rate * minibatch_weights.col(train_id).sum();
+		    }
+		}
+
+		// Be careful of short minibatch
+		prop.bProp(minibatch.topRows(ngram_size-1),
+			   minibatch_samples.leftCols(current_minibatch_size), 
+			   minibatch_weights.leftCols(current_minibatch_size),
+			   adjusted_learning_rate, current_momentum, myParam.L2_reg);
+	    }
+	    else if (loss_function == LogLoss)
+	    {
+	        ///// Standard log-likelihood
+	        start_timer(4);
+		prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
+		stop_timer(4);
+
+		double minibatch_log_likelihood;
+		start_timer(5);
+		SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size), 
+				       minibatch.row(ngram_size-1), 
+				       probs, 
+				       minibatch_log_likelihood);
+		stop_timer(5);
+		log_likelihood += minibatch_log_likelihood;
+
+		///// Backward propagation
+		
+		start_timer(6);
+		SoftmaxLogLoss().bProp(minibatch.row(ngram_size-1).leftCols(current_minibatch_size), 
+				       probs.leftCols(current_minibatch_size), 
+				       minibatch_weights);
+		stop_timer(6);
+		
+		prop.bProp(minibatch.topRows(ngram_size-1).leftCols(current_minibatch_size),
+			   minibatch_weights,
+			   adjusted_learning_rate, current_momentum, myParam.L2_reg);
+	    }
+        }
+	cerr << "done." << endl;
+
+	if (loss_function == LogLoss)
+	{
+	    cerr << "Training log-likelihood: " << log_likelihood << endl;
+            cerr << "         perplexity:     "<< exp(-log_likelihood/training_data_size) << endl;
+	}
+	else if (loss_function == NCELoss)
+	    cerr << "Training NCE log-likelihood: " << log_likelihood << endl;
+
+        current_momentum += momentum_delta;
+
+	#ifdef USE_CHRONO
+	cerr << "Propagation times:";
+	for (int i=0; i<timer.size(); i++)
+	  cerr << " " << timer.get(i);
+	cerr << endl;
+	#endif
+
+	if (myParam.model_prefix != "")
+	{
+	    cerr << "Writing model" << endl;
+	    if (myParam.input_words_file != "")
+	        nn.write(myParam.model_prefix + "." + lexical_cast<string>(epoch+1), input_words, output_words);
+	    else
+	        nn.write(myParam.model_prefix + "." + lexical_cast<string>(epoch+1));
+	}
+
+        if (epoch % 1 == 0 && validation_data_size > 0)
+        {
+            //////COMPUTING VALIDATION SET PERPLEXITY///////////////////////
+            ////////////////////////////////////////////////////////////////
+
+            double log_likelihood = 0.0;
+
+	    Matrix<double,Dynamic,Dynamic> scores(output_vocab_size, validation_minibatch_size);
+	    Matrix<double,Dynamic,Dynamic> output_probs(output_vocab_size, validation_minibatch_size);
+	    Matrix<int,Dynamic,Dynamic> minibatch(ngram_size, validation_minibatch_size);
+
+            for (int validation_batch =0;validation_batch < num_validation_batches;validation_batch++)
+            {
+                int validation_minibatch_start_index = validation_minibatch_size * validation_batch;
+		int current_minibatch_size = min(validation_minibatch_size,
+						 validation_data_size - validation_minibatch_start_index);
+		minibatch.leftCols(current_minibatch_size) = validation_data.middleCols(validation_minibatch_start_index, 
+											current_minibatch_size);
+		prop_validation.fProp(minibatch.topRows(ngram_size-1));
+
+		// Do full forward prop through output word embedding layer
+		start_timer(4);
+		prop_validation.output_layer_node.param->fProp(prop_validation.second_hidden_activation_node.fProp_matrix, scores);
+		stop_timer(4);
+
+		// And softmax and loss. Be careful of short minibatch
+		double minibatch_log_likelihood;
+		start_timer(5);
+		SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size), 
+				       minibatch.row(ngram_size-1),
+				       output_probs,
+				       minibatch_log_likelihood);
+		stop_timer(5);
+		log_likelihood += minibatch_log_likelihood;
+	    }
+
+            cerr << "Validation log-likelihood: "<< log_likelihood << endl;
+            cerr << "           perplexity:     "<< exp(-log_likelihood/validation_data_size) << endl;
+
+	    // If the validation log-likelihood decreases (i.e. perplexity rises), halve the learning rate.
+            if (epoch > 0 && log_likelihood < current_validation_ll)
+            { 
+                current_learning_rate /= 2;
+            }
+            current_validation_ll = log_likelihood;
+	}
+
+    }
+    return 0;
+}
diff --git a/src/util.cpp b/src/util.cpp
new file mode 100644
index 0000000..fe022c9
--- /dev/null
+++ b/src/util.cpp
@@ -0,0 +1,213 @@
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+#include <cmath>
+
+#include <boost/unordered_map.hpp> 
+#include <boost/algorithm/string.hpp>
+
+#include "maybe_omp.h"
+#ifdef EIGEN_USE_MKL_ALL
+#include <mkl.h>
+#endif
+
+#include "util.h"
+
+extern double drand48();
+
+using namespace Eigen;
+using namespace std;
+using namespace boost::random;
+
+namespace nplm
+{
+
+// Tokenize `line` on spaces and tabs into `items`, replacing its old contents.
+// Leading/trailing blanks are ignored and a run of blanks is one separator;
+// a line without tokens leaves `items` empty.
+void splitBySpace(const std::string &line, std::vector<std::string> &items)
+{
+    std::string trimmed = boost::trim_copy_if(line, boost::is_any_of(" \t"));
+    if (!trimmed.empty())
+        boost::split(items, trimmed, boost::is_any_of(" \t"), boost::token_compress_on);
+    else
+        items.clear();
+}
+
+// Append one word per line from TRAININ to word_list, stopping at end of
+// stream or at the first empty line. A line that does not hold exactly one
+// whitespace-separated token is reported on stderr and the process exits.
+void readWordsFile(ifstream &TRAININ, vector<string> &word_list)
+{
+  for (string row; getline(TRAININ, row) && !row.empty(); )
+  {
+    vector<string> tokens;
+    splitBySpace(row, tokens);
+    if (tokens.size() == 1) { word_list.push_back(tokens.front()); continue; }
+
+    cerr << "Error: vocabulary file must have only one word per line" << endl;
+    exit(-1);
+  }
+}
+
+// Open `file` and append its word list to word_list via the stream overload.
+// The file name is logged to stderr; if the file cannot be opened an error is
+// printed and the process exits.
+void readWordsFile(const string &file, vector<string> &word_list)
+{
+  cerr << "Reading word list from: " << file << endl;
+
+  ifstream in(file.c_str());
+  if (in.fail())
+  {
+    cerr << "Error: can't read word list from file " << file << endl;
+    exit(-1);
+  }
+  readWordsFile(in, word_list);
+}
+
+// Write each entry of `words` to `file` on its own line. Uses an unsigned index
+// (the size is unsigned) and '\n' so the stream is not flushed after every word.
+void writeWordsFile(const std::vector<std::string> &words, std::ofstream &file)
+{
+    for (std::vector<std::string>::size_type i = 0; i < words.size(); ++i)
+        file << words[i] << '\n';
+}
+
+// Open `filename` for writing and write `words` to it through the stream
+// overload. If the file cannot be opened, report it on stderr and exit.
+void writeWordsFile(const vector<string> &words, const string &filename)
+{
+    ofstream sink(filename.c_str());
+    if (sink.fail())
+    {
+      cerr << "Error: can't write to file " << filename << endl;
+      exit(-1);
+    }
+    writeWordsFile(words, sink);
+}
+
+// Read `file` line by line, appending each line's whitespace-separated tokens
+// to `sentences` as one entry. Blank lines are kept as empty entries, so the
+// number of appended entries equals the number of lines read. Exits with an
+// error on stderr if the file cannot be opened.
+void readSentFile(const string &file, vector<vector<string> > &sentences)
+{
+  cerr << "Reading sentences from: " << file << endl;
+
+  ifstream in(file.c_str());
+  if (in.fail())
+  {
+    cerr << "Error: can't read from file " << file << endl;
+    exit(-1);
+  }
+
+  for (string row; getline(in, row); )
+  {
+    vector<string> tokens;
+    splitBySpace(row, tokens);
+    sentences.push_back(tokens);
+  }
+}
+
+// Read a data file of unknown size into a flat vector<int>: each line is one
+// instance of ngram_size integers (ngram_size is taken from the first line if
+// it is 0 on entry). minibatch_size only controls progress output. Unreadable
+// files, wrong field counts and non-integer fields are reported and exit(-1).
+// If this takes too much memory, we should create a vector of minibatches.
+void readDataFile(const string &filename, int &ngram_size, vector<int> &data, int minibatch_size)
+{
+  cerr << "Reading minibatches from file " << filename << ": ";
+  ifstream DATAIN(filename.c_str());
+  if (!DATAIN)
+  {
+    cerr << "Error: can't read data from file " << filename<< endl;
+    exit(-1);
+  }
+  // Progress interval in lines, widened to 64 bits: the old int product
+  // minibatch_size * 10000 overflowed for minibatch sizes above ~214,000.
+  const long long report_every = static_cast<long long>(minibatch_size) * 10000;
+  string line;
+  vector<string> ngram;
+  for (long long n_lines = 1; getline(DATAIN, line); n_lines++)
+  {
+    splitBySpace(line, ngram);
+    if (ngram_size == 0) ngram_size = ngram.size();
+    if (ngram.size() != static_cast<size_t>(ngram_size))
+    {
+        cerr << "Error: expected " << ngram_size << " fields in instance, found " << ngram.size() << endl;
+        exit(-1);
+    }
+    try
+    {
+        for (int i=0;i<ngram_size;i++) data.push_back(boost::lexical_cast<int>(ngram[i]));
+    }
+    catch (const boost::bad_lexical_cast &)
+    {
+        cerr << "Error: non-integer field on line " << n_lines << " of " << filename << endl;
+        exit(-1);
+    }
+    if (report_every && n_lines % report_every == 0) cerr << n_lines/minibatch_size << "...";
+  }
+  cerr << "done." << endl;
+}
+
+// Return log(exp(x) + exp(y)) without overflow. Equal arguments are handled
+// separately: for x == y == -inf (log 0) the old y-x evaluated to NaN.
+double logadd(double x, double y)
+{
+    const double hi = x > y ? x : y, lo = x > y ? y : x;
+    return hi + (hi == lo ? log1p(1.0) : log1p(std::exp(lo - hi)));
+}
+
+#ifdef USE_CHRONO
+// Remember the current time as the start of slot i (i is not range-checked).
+void Timer::start(int i) { m_start[i] = clock_type::now(); }
+
+// Add the time elapsed since the last start(i) to slot i's total.
+void Timer::stop(int i)
+{
+    const time_type now = clock_type::now();
+    m_total[i] += now - m_start[i];
+}
+
+// Reset slot i's accumulated total to zero.
+void Timer::reset(int i) { m_total[i] = duration_type(); }
+
+// Slot i's accumulated total, in seconds.
+double Timer::get(int i) const { return boost::chrono::duration<double>(m_total[i]).count(); }
+
+// The global timer with 20 slots, used through start_timer/stop_timer.
+Timer timer(20);
+#endif
+
+// Configure the number of threads used by OpenMP, Eigen and, when built
+// against it, MKL. A nonzero n_threads is requested from OpenMP; 0 keeps
+// OpenMP's own default. Returns the thread count OpenMP then reports, which
+// is also what Eigen and MKL are given. In a build without OpenMP nothing is
+// configured and n_threads is returned unchanged.
+int setup_threads(int n_threads)
+{
+#ifndef _OPENMP
+    return n_threads;
+#else
+    if (n_threads != 0)
+        omp_set_num_threads(n_threads);
+
+    const int granted = omp_get_max_threads();
+    if (granted > 1)
+        cerr << "Using " << granted << " threads" << endl;
+
+    Eigen::initParallel();
+    Eigen::setNbThreads(granted);
+
+#ifdef __INTEL_MKL__
+    // Matching MKL's threading layer to the compiler (mkl_set_threading_layer
+    // with MKL_THREADING_INTEL or MKL_THREADING_GNU) is currently disabled.
+    mkl_set_num_threads(granted);
+#endif
+    return granted;
+#endif
+}
+
+} // namespace nplm
diff --git a/src/util.h b/src/util.h
new file mode 100644
index 0000000..c774a72
--- /dev/null
+++ b/src/util.h
@@ -0,0 +1,219 @@
+#pragma once
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <vector>
+#include <string>
+
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include <boost/random/normal_distribution.hpp>
+#include <boost/lexical_cast.hpp>
+#include <boost/functional/hash.hpp>
+#ifdef USE_CHRONO
+#include <boost/chrono.hpp>
+#endif
+
+#include <Eigen/Dense>
+
+#include "maybe_omp.h"
+
+// Make matrices hashable
+
+namespace Eigen {
+    // hash_value overload for boost::hash: folds every coefficient into the seed, row by row.
+    template <typename Derived>
+    size_t hash_value(const DenseBase<Derived> &m)
+    {
+        size_t seed = 0;
+        for (int r = 0; r != m.rows(); ++r)
+            for (int c = 0; c != m.cols(); ++c) boost::hash_combine(seed, m(r,c));
+        return seed;
+    }
+}
+
+namespace nplm
+{
+
+void splitBySpace(const std::string &line, std::vector<std::string> &items);
+void readWordsFile(std::ifstream &TRAININ, std::vector<std::string> &word_list);
+void readWordsFile(const std::string &file, std::vector<std::string> &word_list);
+void writeWordsFile(const std::vector<std::string> &words, std::ofstream &file);
+void writeWordsFile(const std::vector<std::string> &words, const std::string &filename);
+void readDataFile(const std::string &filename, int &ngram_size, std::vector<int> &data, int minibatch_size=0);
+void readUnigramProbs(const std::string &unigram_probs_file, std::vector<double> &unigram_probs);
+void readSentFile(const std::string &file, std::vector<std::vector<std::string> > &sentences);
+
+// Functions that write to a matrix argument are supposed to declare it as a
+// const reference and then use this macro to declare `uc`, a non-const
+// MatrixBase<t> reference to the same object `c` (constness is cast away).
+#define UNCONST(t,c,uc) Eigen::MatrixBase<t> &uc = const_cast<Eigen::MatrixBase<t>&>(c);
+
+// Fill every coefficient of a matrix with random draws from `engine`,
+// visiting the coefficients row by row.
+//   engine      random number generator the draws are taken from
+//   p_const     matrix to fill; written to although declared const (see UNCONST)
+//   init_normal false: uniform draws between -range and range
+//               true:  Gaussian draws with mean 0 and standard deviation range
+//   range       half-width of the uniform interval, or the Gaussian's stdev
+// Nothing is returned; the matrix is modified in place.
+template <typename Derived>
+void initMatrix(boost::random::mt19937 &engine,
+                const Eigen::MatrixBase<Derived> &p_const,
+                bool init_normal, double range)
+{
+    UNCONST(Derived, p_const, p);
+
+    if (init_normal)
+    {
+        // Gaussian initialization
+        boost::random::normal_distribution<double> gaussian(0., range);
+        for (int r = 0; r < p.rows(); ++r)
+            for (int c = 0; c < p.cols(); ++c)
+                p(r,c) = gaussian(engine);
+    }
+    else
+    {
+        // Uniform initialization
+        boost::random::uniform_real_distribution<> uniform(-range, range);
+        for (int r = 0; r < p.rows(); ++r)
+            for (int c = 0; c < p.cols(); ++c)
+                p(r,c) = uniform(engine);
+    }
+}
+
+// Read a matrix from TRAININ into param_const, one whitespace-separated row
+// per line, stopping at end of stream or at the first empty line. The matrix
+// must already have its final size: every line needs cols() fields and
+// exactly rows() lines must be read, otherwise std::runtime_error is thrown.
+// Fields are parsed with boost::lexical_cast into the matrix's scalar type.
+template <typename Derived>
+void readMatrix(std::ifstream &TRAININ, Eigen::MatrixBase<Derived> &param_const)
+{
+    UNCONST(Derived, param_const, param);
+    int i = 0;
+    std::string line;
+    std::vector<std::string> fields;
+    while (std::getline(TRAININ, line) && line != "")
+    {
+        splitBySpace(line, fields);
+        if (fields.size() != static_cast<size_t>(param.cols()))
+        {
+            std::ostringstream err;
+            err << "error: wrong number of columns (expected " << param.cols() << ", found " << fields.size() << ")";
+            throw std::runtime_error(err.str());
+        }
+        // A line past the last row means too many rows; the total is unknown here.
+        if (i >= param.rows())
+        {
+            std::ostringstream err;
+            err << "error: wrong number of rows (expected " << param.rows() << ", found more)";
+            throw std::runtime_error(err.str());
+        }
+        for (int j=0; j<param.cols(); j++)
+            param(i,j) = boost::lexical_cast<typename Derived::Scalar>(fields[j]);
+        i++;
+    }
+    // Input ended early: i is the number of rows actually read (too few).
+    if (i != param.rows())
+    {
+        std::ostringstream err;
+        err << "error: wrong number of rows (expected " << param.rows() << ", found " << i << ")";
+        throw std::runtime_error(err.str());
+    }
+}
+
+// Open param_file and fill the (pre-sized) matrix from it using the stream
+// overload. Logs the file name to stderr; exits if the file cannot be opened.
+template <typename Derived>
+void readMatrix(const std::string &param_file, const Eigen::MatrixBase<Derived> &param_const)
+{
+    std::cerr << "Reading data from file: " << param_file << std::endl;
+    std::ifstream in(param_file.c_str());
+    if (in.fail())
+    {
+        std::cerr << "Error: can't read training data from file " << param_file << std::endl;
+        exit(-1);
+    }
+    UNCONST(Derived, param_const, param);
+    readMatrix(in, param);
+}
+
+// Forward declaration: without it the call below cannot see the stream overload.
+template <typename Derived> void writeMatrix(const Eigen::MatrixBase<Derived> &param, std::ofstream &OUT);
+// Write param to filename via the stream overload, with 16 significant digits; exits if it cannot be opened.
+template <typename Derived>
+void writeMatrix(const Eigen::MatrixBase<Derived> &param, const std::string &filename)
+{
+    std::cerr << "Writing parameters to " << filename << std::endl;
+    std::ofstream OUT(filename.c_str());
+    if (! OUT)
+    {
+      std::cerr << "Error: can't write to file " << filename<< std::endl;
+      exit(-1);
+    }
+    OUT.precision(16);
+    writeMatrix(param, OUT);
+}
+
+// Write param to OUT as text: one line per matrix row, coefficients separated
+// by tabs with no trailing tab, using OUT's current formatting. A matrix with
+// zero columns yields empty lines instead of reading the nonexistent
+// coefficient (row, 0); lines end in '\n', so OUT is not flushed per row.
+template <typename Derived>
+void writeMatrix(const Eigen::MatrixBase<Derived> &param, std::ofstream &OUT)
+{
+    for (int row = 0; row < param.rows(); row++)
+    {
+        for (int col = 0; col < param.cols(); col++)
+            OUT << (col ? "\t" : "") << param(row,col);
+        OUT << '\n';
+    }
+}
+
+// Return log(sum_i exp(v(i))) for a vector v, shifted by the largest entry to
+// avoid overflow. Iterates over size() so row vectors work too; entries equal
+// to the maximum count as exp(0), so an all -inf vector gives -inf, not NaN.
+template <typename Derived>
+double logsum(const Eigen::MatrixBase<Derived> &v)
+{
+    int mi;
+    double m = v.maxCoeff(&mi), logz = 0.0;
+    for (int i=0; i<v.size(); i++)
+        if (i != mi) logz += (v(i) == m) ? 1.0 : std::exp(v(i) - m);
+    return log1p(logz) + m;
+}
+
+double logadd(double x, double y);
+
+#ifdef USE_CHRONO
+// Indexed stopwatches: slot i accumulates the time between start(i) and stop(i).
+class Timer 
+{
+    typedef boost::chrono::high_resolution_clock clock_type;
+    typedef clock_type::time_point time_type;
+    typedef clock_type::duration duration_type;
+    std::vector<time_type> m_start;     // time of the last start(i)
+    std::vector<duration_type> m_total; // accumulated time of slot i
+public:
+    Timer() { }
+    Timer(int n) { resize(n); }
+    void resize(int n) { m_start.resize(n); m_total.resize(n); }
+    int size() const { return m_start.size(); }
+    void start(int i);
+    void stop(int i);
+    void reset(int i);
+    double get(int i) const;
+};
+extern Timer timer;
+#define start_timer(x) timer.start(x)
+#define stop_timer(x) timer.stop(x)
+#else
+#define start_timer(x) ((void)0) // timing disabled: void no-op, like the calls above
+#define stop_timer(x) ((void)0)
+#endif
+
+int setup_threads(int n_threads);
+
+} // namespace nplm
diff --git a/src/vocabulary.h b/src/vocabulary.h
new file mode 100644
index 0000000..fee76f6
--- /dev/null
+++ b/src/vocabulary.h
@@ -0,0 +1,84 @@
+#ifndef VOCABULARY_H
+#define VOCABULARY_H
+
+#include <vector>
+#include <string>
+#include <queue>
+#include <boost/unordered_map.hpp>
+
+namespace nplm
+{
+
+// Strict weak ordering on pair-like values that compares only their .second.
+template <typename T> struct compare_second
+{
+  bool operator()(const T &a, const T &b) const { return a.second < b.second; }
+};
+
+// Two-way mapping between words and dense integer ids; a word's id is its
+// position in words(). Words that are not present map to the id of "<unk>".
+class vocabulary {
+    typedef boost::unordered_map<std::string, int> index_type;
+    std::vector<std::string> m_words;
+    index_type m_index;
+    int unk;
+public:
+    // Start with a vocabulary that holds only "<unk>", as id 0.
+    vocabulary() { unk = insert_word("<unk>"); }
+
+    // Adopt an existing word list; ids are positions in `words` (for a
+    // repeated word the last position wins). If the list has no "<unk>",
+    // unknown words still fall back to id 0, but the index no longer gets an
+    // "<unk>" entry that has no counterpart in the word list.
+    vocabulary(const std::vector<std::string> &words)
+      : m_words(words)
+    {
+        for (std::vector<std::string>::size_type i=0; i<words.size(); i++)
+            m_index[words[i]] = static_cast<int>(i);
+        index_type::const_iterator pos = m_index.find("<unk>");
+        unk = (pos != m_index.end()) ? pos->second : 0;
+    }
+
+    // Id of `word`, or the "<unk>" id if the word is not in the vocabulary.
+    int lookup_word(const std::string &word) const
+    {
+        index_type::const_iterator pos = m_index.find(word);
+        return (pos != m_index.end()) ? pos->second : unk;
+    }
+
+    // Add `word` if it is new and return its id. A word that is already
+    // present keeps its id, and that id is returned (the old code returned
+    // size() in this case, which is not the id of the word).
+    int insert_word(const std::string &word)
+    {
+        std::pair<index_type::iterator, bool> result = m_index.insert(std::make_pair(word, size()));
+        if (result.second)
+            m_words.push_back(word);
+        return result.first->second;
+    }
+
+    int size() const { return m_words.size(); }
+
+    // Inserts the most-frequent words from counts until vocab_size words are reached.
+    // counts is a collection of pair<string,int>. Returns the number of words
+    // actually added; words that were already present are not counted.
+    template <typename Map>
+    int insert_most_frequent(const Map &counts, int vocab_size)
+    {
+        typedef std::pair<std::string,int> stringint;
+
+        std::priority_queue<stringint,std::vector<stringint>,compare_second<stringint> >
+          q(compare_second<stringint>(), std::vector<stringint>(counts.begin(), counts.end()));
+
+        const int size_before = size();
+        for (; size() < vocab_size && !q.empty(); q.pop())
+            insert_word(q.top().first);
+        return size() - size_before;
+    }
+
+    const std::vector<std::string> &words() const { return m_words; }
+};
+
+} // namespace nplm
+
+#endif
author	Kenneth Heafield <github@kheafield.com>	2013-10-29 22:00:37 +0400
committer	Kenneth Heafield <github@kheafield.com>	2013-10-29 22:00:37 +0400
commit	78eecfdd7ef4cc0aef575c828c6fef747c63da19 (patch)
tree	cbd1e84c871306a35e1352286f7749ccac4f60bc /src
parent	e4138ba17732e70bfe9ad8e806173c083a9ddd0e (diff)