diff options
author | Kenneth Heafield <github@kheafield.com> | 2013-10-29 22:00:37 +0400 |
---|---|---|
committer | Kenneth Heafield <github@kheafield.com> | 2013-10-29 22:00:37 +0400 |
commit | 78eecfdd7ef4cc0aef575c828c6fef747c63da19 (patch) | |
tree | cbd1e84c871306a35e1352286f7749ccac4f60bc /src | |
parent | e4138ba17732e70bfe9ad8e806173c083a9ddd0e (diff) |
Copy nplm-0.1 after removing some executable bits
Diffstat (limited to 'src')
-rw-r--r-- | src/Activation_function.h | 119 | ||||
-rw-r--r-- | src/Makefile | 175 | ||||
-rw-r--r-- | src/SoftmaxLoss.h | 136 | ||||
-rw-r--r-- | src/USCMatrix.h | 192 | ||||
-rw-r--r-- | src/graphClasses.h | 60 | ||||
-rw-r--r-- | src/maybe_omp.h | 13 | ||||
-rw-r--r-- | src/model.cpp | 246 | ||||
-rw-r--r-- | src/model.h | 105 | ||||
-rw-r--r-- | src/multinomial.h | 135 | ||||
-rw-r--r-- | src/neuralClasses.h | 520 | ||||
-rw-r--r-- | src/neuralLM.cpp | 1 | ||||
-rw-r--r-- | src/neuralLM.h | 350 | ||||
-rw-r--r-- | src/param.h | 58 | ||||
-rw-r--r-- | src/prepareNeuralLM.cpp | 246 | ||||
-rw-r--r-- | src/prepareNeuralTM.cpp | 396 | ||||
-rw-r--r-- | src/propagator.h | 194 | ||||
-rw-r--r-- | src/python/nplm.pxd | 23 | ||||
-rw-r--r-- | src/python/nplm.pyx | 38 | ||||
-rwxr-xr-x | src/shared/.gitignore | 0 | ||||
-rw-r--r-- | src/testNeuralLM.cpp | 164 | ||||
-rw-r--r-- | src/testNeuralNetwork.cpp | 119 | ||||
-rw-r--r-- | src/trainNeuralNetwork.cpp | 584 | ||||
-rw-r--r-- | src/util.cpp | 213 | ||||
-rw-r--r-- | src/util.h | 219 | ||||
-rw-r--r-- | src/vocabulary.h | 84 |
25 files changed, 4390 insertions, 0 deletions
diff --git a/src/Activation_function.h b/src/Activation_function.h new file mode 100644 index 0000000..eacba14 --- /dev/null +++ b/src/Activation_function.h @@ -0,0 +1,119 @@ +#ifndef ACTIVATION_FUNCTION_H +#define ACTIVATION_FUNCTION_H + +#include <cmath> +#include <string> +#include <Eigen/Dense> + +#include "util.h" + +namespace nplm +{ + +// is this cheating? +using Eigen::Matrix; +using Eigen::MatrixBase; + +enum activation_function_type { Tanh, HardTanh, Rectifier, Identity, InvalidFunction }; + +inline activation_function_type string_to_activation_function (const std::string &s) +{ + if (s == "identity") + return Identity; + else if (s == "rectifier") + return Rectifier; + else if (s == "tanh") + return Tanh; + else if (s == "hardtanh") + return HardTanh; + else + return InvalidFunction; +} + +inline std::string activation_function_to_string (activation_function_type f) +{ + if (f == Identity) + return "identity"; + else if (f == Rectifier) + return "rectifier"; + else if (f == Tanh) + return "tanh"; + else if (f == HardTanh) + return "hardtanh"; +} + +struct hardtanh_functor { + double operator() (double x) const { if (x < -1.) return -1.; else if (x > 1.) return 1.; else return x; } +}; + +struct dhardtanh_functor { + double operator() (double x) const { return x > -1. && x < 1. ? 1. : 0.; } +}; + +struct tanh_functor { + double operator() (double x) const { return std::tanh(x); } +}; + +struct dtanh_functor { + double operator() (double x) const { return 1-x*x; } +}; + +struct rectifier_functor { + double operator() (double x) const { return std::max(x, 0.); } +}; + +struct drectifier_functor { + double operator() (double x) const { return x > 0. ? 1. : 0.; } +}; + +class Activation_function +{ + private: + int size; + activation_function_type f; + + public: + Activation_function() : size(0), f(Rectifier) { } + + void resize(int size) { this->size = size; } + void set_activation_function(activation_function_type f) { this->f = f; } + + template <typename Engine> + void initialize(Engine &engine, bool init_normal, double init_range) { } + + int n_inputs () const { return size; } + int n_outputs () const { return size; } + + template <typename DerivedIn, typename DerivedOut> + void fProp(const MatrixBase<DerivedIn> &input, const MatrixBase<DerivedOut> &output) const + { + UNCONST(DerivedOut, output, my_output); + + switch (f) + { + case Identity: my_output = input; break; + case Rectifier: my_output = input.unaryExpr(rectifier_functor()); break; + case Tanh: my_output = input.unaryExpr(tanh_functor()); break; + case HardTanh: my_output = input.unaryExpr(hardtanh_functor()); break; + } + } + + template <typename DerivedGOut, typename DerivedGIn, typename DerivedIn, typename DerivedOut> + void bProp(const MatrixBase<DerivedGOut> &input, MatrixBase<DerivedGIn> &output, + const MatrixBase<DerivedIn> &finput, const MatrixBase<DerivedOut> &foutput) const + { + UNCONST(DerivedGIn, output, my_output); + + switch (f) + { + case Identity: my_output = input; break; + case Rectifier: my_output = finput.array().unaryExpr(drectifier_functor()) * input.array(); break; + case Tanh: my_output = foutput.array().unaryExpr(tanh_functor()) * input.array(); break; + case HardTanh: my_output = finput.array().unaryExpr(hardtanh_functor()) * input.array(); break; + } + } +}; + +} // namespace nplm + +#endif diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 0000000..3b34fe9 --- /dev/null +++ b/src/Makefile @@ -0,0 +1,175 @@ +### Compilation options. + +# C++ compiler. Tested with g++ and Intel icpc. +CXX=g++ +#CXX=icpc + +# Compiler options. Note that -DEIGEN_NO_DEBUG is essential for good performance! +#CFLAGS=-g +CFLAGS=-O3 -DEIGEN_NO_DEBUG -DNDEBUG + +# Architecture. Set to x86_64 or i686 to override. +ARCH:=$(shell uname -m) +# Operating system. Set to override (the only option that makes any difference is Darwin). +OS:=$(shell uname -s) + +# To build static binaries, uncomment the line below: +#STATIC=1 + +### Required libraries. You must install these prior to building. + +# Set this to the root directory of Boost (should have a subdirectory named boost): +BOOST=/usr/usc/boost/1.51.0 +#BOOST=/usr +#BOOST=/opt/local +# Where to find Boost header files +BOOST_INC=$(BOOST)/include + +# Set this to the root directory of Eigen (should have a subdirectory named Eigen): +EIGEN=../3rdparty/eigen + +### Optional libraries. + +# To disable multithreading, comment out the line below: +OMP=1 + +# To use the MKL library, uncomment the line below and set it to the MKL root: +MKL=/usr/usc/intel/12.1.1/mkl + +# For Python bindings, set the following and run 'make python/nplm.so'. +PYTHON_VERSION=2.7 +#PYTHON_ROOT=/opt/local/Library/Frameworks/Python.framework/Versions/$(PYTHON_VERSION) +PYTHON_ROOT=/home/nlg-01/chiangd/pkg64/python +CYTHON=$(PYTHON_ROOT)/bin/cython + +##### End of configurable options ##### + +# used for profiling +#USE_CHRONO=1 + +TCLAP=../3rdparty/tclap/include + +# Currently, this is needed only if USE_CHRONO is defined: +# Where to find Boost libraries +BOOST_LIB=$(BOOST)/lib +# On some systems, a suffix is appended for the multithreaded version. +BOOST_LIB_SUFFIX= +#BOOST_LIB_SUFFIX=-mt + +BOOST_CFLAGS=-I$(BOOST_INC) +BOOST_LDFLAGS= +ifdef USE_CHRONO + BOOST_CFLAGS+=-DUSE_CHRONO + BOOST_LDLIBS+=-lboost_system$(BOOST_LIB_SUFFIX) -lboost_chrono$(BOOST_LIB_SUFFIX) +endif +ifdef BOOST_LDLIBS + BOOST_LDFLAGS+=-L$(BOOST_LIB) -Wl,-rpath -Wl,$(BOOST_LIB) +endif + +ifdef OMP + ifneq (,$(findstring g++,$(CXX))) + OMP_CFLAGS=-fopenmp + OMP_LDFLAGS=-fopenmp + endif + ifneq (,$(findstring icpc,$(CXX))) + OMP_CFLAGS=-openmp + OMP_LDFLAGS=-openmp + endif +endif + +ifdef MKL + MKL_CFLAGS=-I$(MKL)/include -DEIGEN_USE_MKL_ALL + MKL_LDLIBS=-Wl,--start-group + ifeq ($(ARCH),x86_64) + MKL_LDFLAGS=-L$(MKL)/lib/intel64 -Wl,-rpath -Wl,$(MKL)/lib/intel64 + MKL_LDLIBS+=-lmkl_intel_lp64 + endif + ifeq ($(ARCH),i686) + MKL_LDFLAGS=-L$(MKL)/lib/ia32 -Wl,-rpath -Wl,$(MKL)/lib/ia32 + MKL_LDLIBS+=-lmkl_intel + endif + + ifneq (,$(findstring g++,$(CXX))) + MKL_LDLIBS+=-lmkl_gnu_thread + endif + ifneq (,$(findstring icpc,$(CXX))) + MKL_LDLIBS+=-lmkl_intel_thread + endif + + #MKL_LDLIBS=-lmkl_rt + MKL_LDLIBS+=-lmkl_core -Wl,--end-group +endif + +ifdef STATIC + LDFLAGS+=-static +endif + +ALL_CFLAGS=$(OMP_CFLAGS) $(MKL_CFLAGS) $(BOOST_CFLAGS) -I$(TCLAP) -I$(EIGEN) $(CFLAGS) +ALL_LDFLAGS=$(OMP_LDFLAGS) $(MKL_LDFLAGS) $(BOOST_LDFLAGS) $(LDFLAGS) +ALL_LDLIBS=$(MKL_LDLIBS) $(BOOST_LDLIBS) + +PYTHON_CFLAGS+=-I$(PYTHON_ROOT)/include/python$(PYTHON_VERSION) +ifeq ($(OS),Darwin) + # avoid having to link in libpython + PYTHON_LDFLAGS+=-undefined dynamic_lookup +endif + +# Some other programs + +AR=ar +RANLIB=ranlib + +# Rules + +BINS=trainNeuralNetwork testNeuralNetwork prepareNeuralLM testNeuralLM prepareNeuralTM +LIBS=neuralLM.a neuralLM.so +OBJS=util.o model.o + +all: $(BINS) $(LIBS) + +clean: + rm -f *.o shared/*.o python/*.o $(BINS) $(LIBS) python/nplm.{cpp,so} + +install: all + mkdir -p ../bin + cp $(BINS) ../bin + mkdir -p ../lib + cp $(LIBS) ../lib + +%.o: %.cpp + $(CXX) -c $(ALL_CFLAGS) $< -o $@ + +shared/%.o: %.cpp + $(CXX) -c -fPIC $(ALL_CFLAGS) $< -o $@ + +trainNeuralNetwork: trainNeuralNetwork.o $(OBJS) + $(CXX) $(ALL_LDFLAGS) $^ $(ALL_LDLIBS) -o $@ + +testNeuralNetwork: testNeuralNetwork.o $(OBJS) + $(CXX) $(ALL_LDFLAGS) $^ $(ALL_LDLIBS) -o $@ + +prepareNeuralLM: prepareNeuralLM.o $(OBJS) + $(CXX) $(ALL_LDFLAGS) $^ $(ALL_LDLIBS) -o $@ + +testNeuralLM: testNeuralLM.o $(OBJS) + $(CXX) $(ALL_LDFLAGS) $^ $(ALL_LDLIBS) -o $@ + +prepareNeuralTM: prepareNeuralTM.o $(OBJS) + $(CXX) $(ALL_LDFLAGS) $^ $(ALL_LDLIBS) -o $@ + +neuralLM.a: neuralLM.o $(OBJS) + rm -f $@ + $(AR) rv $@ $^ + $(RANLIB) $@ + +neuralLM.so: $(addprefix shared/,neuralLM.o $(OBJS)) + $(CXX) -shared $(ALL_LDFLAGS) $^ $(ALL_LDLIBS) -o $@ + +python/nplm.cpp: python/nplm.pyx + $(CYTHON) --cplus $^ + +python/nplm.o: python/nplm.cpp + $(CXX) -c -fPIC -I. $(ALL_CFLAGS) $(PYTHON_CFLAGS) $< -o $@ + +python/nplm.so: python/nplm.o $(addprefix shared/,neuralLM.o $(OBJS)) + $(CXX) -shared $(ALL_LDFLAGS) $(PYTHON_LDFLAGS) $^ $(ALL_LDLIBS) $(PYTHON_LDLIBS) -o $@ diff --git a/src/SoftmaxLoss.h b/src/SoftmaxLoss.h new file mode 100644 index 0000000..77d94ca --- /dev/null +++ b/src/SoftmaxLoss.h @@ -0,0 +1,136 @@ +#ifndef SOFTMAXLOSS_H +#define SOFTMAXLOSS_H + +#include <Eigen/Dense> +#include "multinomial.h" +#include "util.h" + +namespace nplm +{ + +// is this cheating? +using Eigen::Matrix; +using Eigen::MatrixBase; +using Eigen::Dynamic; + +///// Softmax layer plus log-loss function. + +enum loss_function_type { LogLoss, NCELoss, InvalidLoss }; + +inline loss_function_type string_to_loss_function (const std::string &s) +{ + if (s == "log") + return LogLoss; + else if (s == "nce") + return NCELoss; + else + return InvalidLoss; +} + +inline std::string loss_function_to_string (loss_function_type f) +{ + if (f == LogLoss) + return "log"; + else if (f == NCELoss) + return "nce"; +} + +/// Note: Outputs log-probabilities. + +struct SoftmaxLogLoss +{ + template <typename DerivedI, typename DerivedW, typename DerivedO> + void fProp(const MatrixBase<DerivedI> &input, const MatrixBase<DerivedW> &output_words, const MatrixBase<DerivedO> &output_const, double &loss) + { + UNCONST(DerivedO, output_const, output); + + double log_likelihood = 0.0; + + #pragma omp parallel for reduction(+:log_likelihood) + for (int train_id = 0; train_id < input.cols(); train_id++) + { + double normalization = logsum(input.col(train_id)); + output.col(train_id).array() = input.col(train_id).array() - normalization; + log_likelihood += output(output_words(train_id), train_id); + } + loss = log_likelihood; + } + + template <typename DerivedW, typename DerivedO, typename DerivedI> + void bProp(const MatrixBase<DerivedW> &output_words, const MatrixBase<DerivedO> &output, const MatrixBase<DerivedI> &grad_input_const) + { + UNCONST(DerivedI, grad_input_const, grad_input); + grad_input.setZero(); + #pragma omp parallel for + for (int train_id = 0; train_id < output.cols(); train_id++) + { + grad_input(output_words(train_id), train_id) += 1.; + grad_input.col(train_id) -= output.col(train_id).array().exp().matrix(); + } + } +}; + +///// Softmax layer plus NCE loss function. + +///// Note: Outputs probabilities. + +///// Note: Unlike SoftmaxLogLoss, does not compute *or* apply precomputed +///// normalizations. Currently the caller is expected to do normalization. + +template <typename Multinomial> +class SoftmaxNCELoss +{ + const Multinomial &unigram; + +public: + SoftmaxNCELoss(const Multinomial &unigram) + : unigram(unigram) + { + } + + template <typename DerivedI, typename DerivedW, typename DerivedO> + void fProp(const MatrixBase<DerivedI> &scores, + const MatrixBase<DerivedW> &minibatch_samples, + const MatrixBase<DerivedO> &output_const, double &loss) + { + UNCONST(DerivedO, output_const, output); + double log_likelihood = 0.0; + int num_noise_samples = minibatch_samples.rows()-1; + double log_num_noise_samples = std::log(num_noise_samples); + #pragma omp parallel for reduction(+:log_likelihood) schedule(static) + for (int train_id = 0; train_id < scores.cols(); train_id++) + { + for (int sample_id = 0;sample_id < minibatch_samples.rows(); sample_id++) + { + int sample = minibatch_samples(sample_id, train_id); + // To avoid zero or infinite probabilities, + // never take exp of score without normalizing first, + // even if it's a little slower... + double score = scores(sample_id, train_id); + double score_noise = log_num_noise_samples + unigram.logprob(sample); + double z = logadd(score, score_noise); + double logprob = score - z; + double logprob_noise = score_noise - z; + output(sample_id, train_id) = std::exp(logprob); + log_likelihood += sample_id == 0 ? logprob : logprob_noise; + } + } + loss = log_likelihood; + } + + template <typename DerivedO, typename DerivedI> + void bProp(const MatrixBase<DerivedO> &probs, const MatrixBase<DerivedI> &output_const) + { + UNCONST(DerivedI, output_const, output); + #pragma omp parallel for schedule(static) + for (int train_id = 0; train_id < probs.cols(); train_id++) + { + output.col(train_id) = -probs.col(train_id); + output(0, train_id) += 1.0; + } + } +}; + +} // namespace nplm + +#endif diff --git a/src/USCMatrix.h b/src/USCMatrix.h new file mode 100644 index 0000000..caa9553 --- /dev/null +++ b/src/USCMatrix.h @@ -0,0 +1,192 @@ +#ifndef USCMATRIX_H +#define USCMATRIX_H + +#include <Eigen/Dense> +#include "maybe_omp.h" +#include "util.h" + +namespace nplm +{ + +// is this cheating? +using Eigen::Matrix; +using Eigen::MatrixBase; +using Eigen::Dynamic; + +// USC = Uniform Sparse Columns. A USCMatrix is a sparse matrix in which +// each column has exactly k nonzero entries. This allows for a +// simpler and faster compressed representation. + +// A USCMatrix can be converted into CSC format fairly easily, by +// adding a third array [0, k, 2k, ..., nk]. However, the indices will +// not be unique. + +// We use: +// dense2 = dense1^T * sparse (output bProp, input fProp) +// dense1 = sparse * dense2^T (output computeGradient, input computeGradient) +// where: +// sparse is vocab_size x minibatch_size +// dense1 is vocab_size x embedding_dimension +// dense2 is embedding_dimension x minibatch_size + +template <typename Scalar, typename Index=int> // should be EIGEN_DEFAULT_DENSE_INDEX_TYPE but int is smaller +class USCMatrix +{ + +public: + Matrix<Index,Dynamic,Dynamic> indexes; + Matrix<Scalar,Dynamic,Dynamic> values; + int m_rows; + + USCMatrix() : m_rows(0) { } + + template <typename Indexes, typename Values> + USCMatrix(Index rows, const MatrixBase<Indexes> &indexes, const MatrixBase<Values> &values) + : + indexes(indexes), + values(values), + m_rows(rows) + { } + + USCMatrix(Index rows, Index nnz, Index cols) + : + indexes(Matrix<Index,Dynamic,Dynamic>(nnz, cols)), + values(Matrix<Scalar,Dynamic,Dynamic>(nnz, cols)), + m_rows(rows) + { + this->indexes.fill(-1); + } + + Index rows() const { return m_rows; } + Index cols() const { return indexes.cols(); } + + void resize(Index rows, Index nnz, Index cols) { + indexes.resize(nnz, cols); + values.resize(nnz, cols); + m_rows = rows; + } +}; + +// Dense matrix - sparse matrix product +// a is presumably very wide +template <typename DerivedA, typename ScalarB, typename Index, typename DerivedC> +void uscgemm(double alpha, const MatrixBase<DerivedA> &a, + const USCMatrix<ScalarB,Index> &b, + const MatrixBase<DerivedC> &c_const) +{ + UNCONST(DerivedC, c_const, c); + eigen_assert(a.rows() == c.rows()); + eigen_assert(a.cols() == b.rows()); + eigen_assert(b.cols() == c.cols()); + + #pragma omp parallel for + for (Index k=0; k<b.cols(); k++) + for (Index r=0; r<b.indexes.rows(); r++) + { + Index j = b.indexes(r,k); + eigen_assert(j >= 0); + eigen_assert(j < a.cols()); + c.col(k) += alpha * a.col(j) * b.values(r,k); + } +} + +// sparse matrix - dense matrix product +template <typename ScalarA, typename Index, typename DerivedB, typename DerivedC> +void uscgemm(double alpha, + const USCMatrix<ScalarA,Index> &a, + const MatrixBase<DerivedB> &b, + const MatrixBase<DerivedC> &c_const) +{ + UNCONST(DerivedC, c_const, c); + eigen_assert(a.rows() == c.rows()); + eigen_assert(a.cols() == b.rows()); + eigen_assert(b.cols() == c.cols()); + + // This needs to be tuned for each system, unfortunately, + // and seems to vary a lot. A lot. + int i_blocks = omp_get_num_threads()*16; + + // Assume only one block in k direction. + // We don't need to explicitly block in the j direction. + #pragma omp parallel for + for (Index ib=0; ib<i_blocks; ib++) + for (Index j=0; j<a.cols(); j++) + for (Index r=0; r<a.indexes.rows(); r++) + { + Index i = a.indexes(r,j); + eigen_assert(i >= 0); + eigen_assert(i < c.rows()); + if (i % i_blocks == ib) + c.row(i) += alpha * a.values(r,j) * b.row(j); + } + + /* + If c.cols() is really large, then theoretically it seems like we should do: + + parallel for blocks in i direction + for blocks in j direction + pack block of a into smaller sparse matrix + for blocks in k direction + for k + for i (sparse) + for j + c(i,k) += a(i,j) * b(j,k) + + However, the copying of blocks of a doesn't seem practical for any realistic + sizes of c.cols(). + */ +} + +// Dense matrix - dense matrix product, but masked by a sparse matrix, +// that is, compute a*b only for those positions in c.indexes, and put +// them in c.values. + +// a is presumably a very tall matrix. Row-major order is preferred. +// For b, column-major is preferred. + +template <typename DerivedA, typename DerivedB, typename ScalarC, typename Index> +void uscgemm_masked(double alpha, + const MatrixBase<DerivedA> &a, + const MatrixBase<DerivedB> &b, + USCMatrix<ScalarC,Index> &c) +{ + eigen_assert(a.rows() == c.rows()); + eigen_assert(a.cols() == b.rows()); + eigen_assert(b.cols() == c.cols()); + + #pragma omp parallel for + for (Index k=0; k<b.cols(); k++) + for (Index r=0; r<c.indexes.rows(); r++) + { + Index i = c.indexes(r, k); + eigen_assert(i >= 0); + eigen_assert(i < a.rows()); + c.values(r, k) += alpha * a.row(i) * b.col(k); + } +} + +// sparse matrix - dense vector product +template <typename ScalarA, typename Index, typename DerivedB, typename DerivedC> +void uscgemv(double alpha, + const USCMatrix<ScalarA,Index> &a, + const MatrixBase<DerivedB> &b, + const MatrixBase<DerivedC> &c_const) +{ + UNCONST(DerivedC, c_const, c); + eigen_assert(a.rows() == c.rows()); + eigen_assert(a.cols() == b.rows()); + eigen_assert(b.cols() == 1 && c.cols() == 1); + + for (Index j=0; j<a.cols(); j++) + for (Index r=0; r<a.indexes.rows(); r++) + { + Index i = a.indexes(r,j); + eigen_assert(i >= 0); + eigen_assert(i < c.rows()); + c(i) += alpha * a.values(r,j) * b(j); + } +} + +} + +#endif diff --git a/src/graphClasses.h b/src/graphClasses.h new file mode 100644 index 0000000..da5f1af --- /dev/null +++ b/src/graphClasses.h @@ -0,0 +1,60 @@ +//creating the structure of the nn in a graph that will help in performing backpropagation and forward propagation +#pragma once + +#include <cstdlib> +#include "neuralClasses.h" +#include <Eigen/Dense> + +namespace nplm +{ + +template <class X> +class Node { + public: + X * param; //what parameter is this + //vector <void *> children; + //vector <void *> parents; + Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> fProp_matrix; + Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> bProp_matrix; + int minibatch_size; + + public: + Node() : param(NULL), minibatch_size(0) { } + + Node(X *input_param, int minibatch_size) + : param(input_param), + minibatch_size(minibatch_size) + { + resize(minibatch_size); + } + + void resize(int minibatch_size) + { + this->minibatch_size = minibatch_size; + if (param->n_outputs() != -1) + { + fProp_matrix.setZero(param->n_outputs(), minibatch_size); + } + if (param->n_inputs() != -1) + { + bProp_matrix.setZero(param->n_inputs(), minibatch_size); + } + } + + void resize() { resize(minibatch_size); } + + /* + void Fprop(Matrix<double,Dynamic,Dynamic> & input,int n_cols) + { + param->fProp(input,fProp_matrix,0,0,n_cols); + } + void Fprop(Matrix<double,1,Dynamic> & input,int n_cols) + { + param->fProp(input,fProp_matrix,0,0,n_cols); + } + */ + //for f prop, just call the fProp node of the particular parameter. + +}; + +} // namespace nplm diff --git a/src/maybe_omp.h b/src/maybe_omp.h new file mode 100644 index 0000000..562dea6 --- /dev/null +++ b/src/maybe_omp.h @@ -0,0 +1,13 @@ +#ifndef MAYBE_OMP +#define MAYBE_OMP + +#ifdef _OPENMP + #include <omp.h> +#else + #define omp_get_thread_num(x) 0 + #define omp_set_num_threads(n) + #define omp_get_num_threads() 1 + #define omp_get_max_threads() 1 +#endif + +#endif diff --git a/src/model.cpp b/src/model.cpp new file mode 100644 index 0000000..3611975 --- /dev/null +++ b/src/model.cpp @@ -0,0 +1,246 @@ +#include <cstdlib> +#include <iostream> +#include <boost/lexical_cast.hpp> + +#include "model.h" +#include "param.h" + +using namespace std; +using namespace boost; +using namespace boost::random; + +namespace nplm +{ + + void model::resize(int ngram_size, + int input_vocab_size, + int output_vocab_size, + int input_embedding_dimension, + int num_hidden, + int output_embedding_dimension) +{ + input_layer.resize(input_vocab_size, input_embedding_dimension, ngram_size-1); + first_hidden_linear.resize(num_hidden, input_embedding_dimension*(ngram_size-1)); + first_hidden_activation.resize(num_hidden); + second_hidden_linear.resize(output_embedding_dimension, num_hidden); + second_hidden_activation.resize(output_embedding_dimension); + output_layer.resize(output_vocab_size, output_embedding_dimension); + this->ngram_size = ngram_size; + this->input_vocab_size = input_vocab_size; + this->output_vocab_size = output_vocab_size; + this->input_embedding_dimension = input_embedding_dimension; + this->num_hidden = num_hidden; + this->output_embedding_dimension = output_embedding_dimension; + premultiplied = false; +} + +void model::initialize(mt19937 &init_engine, bool init_normal, double init_range, double init_bias) +{ + input_layer.initialize(init_engine, init_normal, init_range); + output_layer.initialize(init_engine, init_normal, init_range, init_bias); + first_hidden_linear.initialize(init_engine, init_normal, init_range); + second_hidden_linear.initialize(init_engine, init_normal, init_range); +} + +void model::premultiply() +{ + // Since input and first_hidden_linear are both linear, + // we can multiply them into a single linear layer *if* we are not training + int context_size = ngram_size-1; + Matrix<double,Dynamic,Dynamic> U = first_hidden_linear.U; + first_hidden_linear.U.resize(num_hidden, input_vocab_size * context_size); + for (int i=0; i<context_size; i++) + first_hidden_linear.U.middleCols(i*input_vocab_size, input_vocab_size) = U.middleCols(i*input_embedding_dimension, input_embedding_dimension) * input_layer.W->transpose(); + input_layer.W->resize(1,1); // try to save some memory + premultiplied = true; +} + +void model::readConfig(ifstream &config_file) +{ + string line; + vector<string> fields; + int ngram_size, vocab_size, input_embedding_dimension, num_hidden, output_embedding_dimension; + activation_function_type activation_function = this->activation_function; + while (getline(config_file, line) && line != "") + { + splitBySpace(line, fields); + if (fields[0] == "ngram_size") + ngram_size = lexical_cast<int>(fields[1]); + else if (fields[0] == "vocab_size") + input_vocab_size = output_vocab_size = lexical_cast<int>(fields[1]); + else if (fields[0] == "input_vocab_size") + input_vocab_size = lexical_cast<int>(fields[1]); + else if (fields[0] == "output_vocab_size") + output_vocab_size = lexical_cast<int>(fields[1]); + else if (fields[0] == "input_embedding_dimension") + input_embedding_dimension = lexical_cast<int>(fields[1]); + else if (fields[0] == "num_hidden") + num_hidden = lexical_cast<int>(fields[1]); + else if (fields[0] == "output_embedding_dimension") + output_embedding_dimension = lexical_cast<int>(fields[1]); + else if (fields[0] == "activation_function") + activation_function = string_to_activation_function(fields[1]); + else if (fields[0] == "version") + { + int version = lexical_cast<int>(fields[1]); + if (version != 1) + { + cerr << "error: file format mismatch (expected 1, found " << version << ")" << endl; + exit(1); + } + } + else + cerr << "warning: unrecognized field in config: " << fields[0] << endl; + } + resize(ngram_size, + input_vocab_size, + output_vocab_size, + input_embedding_dimension, + num_hidden, + output_embedding_dimension); + set_activation_function(activation_function); +} + +void model::readConfig(const string &filename) +{ + ifstream config_file(filename.c_str()); + if (!config_file) + { + cerr << "error: could not open config file " << filename << endl; + exit(1); + } + readConfig(config_file); + config_file.close(); +} + +void model::read(const string &filename) +{ + vector<string> input_words; + vector<string> output_words; + read(filename, input_words, output_words); +} + +void model::read(const string &filename, vector<string> &input_words, vector<string> &output_words) +{ + ifstream file(filename.c_str()); + if (!file) throw runtime_error("Could not open file " + filename); + + param myParam; + string line; + + while (getline(file, line)) + { + if (line == "\\config") + { + readConfig(file); + } + + else if (line == "\\vocab") + { + input_words.clear(); + readWordsFile(file, input_words); + output_words = input_words; + } + + else if (line == "\\input_vocab") + { + input_words.clear(); + readWordsFile(file, input_words); + } + + else if (line == "\\output_vocab") + { + output_words.clear(); + readWordsFile(file, output_words); + } + + else if (line == "\\input_embeddings") + input_layer.read(file); + else if (line == "\\hidden_weights 1") + first_hidden_linear.read(file); + else if (line == "\\hidden_weights 2") + second_hidden_linear.read(file); + else if (line == "\\output_weights") + output_layer.read_weights(file); + else if (line == "\\output_biases") + output_layer.read_biases(file); + else if (line == "\\end") + break; + else if (line == "") + continue; + else + { + cerr << "warning: unrecognized section: " << line << endl; + // skip over section + while (getline(file, line) && line != "") { } + } + } + file.close(); +} + + void model::write(const string &filename, const vector<string> &input_words, const vector<string> &output_words) +{ + write(filename, &input_words, &output_words); +} + +void model::write(const string &filename) +{ + write(filename, NULL, NULL); +} + + void model::write(const string &filename, const vector<string> *input_pwords, const vector<string> *output_pwords) +{ + ofstream file(filename.c_str()); + if (!file) throw runtime_error("Could not open file " + filename); + + file << "\\config" << endl; + file << "version 1" << endl; + file << "ngram_size " << ngram_size << endl; + file << "input_vocab_size " << input_vocab_size << endl; + file << "output_vocab_size " << output_vocab_size << endl; + file << "input_embedding_dimension " << input_embedding_dimension << endl; + file << "num_hidden " << num_hidden << endl; + file << "output_embedding_dimension " << output_embedding_dimension << endl; + file << "activation_function " << activation_function_to_string(activation_function) << endl; + file << endl; + + if (input_pwords) + { + file << "\\input_vocab" << endl; + writeWordsFile(*input_pwords, file); + file << endl; + } + + if (output_pwords) + { + file << "\\output_vocab" << endl; + writeWordsFile(*output_pwords, file); + file << endl; + } + + file << "\\input_embeddings" << endl; + input_layer.write(file); + file << endl; + + file << "\\hidden_weights 1" << endl; + first_hidden_linear.write(file); + file << endl; + + file << "\\hidden_weights 2" << endl; + second_hidden_linear.write(file); + file << endl; + + file << "\\output_weights" << endl; + output_layer.write_weights(file); + file << endl; + + file << "\\output_biases" << endl; + output_layer.write_biases(file); + file << endl; + + file << "\\end" << endl; + file.close(); +} + + +} // namespace nplm diff --git a/src/model.h b/src/model.h new file mode 100644 index 0000000..271b22f --- /dev/null +++ b/src/model.h @@ -0,0 +1,105 @@ +#ifndef MODEL_H +#define MODEL_H + +#include <iostream> +#include <vector> +#include <string> +#include <boost/random/mersenne_twister.hpp> + +#include "neuralClasses.h" +#include "Activation_function.h" + +namespace nplm +{ + +class model { +public: + Input_word_embeddings input_layer; + Linear_layer first_hidden_linear; + Activation_function first_hidden_activation; + Linear_layer second_hidden_linear; + Activation_function second_hidden_activation; + Output_word_embeddings output_layer; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> output_embedding_matrix, + input_embedding_matrix, + input_and_output_embedding_matrix; + + activation_function_type activation_function; + int ngram_size, input_vocab_size, output_vocab_size, input_embedding_dimension, num_hidden, output_embedding_dimension; + bool premultiplied; + + model(int ngram_size, + int input_vocab_size, + int output_vocab_size, + int input_embedding_dimension, + int num_hidden, + int output_embedding_dimension, + bool share_embeddings) + { + if (share_embeddings){ + input_and_output_embedding_matrix = Matrix<double,Dynamic,Dynamic,Eigen::RowMajor>(); + input_layer.set_W(&input_and_output_embedding_matrix); + output_layer.set_W(&input_and_output_embedding_matrix); + } + else { + input_embedding_matrix = Matrix<double,Dynamic,Dynamic,Eigen::RowMajor>(); + output_embedding_matrix = Matrix<double,Dynamic,Dynamic,Eigen::RowMajor>(); + input_layer.set_W(&input_embedding_matrix); + output_layer.set_W(&output_embedding_matrix); + } + resize(ngram_size, + input_vocab_size, + output_vocab_size, + input_embedding_dimension, + num_hidden, + output_embedding_dimension); + } + model() : ngram_size(1), + premultiplied(false), + activation_function(Rectifier), + output_embedding_matrix(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor>()), + input_embedding_matrix(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor>()) + { + output_layer.set_W(&output_embedding_matrix); + input_layer.set_W(&input_embedding_matrix); + } + + void resize(int ngram_size, + int input_vocab_size, + int output_vocab_size, + int input_embedding_dimension, + int num_hidden, + int output_embedding_dimension); + + void initialize(boost::random::mt19937 &init_engine, + bool init_normal, + double init_range, + double init_bias); + void set_activation_function(activation_function_type f) + { + activation_function = f; + first_hidden_activation.set_activation_function(f); + second_hidden_activation.set_activation_function(f); + } + + void premultiply(); + + // Since the vocabulary is not essential to the model, + // we need a version with and without a vocabulary. + // If the number of "extra" data structures like this grows, + // a better solution is needed + + void read(const std::string &filename); + void read(const std::string &filename, std::vector<std::string> &input_words, std::vector<std::string> &output_words); + void write(const std::string &filename, const std::vector<std::string> &input_words, const std::vector<std::string> &output_words); + void write(const std::string &filename); + + private: + void readConfig(std::ifstream &config_file); + void readConfig(const std::string &filename); + void write(const std::string &filename, const std::vector<std::string> *input_pwords, const std::vector<std::string> *output_pwords); +}; + +} //namespace nplm + +#endif diff --git a/src/multinomial.h b/src/multinomial.h new file mode 100644 index 0000000..1314fcb --- /dev/null +++ b/src/multinomial.h @@ -0,0 +1,135 @@ +#ifndef MULTINOMIAL_H +#define MULTINOMIAL_H + +#include <vector> +#include <set> +#include <cassert> +#include <cmath> + +#include <boost/random/uniform_int_distribution.hpp> +#include <boost/random/uniform_real_distribution.hpp> + +namespace nplm +{ + +template <typename Count> +class multinomial { + std::vector<int> J; + std::vector<double> q; + boost::random::uniform_int_distribution<Count> unif_int; + boost::random::uniform_real_distribution<> unif_real; + std::vector<double> m_prob, m_logprob; + +public: + multinomial() : unif_real(0.0, 1.0) { } + multinomial(const std::vector<Count> &counts) : unif_real(0.0, 1.0) { estimate(counts); } + + void estimate(const std::vector<Count>& counts) + { + int k = counts.size(); + Count n = 0; + m_prob.clear(); + m_prob.resize(k, 0.0); + m_logprob.clear(); + m_logprob.resize(k, 0.0); + for (int i=0; i<k; i++) + n += counts[i]; + for (int i=0; i<k; i++) + { + m_prob[i] = static_cast<double>(counts[i]) / n; + m_logprob[i] = std::log(m_prob[i]); + } + setup(m_prob); + } + + double prob(int i) const { return m_prob[i]; } + double logprob(int i) const { return m_logprob[i]; } + + template <typename Engine> + int sample(Engine &eng) const + { + int m = unif_int(eng); + double p = unif_real(eng); + int s; + if (q[m] > p) + s = m; + else + s = J[m]; + assert (s >= 0); + return s; + } + +private: + void setup(const std::vector<double>& probs) + { + int k = probs.size(); + + unif_int = boost::random::uniform_int_distribution<Count>(0, k-1); + J.resize(k, -1); + q.resize(k, 0); + + // "small" outcomes (prob < 1/k) + std::set<int> S; + std::set<int>::iterator s_it; + // "large" outcomes (prob >= 1/k) + std::set<int> L; + std::set<int>::iterator l_it; + const double tol = 1e-3; + + for (int i=0; i<k; i++) + { + q[i] = k*probs[i]; + if (q[i] < 1.0) + { + S.insert(i); + } + else + { + L.insert(i); + } + } + + while (S.size() > 0 && L.size() > 0) + { + // choose an arbitrary element s from S and l from L + s_it = S.begin(); + int s = *s_it; + l_it = L.begin(); + int l = *l_it; + + // pair up s and (part of) l as its alias + J[s] = l; + S.erase(s_it); + //q[l] = q[l] - (1.0 - q[s]); + q[l] = q[l] + q[s] - 1.0; // more stable? + + // move l from L to S if necessary + if (q[l] < 1.0) + { + S.insert(l); + L.erase(l_it); + } + } + + // any remaining elements must have q/n close to 1, so we leave them alone + for (s_it = S.begin(); s_it != S.end(); ++s_it) { + //assert (fabs(q[*s_it] - 1) < tol); + if (std::fabs(q[*s_it] - 1) > tol) + { + std::cerr << "warning: multinomial: probability differs from one by " << std::fabs(q[*s_it]-1) << std::endl; + } + q[*s_it] = 1.0; + } + for (l_it = L.begin(); l_it != L.end(); ++l_it) { + if (std::fabs(q[*l_it] - 1) > tol) + { + std::cerr << "warning: multinomial: probability differs from one by " << std::fabs(q[*l_it]-1) << std::endl; + } + } + } + +}; + +} // namespace nplm + +#endif diff --git a/src/neuralClasses.h b/src/neuralClasses.h new file mode 100644 index 0000000..afd91f1 --- /dev/null +++ b/src/neuralClasses.h @@ -0,0 +1,520 @@ +#pragma once +#include <iostream> +#include <fstream> +#include <algorithm> +#include <cassert> +#include <cmath> +#include <vector> + +#include <boost/unordered_map.hpp> +#include <Eigen/Dense> +#include "maybe_omp.h" + +#include "util.h" +#include "graphClasses.h" +#include "USCMatrix.h" + +// classes for various kinds of layers +#include "SoftmaxLoss.h" +#include "Activation_function.h" + +//#define EIGEN_DONT_PARALLELIZE +//#define EIGEN_DEFAULT_TO_ROW_MAJOR + +namespace nplm +{ + +// is this cheating? +using Eigen::Matrix; +using Eigen::MatrixBase; +using Eigen::Dynamic; + +typedef boost::unordered_map<int,bool> int_map; + +class Linear_layer +{ + private: + Matrix<double,Dynamic,Dynamic> U; + Matrix<double,Dynamic,Dynamic> U_gradient; + Matrix<double,Dynamic,Dynamic> U_velocity; + Matrix<double,Dynamic,Dynamic> U_running_gradient; + + friend class model; + + public: + Linear_layer() { } + Linear_layer(int rows, int cols) { resize(rows, cols); } + + void resize(int rows, int cols) + { + U.setZero(rows, cols); + U_gradient.setZero(rows, cols); + U_running_gradient.setZero(rows, cols); + U_velocity.setZero(rows, cols); + } + + void read(std::ifstream &U_file) { readMatrix(U_file, U); } + void write(std::ofstream &U_file) { writeMatrix(U, U_file); } + + template <typename Engine> + void initialize(Engine &engine, bool init_normal, double init_range) + { + initMatrix(engine, U, init_normal, init_range); + } + + int n_inputs () const { return U.cols(); } + int n_outputs () const { return U.rows(); } + + template <typename DerivedIn, typename DerivedOut> + void fProp(const MatrixBase<DerivedIn> &input, const MatrixBase<DerivedOut> &output) const + { + UNCONST(DerivedOut, output, my_output); + my_output.leftCols(input.cols()).noalias() = U*input; + } + + // Sparse input + template <typename ScalarIn, typename DerivedOut> + void fProp(const USCMatrix<ScalarIn> &input, const MatrixBase<DerivedOut> &output_const) const + { + UNCONST(DerivedOut, output_const, output); + output.setZero(); + uscgemm(1.0, U, input, output.leftCols(input.cols())); + } + + template <typename DerivedGOut, typename DerivedGIn> + void bProp(const MatrixBase<DerivedGOut> &input, MatrixBase<DerivedGIn> &output) const + { + UNCONST(DerivedGIn, output, my_output); + my_output.noalias() = U.transpose()*input; + } + + template <typename DerivedGOut, typename DerivedIn> + void computeGradient(const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &fProp_input, + double learning_rate, double momentum, double L2_reg) + { + U_gradient.noalias() = bProp_input*fProp_input.transpose(); + + // This used to be multithreaded, but there was no measureable difference + if (L2_reg > 0.0) + { + U_gradient *= 1 - 2*L2_reg; + } + if (momentum > 0.0) + { + U_velocity = momentum*U_velocity + U_gradient; + U += learning_rate * U_velocity; + } + else + { + U += learning_rate * U_gradient; + } + } + + template <typename DerivedGOut, typename DerivedIn> + void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &fProp_input, + double learning_rate, double momentum, double L2_reg) + { + U_gradient.noalias() = bProp_input*fProp_input.transpose(); + + if (L2_reg != 0) + { + U_gradient *= 1 - 2*L2_reg; + } + + // ignore momentum? + + U_running_gradient.array() += U_gradient.array().square(); + U.array() += learning_rate * U_gradient.array() / U_running_gradient.array().sqrt(); + } + + template <typename DerivedGOut, typename DerivedIn, typename DerivedGW> + void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &fProp_input, + const MatrixBase<DerivedGW> &gradient) const + { + UNCONST(DerivedGW, gradient, my_gradient); + my_gradient.noalias() = bProp_input*fProp_input.transpose(); + } +}; + +class Output_word_embeddings +{ + private: + // row-major is better for uscgemm + //Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W; + // Having W be a pointer to a matrix allows ease of sharing + // input and output word embeddings + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W; + std::vector<double> W_data; + Matrix<double,Dynamic,1> b; + Matrix<double,Dynamic,Dynamic> W_running_gradient; + Matrix<double,Dynamic,Dynamic> W_gradient; + Matrix<double,Dynamic,1> b_running_gradient; + Matrix<double,Dynamic,1> b_gradient; + + public: + Output_word_embeddings() { } + Output_word_embeddings(int rows, int cols) { resize(rows, cols); } + + void resize(int rows, int cols) + { + W->setZero(rows, cols); + b.setZero(rows); + } + void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) { + W = input_W; + } + void read_weights(std::ifstream &W_file) { readMatrix(W_file, *W); } + void write_weights(std::ofstream &W_file) { writeMatrix(*W, W_file); } + void read_biases(std::ifstream &b_file) { readMatrix(b_file, b); } + void write_biases(std::ofstream &b_file) { writeMatrix(b, b_file); } + + template <typename Engine> + void initialize(Engine &engine, bool init_normal, double init_range, double init_bias) + { + initMatrix(engine, *W, init_normal, init_range); + b.fill(init_bias); + } + + int n_inputs () const { return W->cols(); } + int n_outputs () const { return W->rows(); } + + template <typename DerivedIn, typename DerivedOut> + void fProp(const MatrixBase<DerivedIn> &input, + const MatrixBase<DerivedOut> &output) const + { + UNCONST(DerivedOut, output, my_output); + my_output = ((*W) * input).colwise() + b; + } + + // Sparse output version + template <typename DerivedIn, typename DerivedOutI, typename DerivedOutV> + void fProp(const MatrixBase<DerivedIn> &input, + const MatrixBase<DerivedOutI> &samples, + const MatrixBase<DerivedOutV> &output) const + { + UNCONST(DerivedOutV, output, my_output); + #pragma omp parallel for + for (int instance_id = 0; instance_id < samples.cols(); instance_id++) + for (int sample_id = 0; sample_id < samples.rows(); sample_id++) + my_output(sample_id, instance_id) = b(samples(sample_id, instance_id)); + USCMatrix<double> sparse_output(W->rows(), samples, my_output); + uscgemm_masked(1.0, *W, input, sparse_output); + my_output = sparse_output.values; // too bad, so much copying + } + + // Return single element of output matrix + template <typename DerivedIn> + double fProp(const MatrixBase<DerivedIn> &input, + int word, + int instance) const + { + return W->row(word).dot(input.col(instance)) + b(word); + } + + // Dense versions (for log-likelihood loss) + + template <typename DerivedGOut, typename DerivedGIn> + void bProp(const MatrixBase<DerivedGOut> &input_bProp_matrix, + const MatrixBase<DerivedGIn> &bProp_matrix) const + { + // W is vocab_size x output_embedding_dimension + // input_bProp_matrix is vocab_size x minibatch_size + // bProp_matrix is output_embedding_dimension x minibatch_size + UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix); + my_bProp_matrix.leftCols(input_bProp_matrix.cols()).noalias() = + W->transpose() * input_bProp_matrix; + } + + template <typename DerivedIn, typename DerivedGOut> + void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOut> &bProp_input, + double learning_rate, + double momentum) //not sure if we want to use momentum here + { + // W is vocab_size x output_embedding_dimension + // b is vocab_size x 1 + // predicted_embeddings is output_embedding_dimension x minibatch_size + // bProp_input is vocab_size x minibatch_size + + W->noalias() += learning_rate * bProp_input * predicted_embeddings.transpose(); + b += learning_rate * bProp_input.rowwise().sum(); + } + + // Sparse versions + + template <typename DerivedGOutI, typename DerivedGOutV, typename DerivedGIn> + void bProp(const MatrixBase<DerivedGOutI> &samples, + const MatrixBase<DerivedGOutV> &weights, + const MatrixBase<DerivedGIn> &bProp_matrix) const + { + UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix); + my_bProp_matrix.setZero(); + uscgemm(1.0, + W->transpose(), + USCMatrix<double>(W->rows(), samples, weights), + my_bProp_matrix.leftCols(samples.cols())); // narrow bProp_matrix for possible short minibatch + } + + template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV> + void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOutI> &samples, + const MatrixBase<DerivedGOutV> &weights, + double learning_rate, double momentum) //not sure if we want to use momentum here + { + USCMatrix<double> gradient_output(W->rows(), samples, weights); + uscgemm(learning_rate, + gradient_output, + predicted_embeddings.leftCols(gradient_output.cols()).transpose(), + *W); // narrow predicted_embeddings for possible short minibatch + uscgemv(learning_rate, + gradient_output, + Matrix<double,Dynamic,1>::Ones(gradient_output.cols()), + b); + } + + template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV> + void computeGradientAdagrad(const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOutI> &samples, + const MatrixBase<DerivedGOutV> &weights, + double learning_rate, double momentum) //not sure if we want to use momentum here + { + W_gradient.setZero(W->rows(), W->cols()); + b_gradient.setZero(b.size()); + if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols()) + W_running_gradient.setZero(W->rows(), W->cols()); + if (b_running_gradient.size() != b.size()) + b_running_gradient.setZero(b.size()); + + USCMatrix<double> gradient_output(W->rows(), samples, weights); + uscgemm(learning_rate, + gradient_output, + predicted_embeddings.leftCols(samples.cols()).transpose(), + W_gradient); + uscgemv(learning_rate, gradient_output, + Matrix<double,Dynamic,1>::Ones(weights.cols()), + b_gradient); + + int_map update_map; //stores all the parameters that have been updated + for (int sample_id=0; sample_id<samples.rows(); sample_id++) + for (int train_id=0; train_id<samples.cols(); train_id++) + update_map[samples(sample_id, train_id)] = 1; + + // Convert to std::vector for parallelization + std::vector<int> update_items; + for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) + update_items.push_back(it->first); + int num_items = update_items.size(); + + #pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) + { + int update_item = update_items[item_id]; + W_running_gradient.row(update_item).array() += W_gradient.row(update_item).array().square(); + b_running_gradient(update_item) += b_gradient(update_item) * b_gradient(update_item); + W->row(update_item).array() += learning_rate * W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt(); + b(update_item) += learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item)); + } + } + + template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV, typename DerivedGW, typename DerivedGb> + void computeGradientCheck(const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOutI> &samples, + const MatrixBase<DerivedGOutV> &weights, + const MatrixBase<DerivedGW> &gradient_W, + const MatrixBase<DerivedGb> &gradient_b) const + { + UNCONST(DerivedGW, gradient_W, my_gradient_W); + UNCONST(DerivedGb, gradient_b, my_gradient_b); + my_gradient_W.setZero(); + my_gradient_b.setZero(); + USCMatrix<double> gradient_output(W->rows(), samples, weights); + uscgemm(1.0, + gradient_output, + predicted_embeddings.leftCols(samples.cols()).transpose(), + my_gradient_W); + uscgemv(1.0, gradient_output, + Matrix<double,Dynamic,1>::Ones(weights.cols()), my_gradient_b); + } +}; + +class Input_word_embeddings +{ + private: + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W; + int context_size, vocab_size; + Matrix<double,Dynamic,Dynamic> W_running_gradient; + Matrix<double,Dynamic,Dynamic> W_gradient; + + friend class model; + + public: + Input_word_embeddings() : context_size(0), vocab_size(0) { } + Input_word_embeddings(int rows, int cols, int context) { resize(rows, cols, context); } + + void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) { + W = input_W; + } + + void resize(int rows, int cols, int context) + { + context_size = context; + vocab_size = rows; + W->setZero(rows, cols); + } + + void read(std::ifstream &W_file) { readMatrix(W_file, *W); } + void write(std::ofstream &W_file) { writeMatrix(*W, W_file); } + + template <typename Engine> + void initialize(Engine &engine, bool init_normal, double init_range) + { + initMatrix(engine, + *W, + init_normal, + init_range); + } + + int n_inputs() const { return -1; } + int n_outputs() const { return W->cols() * context_size; } + + // set output_id's embedding to the weighted average of all embeddings + template <typename Dist> + void average(const Dist &dist, int output_id) + { + W->row(output_id).setZero(); + for (int i=0; i < W->rows(); i++) + if (i != output_id) + W->row(output_id) += dist.prob(i) * W->row(i); + } + + template <typename DerivedIn, typename DerivedOut> + void fProp(const MatrixBase<DerivedIn> &input, + const MatrixBase<DerivedOut> &output) const + { + int embedding_dimension = W->cols(); + + // W is vocab_size x embedding_dimension + // input is ngram_size*vocab_size x minibatch_size + // output is ngram_size*embedding_dimension x minibatch_size + + /* + // Dense version: + for (int ngram=0; ngram<context_size; ngram++) + output.middleRows(ngram*embedding_dimension, embedding_dimension) = W.transpose() * input.middleRows(ngram*vocab_size, vocab_size); + */ + + UNCONST(DerivedOut, output, my_output); + my_output.setZero(); + for (int ngram=0; ngram<context_size; ngram++) + { + // input might be narrower than expected due to a short minibatch, + // so narrow output to match + uscgemm(1.0, + W->transpose(), + USCMatrix<double>(W->rows(),input.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input.cols())), + my_output.block(ngram*embedding_dimension, 0, embedding_dimension, input.cols())); + } + } + + // When model is premultiplied, this layer doesn't get used, + // but this method is used to get the input into a sparse matrix. + // Hopefully this can get eliminated someday + template <typename DerivedIn, typename ScalarOut> + void munge(const MatrixBase<DerivedIn> &input, USCMatrix<ScalarOut> &output) const + { + output.resize(vocab_size*context_size, context_size, input.cols()); + for (int i=0; i < context_size; i++) + output.indexes.row(i).array() = input.row(i).array() + i*vocab_size; + output.values.fill(1.0); + } + + template <typename DerivedGOut, typename DerivedIn> + void computeGradient(const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &input_words, + double learning_rate, double momentum, double L2_reg) + { + int embedding_dimension = W->cols(); + + // W is vocab_size x embedding_dimension + // input is ngram_size*vocab_size x minibatch_size + // bProp_input is ngram_size*embedding_dimension x minibatch_size + + /* + // Dense version: + for (int ngram=0; ngram<context_size; ngram++) + W += learning_rate * input_words.middleRows(ngram*vocab_size, vocab_size) * bProp_input.middleRows(ngram*embedding_dimension, embedding_dimension).transpose() + */ + + for (int ngram=0; ngram<context_size; ngram++) + { + uscgemm(learning_rate, + USCMatrix<double>(W->rows(), input_words.middleRows(ngram, 1), Matrix<double,1,Dynamic>::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension,0,embedding_dimension,input_words.cols()).transpose(), + *W); + } + } + + template <typename DerivedGOut, typename DerivedIn> + void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &input_words, + double learning_rate, double momentum, double L2_reg) + { + int embedding_dimension = W->cols(); + + W_gradient.setZero(W->rows(), W->cols()); + if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols()) + W_running_gradient.setZero(W->rows(), W->cols()); + + for (int ngram=0; ngram<context_size; ngram++) + { + uscgemm(learning_rate, + USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), + W_gradient); + } + + int_map update_map; //stores all the parameters that have been updated + + for (int train_id=0; train_id<input_words.cols(); train_id++) + { + update_map[input_words(train_id)] = 1; + } + + // Convert to std::vector for parallelization + std::vector<int> update_items; + for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) + { + update_items.push_back(it->first); + } + int num_items = update_items.size(); + + #pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) + { + int update_item = update_items[item_id]; + W_running_gradient.row(update_item).array() += W_gradient.row(update_item).array().square(); + W->row(update_item).array() += learning_rate * W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt(); + } + } + + template <typename DerivedGOut, typename DerivedIn, typename DerivedGW> + void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &input_words, + int x, int minibatch_size, + const MatrixBase<DerivedGW> &gradient) const //not sure if we want to use momentum here + { + UNCONST(DerivedGW, gradient, my_gradient); + int embedding_dimension = W->cols(); + my_gradient.setZero(); + for (int ngram=0; ngram<context_size; ngram++) + uscgemm(1.0, + USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), + my_gradient); + } +}; + +} // namespace nplm diff --git a/src/neuralLM.cpp b/src/neuralLM.cpp new file mode 100644 index 0000000..19d84e8 --- /dev/null +++ b/src/neuralLM.cpp @@ -0,0 +1 @@ +#include "neuralLM.h" diff --git a/src/neuralLM.h b/src/neuralLM.h new file mode 100644 index 0000000..0c54bfd --- /dev/null +++ b/src/neuralLM.h @@ -0,0 +1,350 @@ +#ifndef NEURALLM_H +#define NEURALLM_H + +#include <vector> +#include <iostream> +#include <fstream> +#include <memory> +#include <stdexcept> +#include <cctype> +#include <cstdlib> +#include <boost/lexical_cast.hpp> + +#include <Eigen/Dense> + +#include "param.h" +#include "util.h" +#include "model.h" +#include "propagator.h" +#include "neuralClasses.h" +#include "vocabulary.h" + +namespace nplm +{ + +class neuralLM +{ + bool normalization; + char map_digits; + + vocabulary input_vocab, output_vocab; + model nn; + propagator prop; + + int ngram_size; + int width; + + double weight; + +private: + std::size_t cache_size; + Eigen::Matrix<int,Dynamic,Dynamic> cache_keys; + std::vector<double> cache_values; + int cache_lookups, cache_hits; + + Eigen::Matrix<int,Eigen::Dynamic,1> ngram; // buffer for lookup_ngram + int start, null; + +public: + neuralLM() + : ngram_size(1), + normalization(false), + weight(1.), + map_digits(0), + width(1), + prop(nn, 1), + cache_size(0) + { + } + + void set_normalization(bool value) { normalization = value; } + void set_log_base(double value) { weight = 1./std::log(value); } + void set_map_digits(char value) { map_digits = value; } + + // This must be called if the underlying model is resized. + void resize() { + ngram_size = nn.ngram_size; + ngram.setZero(ngram_size); + if (cache_size) + { + cache_keys.resize(ngram_size, cache_size); + cache_keys.fill(-1); + } + prop.resize(); + } + + void set_width(int width) + { + this->width = width; + prop.resize(width); + } + + void set_input_vocabulary(const vocabulary &vocab) + { + this->input_vocab = vocab; + start = input_vocab.lookup_word("<s>"); + null = input_vocab.lookup_word("<null>"); + } + + void set_output_vocabulary(const vocabulary &vocab) + { + this->output_vocab = vocab; + } + + const vocabulary &get_vocabulary() const { return this->input_vocab; } + + int lookup_input_word(const std::string &word) const + { + if (map_digits) + for (int i=0; i<word.length(); i++) + if (isdigit(word[i])) + { + std::string mapped_word(word); + for (; i<word.length(); i++) + if (isdigit(word[i])) + mapped_word[i] = map_digits; + return input_vocab.lookup_word(mapped_word); + } + return input_vocab.lookup_word(word); + } + + int lookup_word(const std::string &word) const + { + return lookup_input_word(word); + } + + int lookup_output_word(const std::string &word) const + { + if (map_digits) + for (int i=0; i<word.length(); i++) + if (isdigit(word[i])) + { + std::string mapped_word(word); + for (; i<word.length(); i++) + if (isdigit(word[i])) + mapped_word[i] = map_digits; + return output_vocab.lookup_word(mapped_word); + } + return output_vocab.lookup_word(word); + } + + template <typename Derived> + double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram) + { + assert (ngram.rows() == ngram_size); + assert (ngram.cols() == 1); + + std::size_t hash; + if (cache_size) + { + // First look in cache + hash = Eigen::hash_value(ngram) % cache_size; // defined in util.h + cache_lookups++; + if (cache_keys.col(hash) == ngram) + { + cache_hits++; + return cache_values[hash]; + } + } + + // Make sure that we're single threaded. Multithreading doesn't help, + // and in some cases can hurt quite a lot + int save_threads = omp_get_max_threads(); + omp_set_num_threads(1); + int save_eigen_threads = Eigen::nbThreads(); + Eigen::setNbThreads(1); + #ifdef __INTEL_MKL__ + int save_mkl_threads = mkl_get_max_threads(); + mkl_set_num_threads(1); + #endif + + prop.fProp(ngram.col(0)); + + int output = ngram(ngram_size-1, 0); + double log_prob; + + start_timer(3); + if (normalization) + { + Eigen::Matrix<double,Eigen::Dynamic,1> scores(output_vocab.size()); + prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); + double logz = logsum(scores.col(0)); + log_prob = weight * (scores(output, 0) - logz); + } + else + { + log_prob = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, 0); + } + stop_timer(3); + + if (cache_size) + { + // Update cache + cache_keys.col(hash) = ngram; + cache_values[hash] = log_prob; + } + + #ifdef __INTEL_MKL__ + mkl_set_num_threads(save_mkl_threads); + #endif + Eigen::setNbThreads(save_eigen_threads); + omp_set_num_threads(save_threads); + + return log_prob; + } + + // Look up many n-grams in parallel. + template <typename DerivedA, typename DerivedB> + void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const) + { + UNCONST(DerivedB, log_probs_const, log_probs); + assert (ngram.rows() == ngram_size); + assert (ngram.cols() <= width); + + prop.fProp(ngram); + + if (normalization) + { + Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> scores(output_vocab.size(), ngram.cols()); + prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); + + // And softmax and loss + Matrix<double,Dynamic,Dynamic> output_probs(nn.output_vocab_size, ngram.cols()); + double minibatch_log_likelihood; + SoftmaxLogLoss().fProp(scores.leftCols(ngram.cols()), ngram.row(nn.ngram_size-1), output_probs, minibatch_log_likelihood); + for (int j=0; j<ngram.cols(); j++) + { + int output = ngram(ngram_size-1, j); + log_probs(0, j) = weight * output_probs(output, j); + } + } + else + { + for (int j=0; j<ngram.cols(); j++) + { + int output = ngram(ngram_size-1, j); + log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, j); + } + } + } + + double lookup_ngram(const int *ngram_a, int n) + { + for (int i=0; i<ngram_size; i++) + { + if (i-ngram_size+n < 0) + { + if (ngram_a[0] == start) + ngram(i) = start; + else + ngram(i) = null; + } + else + { + ngram(i) = ngram_a[i-ngram_size+n]; + } + } + return lookup_ngram(ngram); + } + + double lookup_ngram(const std::vector<int> &ngram_v) + { + return lookup_ngram(ngram_v.data(), ngram_v.size()); + } + + int get_order() const { return ngram_size; } + + void read(const std::string &filename) + { + std::vector<std::string> input_words; + std::vector<std::string> output_words; + nn.read(filename, input_words, output_words); + set_input_vocabulary(vocabulary(input_words)); + set_output_vocabulary(vocabulary(output_words)); + resize(); + // this is faster but takes more memory + //nn.premultiply(); + } + + void set_cache(std::size_t cache_size) + { + this->cache_size = cache_size; + cache_keys.resize(ngram_size, cache_size); + cache_keys.fill(-1); // clears cache + cache_values.resize(cache_size); + cache_lookups = cache_hits = 0; + } + + double cache_hit_rate() + { + return static_cast<double>(cache_hits)/cache_lookups; + } + +}; + +template <typename T> +void addStartStop(std::vector<T> &input, std::vector<T> &output, int ngram_size, const T &start, const T &stop) +{ + output.clear(); + output.resize(input.size()+ngram_size); + for (int i=0; i<ngram_size-1; i++) + output[i] = start; + std::copy(input.begin(), input.end(), output.begin()+ngram_size-1); + output[output.size()-1] = stop; +} + +template <typename T> +void makeNgrams(const std::vector<T> &input, std::vector<std::vector<T> > &output, int ngram_size) +{ + output.clear(); + for (int j=ngram_size-1; j<input.size(); j++) + { + std::vector<T> ngram(input.begin() + (j-ngram_size+1), input.begin() + j+1); + output.push_back(ngram); + } +} + +inline void preprocessWords(const std::vector<std::string> &words, std::vector< std::vector<int> > &ngrams, + int ngram_size, const vocabulary &vocab, + bool numberize, bool add_start_stop, bool ngramize) +{ + int start = vocab.lookup_word("<s>"); + int stop = vocab.lookup_word("</s>"); + + // convert words to ints + std::vector<int> nums; + if (numberize) { + for (int j=0; j<words.size(); j++) { + nums.push_back(vocab.lookup_word(words[j])); + } + } + else { + for (int j=0; j<words.size(); j++) { + nums.push_back(boost::lexical_cast<int>(words[j])); + } + } + + // convert sequence to n-grams + ngrams.clear(); + if (ngramize) { + std::vector<int> snums; + if (add_start_stop) { + addStartStop<int>(nums, snums, ngram_size, start, stop); + } else { + snums = nums; + } + makeNgrams(snums, ngrams, ngram_size); + } + else { + if (nums.size() != ngram_size) + { + std::cerr << "error: wrong number of fields in line" << std::endl; + std::exit(1); + } + ngrams.push_back(nums); + } +} + +} // namespace nplm + +#endif diff --git a/src/param.h b/src/param.h new file mode 100644 index 0000000..b303514 --- /dev/null +++ b/src/param.h @@ -0,0 +1,58 @@ +#pragma once + +#include <string> + +namespace nplm +{ + +struct param +{ + std::string train_file; + std::string validation_file; + std::string test_file; + + std::string model_file; + + std::string unigram_probs_file; + std::string words_file; + std::string input_words_file; + std::string output_words_file; + std::string model_prefix; + + int ngram_size; + int vocab_size; + int input_vocab_size; + int output_vocab_size; + int num_hidden; + int embedding_dimension; + int input_embedding_dimension; + int output_embedding_dimension; + std::string activation_function; + std::string loss_function; + + int minibatch_size; + int validation_minibatch_size; + int num_epochs; + double learning_rate; + + bool init_normal; + double init_range; + + int num_noise_samples; + + bool use_momentum; + double initial_momentum; + double final_momentum; + + double L2_reg; + + bool normalization; + double normalization_init; + + int num_threads; + + bool share_embeddings; + +}; + +} // namespace nplm diff --git a/src/prepareNeuralLM.cpp b/src/prepareNeuralLM.cpp new file mode 100644 index 0000000..94482d0 --- /dev/null +++ b/src/prepareNeuralLM.cpp @@ -0,0 +1,246 @@ +#include <iostream> +#include <vector> +#include <queue> +#include <boost/unordered_map.hpp> +#include <tclap/CmdLine.h> +#include <boost/algorithm/string/join.hpp> + +#include "neuralLM.h" +#include "util.h" + +using namespace std; +using namespace TCLAP; +using namespace boost; +using namespace nplm; + +void writeNgrams(const vector<vector<string> > &data, + int ngram_size, const vocabulary &vocab, + bool numberize, bool add_start_stop, bool ngramize, + const string &filename) +{ + ofstream file(filename.c_str()); + if (!file) + { + cerr << "error: could not open " << filename << endl; + exit(1); + } + + vector<vector<int> > ngrams; + for (int i=0; i<data.size(); i++) { + preprocessWords(data[i], ngrams, ngram_size, vocab, numberize, add_start_stop, ngramize); + // write out n-grams + for (int j=0; j<ngrams.size(); j++) + { + for (int k=0; k<ngram_size; k++) + { + file << ngrams[j][k] << " "; + } + file << endl; + } + } + file.close(); +} + +int main(int argc, char *argv[]) +{ + int ngram_size, vocab_size, validation_size; + bool numberize, ngramize, add_start_stop; + string train_text, train_file, validation_text, validation_file, words_file, write_words_file; + + try + { + CmdLine cmd("Prepares training data for training a language model.", ' ', "0.1"); + + // The options are printed in reverse order + + ValueArg<bool> arg_ngramize("", "ngramize", "If true, convert lines to ngrams. Default: true.", false, true, "bool", cmd); + ValueArg<bool> arg_numberize("", "numberize", "If true, convert words to numbers. Default: true.", false, true, "bool", cmd); + ValueArg<bool> arg_add_start_stop("", "add_start_stop", "If true, prepend <s> and append </s>. Default: true.", false, true, "bool", cmd); + + ValueArg<int> arg_vocab_size("", "vocab_size", "Vocabulary size.", false, -1, "int", cmd); + ValueArg<string> arg_words_file("", "words_file", "File specifying words that should be included in vocabulary; all other words will be replaced by <unk>.", false, "", "string", cmd); + ValueArg<int> arg_ngram_size("", "ngram_size", "Size of n-grams.", true, -1, "int", cmd); + ValueArg<string> arg_write_words_file("", "write_words_file", "Output vocabulary.", false, "", "string", cmd); + ValueArg<int> arg_validation_size("", "validation_size", "How many lines from training data to hold out for validation. Default: 0.", false, 0, "int", cmd); + ValueArg<string> arg_validation_file("", "validation_file", "Output validation data (numberized n-grams).", false, "", "string", cmd); + ValueArg<string> arg_validation_text("", "validation_text", "Input validation data (tokenized). Overrides --validation_size. Default: none.", false, "", "string", cmd); + ValueArg<string> arg_train_file("", "train_file", "Output training data (numberized n-grams).", false, "", "string", cmd); + ValueArg<string> arg_train_text("", "train_text", "Input training data (tokenized).", true, "", "string", cmd); + + cmd.parse(argc, argv); + + train_text = arg_train_text.getValue(); + train_file = arg_train_file.getValue(); + validation_text = arg_validation_text.getValue(); + validation_file = arg_validation_file.getValue(); + validation_size = arg_validation_size.getValue(); + write_words_file = arg_write_words_file.getValue(); + ngram_size = arg_ngram_size.getValue(); + vocab_size = arg_vocab_size.getValue(); + words_file = arg_words_file.getValue(); + numberize = arg_numberize.getValue(); + ngramize = arg_ngramize.getValue(); + add_start_stop = arg_add_start_stop.getValue(); + + // check command line arguments + + // Notes: + // - either --words_file or --vocab_size is required. + // - if --words_file is set, + // - if --vocab_size is not set, it is inferred from the length of the file + // - if --vocab_size is set, it is an error if the vocab file has a different number of lines + // - if --numberize 0 is set and --words_file f is not set, then the output model file will not have a vocabulary, and a warning should be printed. + + // Notes: + // - if --ngramize 0 is set, then + // - if --ngram_size is not set, it is inferred from the training file (different from current) + // - if --ngram_size is set, it is an error if the training file has a different n-gram size + // - if neither --validation_file or --validation_size is set, validation will not be performed. + // - if --numberize 0 is set, then --validation_size cannot be used. + + cerr << "Command line: " << endl; + cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl; + + const string sep(" Value: "); + cerr << arg_train_text.getDescription() << sep << arg_train_text.getValue() << endl; + cerr << arg_train_file.getDescription() << sep << arg_train_file.getValue() << endl; + cerr << arg_validation_text.getDescription() << sep << arg_validation_text.getValue() << endl; + cerr << arg_validation_file.getDescription() << sep << arg_validation_file.getValue() << endl; + cerr << arg_validation_size.getDescription() << sep << arg_validation_size.getValue() << endl; + cerr << arg_write_words_file.getDescription() << sep << arg_write_words_file.getValue() << endl; + cerr << arg_ngram_size.getDescription() << sep << arg_ngram_size.getValue() << endl; + cerr << arg_vocab_size.getDescription() << sep << arg_vocab_size.getValue() << endl; + cerr << arg_words_file.getDescription() << sep << arg_words_file.getValue() << endl; + cerr << arg_numberize.getDescription() << sep << arg_numberize.getValue() << endl; + cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl; + cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl; + } + catch (TCLAP::ArgException &e) + { + cerr << "error: " << e.error() << " for arg " << e.argId() << endl; + exit(1); + } + + // VLF: why is this true? + // DC: it's because the vocabulary has to be constructed from the training data only. + // If the vocabulary is preset, we can't create the validation data. + // - if --numberize 0 is set, then --validation_size cannot be used. + // if (!numberize && (validation_size > 0)) { + // cerr << "Warning: without setting --numberize to 1, --validation_size cannot be used." << endl; + // } + + // Read in training data and validation data + vector<vector<string> > train_data; + readSentFile(train_text, train_data); + for (int i=0; i<train_data.size(); i++) { + // if data is already ngramized, set/check ngram_size + if (!ngramize) { + if (ngram_size > 0) { + if (ngram_size != train_data[i].size()) { + cerr << "Error: size of training ngrams does not match specified value of --ngram_size!" << endl; + } + } + // else if --ngram_size has not been specified, set it now + else { + ngram_size=train_data[i].size(); + } + } + } + + vector<vector<string> > validation_data; + if (validation_text != "") { + readSentFile(validation_text, validation_data); + for (int i=0; i<validation_data.size(); i++) { + // if data is already ngramized, set/check ngram_size + if (!ngramize) { + // if --ngram_size has been specified, check that it does not conflict with --ngram_size + if (ngram_size > 0) { + if (ngram_size != validation_data[i].size()) { + cerr << "Error: size of validation ngrams does not match specified value of --ngram_size!" << endl; + } + } + // else if --ngram_size has not been specified, set it now + else { + ngram_size=validation_data[i].size(); + } + } + } + } + else if (validation_size > 0) + { + // Create validation data + if (validation_size > train_data.size()) + { + cerr << "error: requested validation size is greater than training data size" << endl; + exit(1); + } + validation_data.insert(validation_data.end(), train_data.end()-validation_size, train_data.end()); + train_data.resize(train_data.size() - validation_size); + } + + // Construct vocabulary + vocabulary vocab; + int start, stop; + + // read vocabulary from file + if (words_file != "") { + vector<string> words; + readWordsFile(words_file,words); + for(vector<string>::iterator it = words.begin(); it != words.end(); ++it) { + vocab.insert_word(*it); + } + + // was vocab_size set? if so, verify that it does not conflict with size of vocabulary read from file + if (vocab_size > 0) { + if (vocab.size() != vocab_size) { + cerr << "Error: size of vocabulary file " << vocab.size() << " != --vocab_size " << vocab_size << endl; + } + } + // else, set it to the size of vocabulary read from file + else { + vocab_size = vocab.size(); + } + + } + + // construct vocabulary to contain top <vocab_size> most frequent words; all other words replaced by <unk> + else { + vocab.insert_word("<s>"); + vocab.insert_word("</s>"); + vocab.insert_word("<null>"); + + // warn user that if --numberize is not set, there will be no vocabulary! + if (!numberize) { + cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" << endl; + } + unordered_map<string,int> count; + for (int i=0; i<train_data.size(); i++) { + for (int j=0; j<train_data[i].size(); j++) { + count[train_data[i][j]] += 1; + } + } + + vocab.insert_most_frequent(count, vocab_size); + if (vocab.size() < vocab_size) { + cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl; + } + } + + // write vocabulary to file + if (write_words_file != "") { + cerr << "Writing vocabulary to " << write_words_file << endl; + writeWordsFile(vocab.words(), write_words_file); + } + + // Write out numberized n-grams + if (train_file != "") + { + cerr << "Writing training data to " << train_file << endl; + writeNgrams(train_data, ngram_size, vocab, numberize, add_start_stop, ngramize, train_file); + } + if (validation_file != "") + { + cerr << "Writing validation data to " << validation_file << endl; + writeNgrams(validation_data, ngram_size, vocab, numberize, add_start_stop, ngramize, validation_file); + } +} diff --git a/src/prepareNeuralTM.cpp b/src/prepareNeuralTM.cpp new file mode 100644 index 0000000..8d7cbf8 --- /dev/null +++ b/src/prepareNeuralTM.cpp @@ -0,0 +1,396 @@ +#include <iostream> +#include <vector> +#include <queue> +#include <boost/unordered_map.hpp> +#include <tclap/CmdLine.h> +#include <boost/algorithm/string/join.hpp> + +using namespace std; +using namespace TCLAP; + +#include "neuralLM.h" // for vocabulary +#include "util.h" + +using namespace boost; +using namespace nplm; + +void writeNgrams(const vector<vector<string> > &input_data, const vector<vector<string> > &output_data, int ngram_size, const vocabulary &input_vocab, const vocabulary &output_vocab, bool numberize, bool ngramize, const string &filename) +{ + ofstream file(filename.c_str()); + if (!file) + { + cerr << "error: could not open " << filename << endl; + exit(1); + } + + // check that input and output data have the same number of sentences + if (input_data.size() != output_data.size()) { + cerr << "Error: input and output data files have different number of lines" << endl; + exit(1); + } + + // for each input and output line + int lines=input_data.size(); + if (numberize) { + for (int i=0; i<lines; i++) { + // convert each line to a set of ngrams + vector<vector<int> > input_ngrams; + vector<int> input_nums; + for (int j=0; j<input_data[i].size(); j++) { + input_nums.push_back(input_vocab.lookup_word(input_data[i][j])); + } + makeNgrams(input_nums, input_ngrams, ngram_size-1); + + vector<vector<int> > output_ngrams; + vector<int> output_nums; + for (int j=0; j<output_data[i].size(); j++) { + output_nums.push_back(output_vocab.lookup_word(output_data[i][j])); + } + makeNgrams(output_nums, output_ngrams, 1); + + // print out cross product of input and output ngrams + for (int j=0; j < input_ngrams.size(); j++) { + for (int k=0; k < output_ngrams.size(); k++) { + int j_prime; + for (j_prime=0; j_prime < input_ngrams[j].size()-1; j_prime++) { + file << input_ngrams[j][j_prime] << " "; + } + file << input_ngrams[j][j_prime]; + int k_prime; + for (k_prime=0; k_prime < output_ngrams[k].size(); k_prime++) { + file << " " << output_ngrams[k][k_prime]; + } + file << endl; + } + } + } + } + + else { + for (int i=0; i<lines; i++) { + // convert each line to a set of ngrams + vector<vector<string> > input_ngrams; + vector<string> input_words; + for (int j=0; j<input_data[i].size(); j++) { + int unk = input_vocab.lookup_word("<unk>"); + // if word is unknown + if (input_vocab.lookup_word(input_data[i][j]) == unk) { + input_words.push_back("<unk>"); + } + // if word is known + else { + input_words.push_back(input_data[i][j]); + } + } + makeNgrams(input_words, input_ngrams, ngram_size-1); + + vector<vector<string> > output_ngrams; + vector<string> output_words; + for (int j=0; j<output_data[i].size(); j++) { + int unk = output_vocab.lookup_word("<unk>"); + // if word is unknown + if (output_vocab.lookup_word(output_data[i][j]) == unk) { + output_words.push_back("<unk>"); + } + // if word is known + else { + output_words.push_back(output_data[i][j]); + } + } + makeNgrams(output_words, output_ngrams, 1); + + // print out cross product of input and output ngrams + for (int j=0; j < input_ngrams.size(); j++) { + for (int k=0; k < output_ngrams.size(); k++) { + int j_prime; + for (j_prime=0; j_prime < input_ngrams[j].size()-1; j_prime++) { + file << input_ngrams[j][j_prime] << " "; + } + file << input_ngrams[j][j_prime]; + int k_prime; + for (k_prime=0; k_prime < output_ngrams[k].size(); k_prime++) { + file << " " << output_ngrams[k][k_prime]; + } + file << endl; + } + } + } + } + file.close(); +} + +int main(int argc, char *argv[]) +{ + int ngram_size, input_vocab_size, output_vocab_size, validation_size; + bool add_start_stop, numberize, ngramize; + string input_train_text, output_train_text, train_file, input_validation_text, output_validation_text, validation_file, write_input_words_file, write_output_words_file, input_words_file, output_words_file; + + try + { + CmdLine cmd("Prepares training data for training a language model.", ' ', "0.1"); + + // The options are printed in reverse order + + ValueArg<bool> arg_ngramize("", "ngramize", "If true, convert lines to ngrams. Default: true.", false, true, "bool", cmd); + ValueArg<bool> arg_numberize("", "numberize", "If true, convert words to numbers. Default: true.", false, true, "bool", cmd); + ValueArg<bool> arg_add_start_stop("", "add_start_stop", "If true, prepend (ngram_size-1) start symbols and postpend 1 stop symbol. Default: true.", false, true, "bool", cmd); + ValueArg<int> arg_input_vocab_size("", "input_vocab_size", "Vocabulary size.", false, -1, "int", cmd); + ValueArg<int> arg_output_vocab_size("", "output_vocab_size", "Vocabulary size.", false, -1, "int", cmd); + ValueArg<string> arg_input_words_file("", "input_words_file", "File specifying words that should be included in vocabulary; all other words will be replaced by <unk>.", false, "", "string", cmd); + ValueArg<string> arg_output_words_file("", "output_words_file", "File specifying words that should be included in vocabulary; all other words will be replaced by <unk>.", false, "", "string", cmd); + ValueArg<int> arg_ngram_size("", "ngram_size", "Size of n-grams.", true, -1, "int", cmd); + ValueArg<string> arg_write_input_words_file("", "write_input_words_file", "Output vocabulary.", false, "", "string", cmd); + ValueArg<string> arg_write_output_words_file("", "write_output_words_file", "Output vocabulary.", false, "", "string", cmd); + ValueArg<int> arg_validation_size("", "validation_size", "How many lines from training data to hold out for validation. Default: 0.", false, 0, "int", cmd); + ValueArg<string> arg_validation_file("", "validation_file", "Output validation data (numberized n-grams).", false, "", "string", cmd); + ValueArg<string> arg_input_validation_text("", "input_validation_text", "Input validation data (tokenized). Overrides --validation_size. Default: none.", false, "", "string", cmd); + ValueArg<string> arg_output_validation_text("", "output_validation_text", "Input validation data (tokenized). Overrides --validation_size. Default: none.", false, "", "string", cmd); + ValueArg<string> arg_train_file("", "train_file", "Output training data (numberized n-grams).", false, "", "string", cmd); + ValueArg<string> arg_input_train_text("", "input_train_text", "Input training data (tokenized).", true, "", "string", cmd); + ValueArg<string> arg_output_train_text("", "output_train_text", "Input training data (tokenized).", true, "", "string", cmd); + + cmd.parse(argc, argv); + + input_train_text = arg_input_train_text.getValue(); + output_train_text = arg_output_train_text.getValue(); + train_file = arg_train_file.getValue(); + validation_file = arg_validation_file.getValue(); + input_validation_text = arg_input_validation_text.getValue(); + output_validation_text = arg_output_validation_text.getValue(); + input_validation_text = arg_input_validation_text.getValue(); + output_validation_text = arg_output_validation_text.getValue(); + validation_size = arg_validation_size.getValue(); + write_input_words_file = arg_write_input_words_file.getValue(); + write_output_words_file = arg_write_output_words_file.getValue(); + ngram_size = arg_ngram_size.getValue(); + input_vocab_size = arg_input_vocab_size.getValue(); + output_vocab_size = arg_output_vocab_size.getValue(); + input_words_file = arg_input_words_file.getValue(); + output_words_file = arg_output_words_file.getValue(); + numberize = arg_numberize.getValue(); + ngramize = arg_ngramize.getValue(); + add_start_stop = arg_add_start_stop.getValue(); + + // check command line arguments + + // Notes: + // - either --words_file or --vocab_size is required. + // - if --words_file is set, + // - if --vocab_size is not set, it is inferred from the length of the file + // - if --vocab_size is set, it is an error if the vocab file has a different number of lines + // - if --numberize 0 is set and --use_vocab f is not set, then the output model file will not have a vocabulary, and a warning should be printed. + if ((input_words_file == "") && (input_vocab_size == -1)) { + cerr << "Error: either --input_words_file or --input_vocab_size is required." << endl; + exit(1); + } + if ((output_words_file == "") && (output_vocab_size == -1)) { + cerr << "Error: either --output_words_file or --output_vocab_size is required." << endl; + exit(1); + } + + // Notes: + // - if --ngramize 0 is set, then + // - if --ngram_size is not set, it is inferred from the training file (different from current) + // - if --ngram_size is set, it is an error if the training file has a different n-gram size + // - if neither --validation_file or --validation_size is set, validation will not be performed. + // - if --numberize 0 is set, then --validation_size cannot be used. + + cerr << "Command line: " << endl; + cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl; + + const string sep(" Value: "); + cerr << arg_input_train_text.getDescription() << sep << arg_input_train_text.getValue() << endl; + cerr << arg_output_train_text.getDescription() << sep << arg_output_train_text.getValue() << endl; + cerr << arg_train_file.getDescription() << sep << arg_train_file.getValue() << endl; + cerr << arg_input_validation_text.getDescription() << sep << arg_input_validation_text.getValue() << endl; + cerr << arg_output_validation_text.getDescription() << sep << arg_output_validation_text.getValue() << endl; + cerr << arg_validation_file.getDescription() << sep << arg_validation_file.getValue() << endl; + cerr << arg_validation_size.getDescription() << sep << arg_validation_size.getValue() << endl; + cerr << arg_write_input_words_file.getDescription() << sep << arg_write_input_words_file.getValue() << endl; + cerr << arg_write_output_words_file.getDescription() << sep << arg_write_output_words_file.getValue() << endl; + cerr << arg_ngram_size.getDescription() << sep << arg_ngram_size.getValue() << endl; + cerr << arg_input_vocab_size.getDescription() << sep << arg_input_vocab_size.getValue() << endl; + cerr << arg_output_vocab_size.getDescription() << sep << arg_output_vocab_size.getValue() << endl; + cerr << arg_input_words_file.getDescription() << sep << arg_input_words_file.getValue() << endl; + cerr << arg_output_words_file.getDescription() << sep << arg_output_words_file.getValue() << endl; + cerr << arg_numberize.getDescription() << sep << arg_numberize.getValue() << endl; + cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl; + cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl; + } + catch (TCLAP::ArgException &e) + { + cerr << "error: " << e.error() << " for arg " << e.argId() << endl; + exit(1); + } + + // Read in input training data and validation data + vector<vector<string> > input_train_data; + readSentFile(input_train_text, input_train_data); + if (add_start_stop) { + for (int i=0; i<input_train_data.size(); i++) { + vector<string> input_train_data_start_stop; + addStartStop<string>(input_train_data[i], input_train_data_start_stop, ngram_size, "<s>", "</s>"); + input_train_data[i]=input_train_data_start_stop; + } + } + + vector<vector<string> > input_validation_data; + if (input_validation_text != "") { + readSentFile(input_validation_text, input_validation_data); + if (add_start_stop) { + for (int i=0; i<input_validation_data.size(); i++) { + vector<string> input_validation_data_start_stop; + addStartStop<string>(input_validation_data[i], input_validation_data_start_stop, ngram_size, "<s>", "</s>"); + input_validation_data[i]=input_validation_data_start_stop; + } + } + } + else if (validation_size > 0) + { + if (validation_size > input_train_data.size()) + { + cerr << "error: requested input_validation size is greater than training data size" << endl; + exit(1); + } + input_validation_data.insert(input_validation_data.end(), input_train_data.end()-validation_size, input_train_data.end()); + input_train_data.resize(input_train_data.size() - validation_size); + } + + // Read in output training data and validation data + vector<vector<string> > output_train_data; + readSentFile(output_train_text, output_train_data); + if (add_start_stop) { + for (int i=0; i<output_train_data.size(); i++) { + vector<string> output_train_data_start_stop; + addStartStop<string>(output_train_data[i], output_train_data_start_stop, 1, "<s>", "</s>"); + output_train_data[i]=output_train_data_start_stop; + } + } + + vector<vector<string> > output_validation_data; + if (output_validation_text != "") { + readSentFile(output_validation_text, output_validation_data); + if (add_start_stop) { + for (int i=0; i<output_validation_data.size(); i++) { + vector<string> output_validation_data_start_stop; + addStartStop<string>(output_validation_data[i], output_validation_data_start_stop, 1, "<s>", "</s>"); + output_validation_data[i]=output_validation_data_start_stop; + } + } + } + else if (validation_size > 0) + { + if (validation_size > output_train_data.size()) + { + cerr << "error: requested output_validation size is greater than training data size" << endl; + exit(1); + } + output_validation_data.insert(output_validation_data.end(), output_train_data.end()-validation_size, output_train_data.end()); + output_train_data.resize(output_train_data.size() - validation_size); + } + + // Construct input vocabulary + vocabulary input_vocab; + int input_start = input_vocab.insert_word("<s>"); + int input_stop = input_vocab.insert_word("</s>"); + input_vocab.insert_word("<null>"); + + // read input vocabulary from file + if (input_words_file != "") { + vector<string> words; + readWordsFile(input_words_file,words); + for(vector<string>::iterator it = words.begin(); it != words.end(); ++it) { + input_vocab.insert_word(*it); + } + // was input_vocab_size set? if so, verify that it does not conflict with size of vocabulary read from file + if (input_vocab_size > 0) { + if (input_vocab.size() != input_vocab_size) { + cerr << "Error: size of input_vocabulary file " << input_vocab.size() << " != --input_vocab_size " << input_vocab_size << endl; + } + } + // else, set it to the size of vocabulary read from file + else { + input_vocab_size = input_vocab.size(); + } + } + + // or construct input vocabulary to contain top <input_vocab_size> most frequent words; all other words replaced by <unk> + else { + unordered_map<string,int> count; + for (int i=0; i<input_train_data.size(); i++) { + for (int j=0; j<input_train_data[i].size(); j++) { + count[input_train_data[i][j]] += 1; + } + } + + input_vocab.insert_most_frequent(count, input_vocab_size); + if (input_vocab.size() < input_vocab_size) { + cerr << "warning: fewer than " << input_vocab_size << " types in training data; the unknown word will not be learned" << endl; + } + } + + // Construct output vocabulary + vocabulary output_vocab; + int output_start = output_vocab.insert_word("<s>"); + int output_stop = output_vocab.insert_word("</s>"); + output_vocab.insert_word("<null>"); + + // read output vocabulary from file + if (output_words_file != "") { + vector<string> words; + readWordsFile(output_words_file,words); + for(vector<string>::iterator it = words.begin(); it != words.end(); ++it) { + output_vocab.insert_word(*it); + } + // was output_vocab_size set? if so, verify that it does not conflict with size of vocabulary read from file + if (output_vocab_size > 0) { + if (output_vocab.size() != output_vocab_size) { + cerr << "Error: size of output_vocabulary file " << output_vocab.size() << " != --output_vocab_size " << output_vocab_size << endl; + } + } + // else, set it to the size of vocabulary read from file + else { + output_vocab_size = output_vocab.size(); + } + } + + // or construct output vocabulary to contain top <output_vocab_size> most frequent words; all other words replaced by <unk> + else { + unordered_map<string,int> count; + for (int i=0; i<output_train_data.size(); i++) { + for (int j=0; j<output_train_data[i].size(); j++) { + count[output_train_data[i][j]] += 1; + } + } + + output_vocab.insert_most_frequent(count, output_vocab_size); + if (output_vocab.size() < output_vocab_size) { + cerr << "warning: fewer than " << output_vocab_size << " types in training data; the unknown word will not be learned" << endl; + } + } + + // write input vocabulary to file + if (write_input_words_file != "") { + cerr << "Writing vocabulary to " << write_input_words_file << endl; + writeWordsFile(input_vocab.words(), write_input_words_file); + } + + // write output vocabulary to file + if (write_output_words_file != "") { + cerr << "Writing vocabulary to " << write_output_words_file << endl; + writeWordsFile(output_vocab.words(), write_output_words_file); + } + + // Write out input and output numberized n-grams + if (train_file != "") + { + cerr << "Writing training data to " << train_file << endl; + writeNgrams(input_train_data, output_train_data, ngram_size, input_vocab, output_vocab, numberize, ngramize, train_file); + + } + if (validation_file != "") + { + cerr << "Writing validation data to " << validation_file << endl; + writeNgrams(input_validation_data, output_validation_data, ngram_size, input_vocab, output_vocab, numberize, ngramize, validation_file); + } +} diff --git a/src/propagator.h b/src/propagator.h new file mode 100644 index 0000000..b79e84a --- /dev/null +++ b/src/propagator.h @@ -0,0 +1,194 @@ +#ifndef NETWORK_H +#define NETWORK_H + +#include "neuralClasses.h" +#include "util.h" + +namespace nplm +{ + +// is this cheating? +using Eigen::Matrix; +using Eigen::MatrixBase; +using Eigen::Dynamic; + +class propagator { + int minibatch_size; + model *pnn; + +public: + Node<Input_word_embeddings> input_layer_node; + Node<Linear_layer> first_hidden_linear_node; + Node<Activation_function> first_hidden_activation_node; + Node<Linear_layer> second_hidden_linear_node; + Node<Activation_function> second_hidden_activation_node; + Node<Output_word_embeddings> output_layer_node; + +public: + propagator () : minibatch_size(0), pnn(0) { } + + propagator (model &nn, int minibatch_size) + : + pnn(&nn), + input_layer_node(&nn.input_layer, minibatch_size), + first_hidden_linear_node(&nn.first_hidden_linear, minibatch_size), + first_hidden_activation_node(&nn.first_hidden_activation, minibatch_size), + second_hidden_linear_node(&nn.second_hidden_linear, minibatch_size), + second_hidden_activation_node(&nn.second_hidden_activation, minibatch_size), + output_layer_node(&nn.output_layer, minibatch_size), + minibatch_size(minibatch_size) + { + } + + // This must be called if the underlying model is resized. + void resize(int minibatch_size) { + this->minibatch_size = minibatch_size; + input_layer_node.resize(minibatch_size); + first_hidden_linear_node.resize(minibatch_size); + first_hidden_activation_node.resize(minibatch_size); + second_hidden_linear_node.resize(minibatch_size); + second_hidden_activation_node.resize(minibatch_size); + output_layer_node.resize(minibatch_size); + } + + void resize() { resize(minibatch_size); } + + template <typename Derived> + void fProp(const MatrixBase<Derived> &data) + { + if (!pnn->premultiplied) + { + start_timer(0); + input_layer_node.param->fProp(data, input_layer_node.fProp_matrix); + stop_timer(0); + + start_timer(1); + first_hidden_linear_node.param->fProp(input_layer_node.fProp_matrix, + first_hidden_linear_node.fProp_matrix); + } + else + { + int n_inputs = first_hidden_linear_node.param->n_inputs(); + USCMatrix<double> sparse_data; + input_layer_node.param->munge(data, sparse_data); + + start_timer(1); + first_hidden_linear_node.param->fProp(sparse_data, + first_hidden_linear_node.fProp_matrix); + } + first_hidden_activation_node.param->fProp(first_hidden_linear_node.fProp_matrix, + first_hidden_activation_node.fProp_matrix); + stop_timer(1); + + + start_timer(2); + second_hidden_linear_node.param->fProp(first_hidden_activation_node.fProp_matrix, + second_hidden_linear_node.fProp_matrix); + second_hidden_activation_node.param->fProp(second_hidden_linear_node.fProp_matrix, + second_hidden_activation_node.fProp_matrix); + stop_timer(2); + + // The propagation stops here because the last layer is very expensive. + } + + // Dense version (for standard log-likelihood) + template <typename DerivedIn, typename DerivedOut> + void bProp(const MatrixBase<DerivedIn> &data, + const MatrixBase<DerivedOut> &output, + double learning_rate, double momentum, double L2_reg) + { + // Output embedding layer + + start_timer(7); + output_layer_node.param->bProp(output, + output_layer_node.bProp_matrix); + stop_timer(7); + + start_timer(8); + output_layer_node.param->computeGradient(second_hidden_activation_node.fProp_matrix, + output, + learning_rate, momentum); + stop_timer(8); + + bPropRest(data, learning_rate, momentum, L2_reg); + } + + // Sparse version (for NCE log-likelihood) + template <typename DerivedIn, typename DerivedOutI, typename DerivedOutV> + void bProp(const MatrixBase<DerivedIn> &data, + const MatrixBase<DerivedOutI> &samples, const MatrixBase<DerivedOutV> &weights, + double learning_rate, double momentum, double L2_reg) + { + + // Output embedding layer + + start_timer(7); + output_layer_node.param->bProp(samples, weights, + output_layer_node.bProp_matrix); + stop_timer(7); + + + start_timer(8); + output_layer_node.param->computeGradient(second_hidden_activation_node.fProp_matrix, + samples, weights, + learning_rate, momentum); + stop_timer(8); + + bPropRest(data, learning_rate, momentum, L2_reg); + } + +private: + template <typename DerivedIn> + void bPropRest(const MatrixBase<DerivedIn> &data, + double learning_rate, double momentum, double L2_reg) + { + // Second hidden layer + + start_timer(9); + second_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix, + second_hidden_activation_node.bProp_matrix, + second_hidden_linear_node.fProp_matrix, + second_hidden_activation_node.fProp_matrix); + + second_hidden_linear_node.param->bProp(second_hidden_activation_node.bProp_matrix, + second_hidden_linear_node.bProp_matrix); + stop_timer(9); + + start_timer(10); + second_hidden_linear_node.param->computeGradient(second_hidden_activation_node.bProp_matrix, + first_hidden_activation_node.fProp_matrix, + learning_rate, momentum, L2_reg); + stop_timer(10); + + // First hidden layer + + start_timer(11); + first_hidden_activation_node.param->bProp(second_hidden_linear_node.bProp_matrix, + first_hidden_activation_node.bProp_matrix, + first_hidden_linear_node.fProp_matrix, + first_hidden_activation_node.fProp_matrix); + + first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix, + first_hidden_linear_node.bProp_matrix); + stop_timer(11); + + start_timer(12); + first_hidden_linear_node.param->computeGradient(first_hidden_activation_node.bProp_matrix, + input_layer_node.fProp_matrix, + learning_rate, momentum, L2_reg); + stop_timer(12); + + // Input word embeddings + + start_timer(13); + input_layer_node.param->computeGradient(first_hidden_linear_node.bProp_matrix, + data, + learning_rate, momentum, L2_reg); + stop_timer(13); + + } +}; + +} // namespace nplm + +#endif diff --git a/src/python/nplm.pxd b/src/python/nplm.pxd new file mode 100644 index 0000000..5cbead7 --- /dev/null +++ b/src/python/nplm.pxd @@ -0,0 +1,23 @@ +from libcpp.string cimport string +from libcpp.vector cimport vector + +cdef extern from "neuralLM.h": + cdef cppclass c_neuralLM "nplm::neuralLM": + c_neuralLM() + void set_normalization(bint) + void set_map_digits(char) + void set_log_base(double) + void read(string filename) except + + int get_order() + int lookup_word(string) + float lookup_ngram(vector[int]) + float lookup_ngram(int *, int) + void set_cache(int) + double cache_hit_rate() + +cdef class NeuralLM: + cdef c_neuralLM *thisptr + cdef int c_lookup_word(self, char *s) + cdef float c_lookup_ngram(self, int *words, int n) + cdef readonly int order + diff --git a/src/python/nplm.pyx b/src/python/nplm.pyx new file mode 100644 index 0000000..290d56c --- /dev/null +++ b/src/python/nplm.pyx @@ -0,0 +1,38 @@ +# distutils: language = c++ + +cdef class NeuralLM: + def __cinit__(self, normalization=False, map_digits=None, cache_size=0): + self.thisptr = new c_neuralLM() + self.thisptr.set_normalization(normalization) + self.thisptr.set_log_base(10.) + if type(map_digits) is str and len(map_digits) == 1: + self.thisptr.set_map_digits(map_digits) + if cache_size: + self.thisptr.set_cache(cache_size) + + def read(self, filename): + self.thisptr.read(filename) + self.order = self.thisptr.get_order() + + def get_order(self): + return self.thisptr.get_order() + + def lookup_word(self, s): + return self.thisptr.lookup_word(s) + + def lookup_ngram(self, words): + if len(words) == 0: + raise ValueError("ngram is empty") + return self.thisptr.lookup_ngram(words) + + def cache_hit_rate(self): + return self.thisptr.cache_hit_rate() + + # low-level interface that can be called by other Cython modules + cdef int c_lookup_word(self, char *s): + cdef string ss + ss.assign(s) + return self.thisptr.lookup_word(ss) + + cdef float c_lookup_ngram(self, int *words, int n): + return self.thisptr.lookup_ngram(words, n) diff --git a/src/shared/.gitignore b/src/shared/.gitignore new file mode 100755 index 0000000..e69de29 --- /dev/null +++ b/src/shared/.gitignore diff --git a/src/testNeuralLM.cpp b/src/testNeuralLM.cpp new file mode 100644 index 0000000..9ab3766 --- /dev/null +++ b/src/testNeuralLM.cpp @@ -0,0 +1,164 @@ +#include <algorithm> +#include <fstream> + +#include <boost/algorithm/string/join.hpp> +#include <tclap/CmdLine.h> + +#include <Eigen/Core> +#include <Eigen/Dense> + +#include "param.h" + +#include "neuralLM.h" + +using namespace std; +using namespace boost; +using namespace TCLAP; +using namespace Eigen; + +using namespace nplm; + +int main (int argc, char *argv[]) +{ + param myParam; + bool normalization; + bool numberize, ngramize, add_start_stop; + + try { + // program options // + CmdLine cmd("Tests a two-layer neural probabilistic language model.", ' ' , "0.1"); + + ValueArg<int> num_threads("", "num_threads", "Number of threads. Default: maximum.", false, 0, "int", cmd); + ValueArg<int> minibatch_size("", "minibatch_size", "Minibatch size. Default: none.", false, 0, "int", cmd); + + ValueArg<bool> arg_ngramize("", "ngramize", "If true, convert lines to ngrams. Default: true.", false, true, "bool", cmd); + ValueArg<bool> arg_numberize("", "numberize", "If true, convert words to numbers. Default: true.", false, true, "bool", cmd); + ValueArg<bool> arg_add_start_stop("", "add_start_stop", "If true, prepend <s> and append </s>. Default: true.", false, true, "bool", cmd); + + ValueArg<bool> arg_normalization("", "normalization", "Normalize probabilities. 1 = yes, 0 = no. Default: 0.", false, 0, "bool", cmd); + + ValueArg<string> arg_test_file("", "test_file", "Test file (one tokenized sentence per line).", true, "", "string", cmd); + + ValueArg<string> arg_model_file("", "model_file", "Language model file.", true, "", "string", cmd); + + cmd.parse(argc, argv); + + myParam.model_file = arg_model_file.getValue(); + myParam.test_file = arg_test_file.getValue(); + + normalization = arg_normalization.getValue(); + numberize = arg_numberize.getValue(); + ngramize = arg_ngramize.getValue(); + add_start_stop = arg_add_start_stop.getValue(); + + myParam.minibatch_size = minibatch_size.getValue(); + myParam.num_threads = num_threads.getValue(); + + cerr << "Command line: " << endl; + cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl; + + const string sep(" Value: "); + cerr << arg_test_file.getDescription() << sep << arg_test_file.getValue() << endl; + cerr << arg_model_file.getDescription() << sep << arg_model_file.getValue() << endl; + + cerr << arg_normalization.getDescription() << sep << arg_normalization.getValue() << endl; + cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl; + cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl; + + cerr << minibatch_size.getDescription() << sep << minibatch_size.getValue() << endl; + cerr << num_threads.getDescription() << sep << num_threads.getValue() << endl; + } + catch (TCLAP::ArgException &e) + { + cerr << "error: " << e.error() << " for arg " << e.argId() << endl; + exit(1); + } + + myParam.num_threads = setup_threads(myParam.num_threads); + + ///// Create language model + + neuralLM lm; + lm.read(myParam.model_file); + lm.set_normalization(normalization); + lm.set_log_base(10); + lm.set_cache(1048576); + int ngram_size = lm.get_order(); + int minibatch_size = myParam.minibatch_size; + if (minibatch_size) + lm.set_width(minibatch_size); + + ///// Read test data + + double log_likelihood = 0.0; + + ifstream test_file(myParam.test_file.c_str()); + if (!test_file) + { + cerr << "error: could not open " << myParam.test_file << endl; + exit(1); + } + string line; + + vector<int> start; + vector<vector<int> > ngrams; + + while (getline(test_file, line)) + { + vector<string> words; + splitBySpace(line, words); + + vector<vector<int> > sent_ngrams; + preprocessWords(words, sent_ngrams, ngram_size, lm.get_vocabulary(), numberize, add_start_stop, ngramize); + + start.push_back(ngrams.size()); + copy(sent_ngrams.begin(), sent_ngrams.end(), back_inserter(ngrams)); + } + start.push_back(ngrams.size()); + + if (minibatch_size == 0) + { + // Score one n-gram at a time. This is how the LM would be queried from a decoder. + for (int sent_id=0; sent_id<start.size()-1; sent_id++) + { + double sent_log_prob = 0.0; + for (int j=start[sent_id]; j<start[sent_id+1]; j++) + sent_log_prob += lm.lookup_ngram(ngrams[j]); + cout << sent_log_prob << endl; + log_likelihood += sent_log_prob; + } + } + else + { + // Score a whole minibatch at a time. + Matrix<double,1,Dynamic> log_probs(ngrams.size()); + + Matrix<int,Dynamic,Dynamic> minibatch(ngram_size, minibatch_size); + minibatch.setZero(); + for (int test_id = 0; test_id < ngrams.size(); test_id += minibatch_size) + { + int current_minibatch_size = minibatch_size<ngrams.size()-test_id ? minibatch_size : ngrams.size()-test_id; + for (int j=0; j<current_minibatch_size; j++) + minibatch.col(j) = Map< Matrix<int,Dynamic,1> > (ngrams[test_id+j].data(), ngram_size); + lm.lookup_ngram(minibatch.leftCols(current_minibatch_size), log_probs.middleCols(test_id, current_minibatch_size)); + } + + for (int sent_id=0; sent_id<start.size()-1; sent_id++) + { + double sent_log_prob = 0.0; + for (int j=start[sent_id]; j<start[sent_id+1]; j++) + sent_log_prob += log_probs[j]; + cout << sent_log_prob << endl; + log_likelihood += sent_log_prob; + } + } + + cerr << "Test log10-likelihood: " << log_likelihood << endl; + #ifdef USE_CHRONO + cerr << "Propagation times:"; + for (int i=0; i<timer.size(); i++) + cerr << " " << timer.get(i); + cerr << endl; + #endif + +} diff --git a/src/testNeuralNetwork.cpp b/src/testNeuralNetwork.cpp new file mode 100644 index 0000000..f20fff9 --- /dev/null +++ b/src/testNeuralNetwork.cpp @@ -0,0 +1,119 @@ +#include <tclap/CmdLine.h> +#include <boost/algorithm/string/join.hpp> +#include <boost/lexical_cast.hpp> +#include <fstream> + +#include "model.h" +#include "propagator.h" +#include "neuralClasses.h" +#include "param.h" +#include "util.h" + +using namespace std; +using namespace boost; +using namespace TCLAP; +using namespace Eigen; + +using namespace nplm; + +int main (int argc, char *argv[]) +{ + param myParam; + + try { + // program options // + CmdLine cmd("Tests a two-layer neural probabilistic language model.", ' ' , "0.1"); + + ValueArg<int> num_threads("", "num_threads", "Number of threads. Default: maximum.", false, 0, "int", cmd); + ValueArg<int> minibatch_size("", "minibatch_size", "Minibatch size. Default: 64.", false, 64, "int", cmd); + + ValueArg<string> arg_test_file("", "test_file", "Test file (one numberized example per line).", true, "", "string", cmd); + + ValueArg<string> arg_model_file("", "model_file", "Model file.", true, "", "string", cmd); + + cmd.parse(argc, argv); + + myParam.model_file = arg_model_file.getValue(); + myParam.test_file = arg_test_file.getValue(); + + myParam.num_threads = num_threads.getValue(); + myParam.minibatch_size = minibatch_size.getValue(); + + cerr << "Command line: " << endl; + cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl; + + const string sep(" Value: "); + cerr << arg_model_file.getDescription() << sep << arg_model_file.getValue() << endl; + cerr << arg_test_file.getDescription() << sep << arg_test_file.getValue() << endl; + + cerr << num_threads.getDescription() << sep << num_threads.getValue() << endl; + } + catch (TCLAP::ArgException &e) + { + cerr << "error: " << e.error() << " for arg " << e.argId() << endl; + exit(1); + } + + myParam.num_threads = setup_threads(myParam.num_threads); + + ///// Create network and propagator + + model nn; + nn.read(myParam.model_file); + myParam.ngram_size = nn.ngram_size; + propagator prop(nn, myParam.minibatch_size); + + ///// Set param values according to what was read in from model file + + myParam.ngram_size = nn.ngram_size; + myParam.input_vocab_size = nn.input_vocab_size; + myParam.output_vocab_size = nn.output_vocab_size; + myParam.num_hidden = nn.num_hidden; + myParam.input_embedding_dimension = nn.input_embedding_dimension; + myParam.output_embedding_dimension = nn.output_embedding_dimension; + + ///// Read test data + + vector<int> test_data_flat; + readDataFile(myParam.test_file, myParam.ngram_size, test_data_flat); + int test_data_size = test_data_flat.size() / myParam.ngram_size; + cerr << "Number of test instances: " << test_data_size << endl; + + Map< Matrix<int,Dynamic,Dynamic> > test_data(test_data_flat.data(), myParam.ngram_size, test_data_size); + + ///// Score test data + + int num_batches = (test_data_size-1)/myParam.minibatch_size + 1; + cerr<<"Number of test minibatches: "<<num_batches<<endl; + + double log_likelihood = 0.0; + + Matrix<double,Dynamic,Dynamic> scores(nn.output_vocab_size, myParam.minibatch_size); + Matrix<double,Dynamic,Dynamic> output_probs(nn.output_vocab_size, myParam.minibatch_size); + + for (int batch = 0; batch < num_batches; batch++) + { + int minibatch_start_index = myParam.minibatch_size * batch; + int current_minibatch_size = min(myParam.minibatch_size, + test_data_size - minibatch_start_index); + Matrix<int,Dynamic,Dynamic> minibatch = test_data.middleCols(minibatch_start_index, current_minibatch_size); + + prop.fProp(minibatch.topRows(myParam.ngram_size-1)); + + // Do full forward prop through output word embedding layer + prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); + + // And softmax and loss + double minibatch_log_likelihood; + SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size), + minibatch.row(myParam.ngram_size-1), + output_probs, + minibatch_log_likelihood); + log_likelihood += minibatch_log_likelihood; + + /*for (int i=0; i<current_minibatch_size; i++) + cerr << minibatch.block(0,i,myParam.ngram_size,1) << " " << output_probs(minibatch(myParam.ngram_size-1,i),i) << endl;*/ + + } + cerr << "Test log-likelihood: " << log_likelihood << endl; +} diff --git a/src/trainNeuralNetwork.cpp b/src/trainNeuralNetwork.cpp new file mode 100644 index 0000000..f45bc71 --- /dev/null +++ b/src/trainNeuralNetwork.cpp @@ -0,0 +1,584 @@ +#include <ctime> +#include <cmath> + +#include <iostream> +#include <fstream> +#include <vector> +#include <algorithm> + +#include <boost/unordered_map.hpp> +#include <boost/functional.hpp> +#include <boost/lexical_cast.hpp> +#include <boost/random/mersenne_twister.hpp> +#include <boost/algorithm/string/join.hpp> + +#include <Eigen/Dense> +#include <Eigen/Sparse> +#include "maybe_omp.h" +#include <tclap/CmdLine.h> + +#include "model.h" +#include "propagator.h" +#include "param.h" +#include "neuralClasses.h" +#include "graphClasses.h" +#include "util.h" +#include "multinomial.h" +//#include "gradientCheck.h" + +//#define EIGEN_DONT_PARALLELIZE + +using namespace std; +using namespace TCLAP; +using namespace Eigen; +using namespace boost; +using namespace boost::random; + +using namespace nplm; + +typedef unordered_map<Matrix<int,Dynamic,1>, double> vector_map; + +typedef long long int data_size_t; // training data can easily exceed 2G instances + +int main(int argc, char** argv) +{ + param myParam; + try { + // program options // + CmdLine cmd("Trains a two-layer neural probabilistic language model.", ' ' , "0.1"); + + // The options are printed in reverse order + + ValueArg<string> unigram_probs_file("", "unigram_probs_file", "Unigram model (deprecated and ignored)." , false, "", "string", cmd); + + ValueArg<int> num_threads("", "num_threads", "Number of threads. Default: maximum.", false, 0, "int", cmd); + + ValueArg<double> final_momentum("", "final_momentum", "Final value of momentum. Default: 0.9.", false, 0.9, "double", cmd); + ValueArg<double> initial_momentum("", "initial_momentum", "Initial value of momentum. Default: 0.9.", false, 0.9, "double", cmd); + ValueArg<bool> use_momentum("", "use_momentum", "Use momentum (hidden layer weights only). 1 = yes, 0 = no. Default: 0.", false, 0, "bool", cmd); + + ValueArg<double> normalization_init("", "normalization_init", "Initial normalization parameter. Default: 0.", false, 0.0, "double", cmd); + ValueArg<bool> normalization("", "normalization", "Learn individual normalization factors during training. 1 = yes, 0 = no. Default: 0.", false, 0, "bool", cmd); + + ValueArg<int> num_noise_samples("", "num_noise_samples", "Number of noise samples for noise-contrastive estimation. Default: 25.", false, 25, "int", cmd); + + ValueArg<double> L2_reg("", "L2_reg", "L2 regularization strength (hidden layer weights only). Default: 0.", false, 0.0, "double", cmd); + + ValueArg<double> learning_rate("", "learning_rate", "Learning rate for stochastic gradient ascent. Default: 0.01.", false, 0.01, "double", cmd); + + ValueArg<int> validation_minibatch_size("", "validation_minibatch_size", "Minibatch size for validation. Default: 64.", false, 64, "int", cmd); + ValueArg<int> minibatch_size("", "minibatch_size", "Minibatch size (for training). Default: 64.", false, 64, "int", cmd); + + ValueArg<int> num_epochs("", "num_epochs", "Number of epochs. Default: 10.", false, 10, "int", cmd); + + ValueArg<double> init_range("", "init_range", "Maximum (of uniform) or standard deviation (of normal) for initialization. Default: 0.01", false, 0.01, "double", cmd); + ValueArg<bool> init_normal("", "init_normal", "Initialize parameters from a normal distribution. 1 = normal, 0 = uniform. Default: 0.", false, 0, "bool", cmd); + + ValueArg<string> loss_function("", "loss_function", "Loss function (log, nce). Default: nce.", false, "nce", "string", cmd); + ValueArg<string> activation_function("", "activation_function", "Activation function (identity, rectifier, tanh, hardtanh). Default: rectifier.", false, "rectifier", "string", cmd); + ValueArg<int> num_hidden("", "num_hidden", "Number of hidden nodes. Default: 100.", false, 100, "int", cmd); + + ValueArg<bool> share_embeddings("", "share_embeddings", "Share input and output embeddings. 1 = yes, 0 = no. Default: 0.", false, 0, "bool", cmd); + ValueArg<int> output_embedding_dimension("", "output_embedding_dimension", "Number of output embedding dimensions. Default: 50.", false, 50, "int", cmd); + ValueArg<int> input_embedding_dimension("", "input_embedding_dimension", "Number of input embedding dimensions. Default: 50.", false, 50, "int", cmd); + ValueArg<int> embedding_dimension("", "embedding_dimension", "Number of input and output embedding dimensions. Default: none.", false, -1, "int", cmd); + + ValueArg<int> vocab_size("", "vocab_size", "Vocabulary size. Default: auto.", false, 0, "int", cmd); + ValueArg<int> input_vocab_size("", "input_vocab_size", "Vocabulary size. Default: auto.", false, 0, "int", cmd); + ValueArg<int> output_vocab_size("", "output_vocab_size", "Vocabulary size. Default: auto.", false, 0, "int", cmd); + ValueArg<int> ngram_size("", "ngram_size", "Size of n-grams. Default: auto.", false, 0, "int", cmd); + + ValueArg<string> model_prefix("", "model_prefix", "Prefix for output model files." , false, "", "string", cmd); + ValueArg<string> words_file("", "words_file", "Vocabulary." , false, "", "string", cmd); + ValueArg<string> input_words_file("", "input_words_file", "Vocabulary." , false, "", "string", cmd); + ValueArg<string> output_words_file("", "output_words_file", "Vocabulary." , false, "", "string", cmd); + ValueArg<string> validation_file("", "validation_file", "Validation data (one numberized example per line)." , false, "", "string", cmd); + ValueArg<string> train_file("", "train_file", "Training data (one numberized example per line)." , true, "", "string", cmd); + + cmd.parse(argc, argv); + + // define program parameters // + myParam.train_file = train_file.getValue(); + myParam.validation_file = validation_file.getValue(); + myParam.input_words_file = input_words_file.getValue(); + myParam.output_words_file = output_words_file.getValue(); + if (words_file.getValue() != "") + myParam.input_words_file = myParam.output_words_file = words_file.getValue(); + + myParam.model_prefix = model_prefix.getValue(); + + myParam.ngram_size = ngram_size.getValue(); + myParam.vocab_size = vocab_size.getValue(); + myParam.input_vocab_size = input_vocab_size.getValue(); + myParam.output_vocab_size = output_vocab_size.getValue(); + if (vocab_size.getValue() >= 0) + myParam.input_vocab_size = myParam.output_vocab_size = vocab_size.getValue(); + + myParam.num_hidden = num_hidden.getValue(); + myParam.activation_function = activation_function.getValue(); + myParam.loss_function = loss_function.getValue(); + + myParam.num_threads = num_threads.getValue(); + + myParam.num_noise_samples = num_noise_samples.getValue(); + + myParam.input_embedding_dimension = input_embedding_dimension.getValue(); + myParam.output_embedding_dimension = output_embedding_dimension.getValue(); + if (embedding_dimension.getValue() >= 0) + myParam.input_embedding_dimension = myParam.output_embedding_dimension = embedding_dimension.getValue(); + + myParam.minibatch_size = minibatch_size.getValue(); + myParam.validation_minibatch_size = validation_minibatch_size.getValue(); + myParam.num_epochs= num_epochs.getValue(); + myParam.learning_rate = learning_rate.getValue(); + myParam.use_momentum = use_momentum.getValue(); + myParam.share_embeddings = share_embeddings.getValue(); + myParam.normalization = normalization.getValue(); + myParam.initial_momentum = initial_momentum.getValue(); + myParam.final_momentum = final_momentum.getValue(); + myParam.L2_reg = L2_reg.getValue(); + myParam.init_normal= init_normal.getValue(); + myParam.init_range = init_range.getValue(); + myParam.normalization_init = normalization_init.getValue(); + + cerr << "Command line: " << endl; + cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl; + + const string sep(" Value: "); + cerr << train_file.getDescription() << sep << train_file.getValue() << endl; + cerr << validation_file.getDescription() << sep << validation_file.getValue() << endl; + cerr << input_words_file.getDescription() << sep << input_words_file.getValue() << endl; + cerr << output_words_file.getDescription() << sep << output_words_file.getValue() << endl; + cerr << model_prefix.getDescription() << sep << model_prefix.getValue() << endl; + + cerr << ngram_size.getDescription() << sep << ngram_size.getValue() << endl; + cerr << input_vocab_size.getDescription() << sep << input_vocab_size.getValue() << endl; + cerr << output_vocab_size.getDescription() << sep << output_vocab_size.getValue() << endl; + + if (embedding_dimension.getValue() >= 0) + { + cerr << embedding_dimension.getDescription() << sep << embedding_dimension.getValue() << endl; + } + else + { + cerr << input_embedding_dimension.getDescription() << sep << input_embedding_dimension.getValue() << endl; + cerr << output_embedding_dimension.getDescription() << sep << output_embedding_dimension.getValue() << endl; + } + cerr << share_embeddings.getDescription() << sep << share_embeddings.getValue() << endl; + if (share_embeddings.getValue() && input_embedding_dimension.getValue() != output_embedding_dimension.getValue()) + { + cerr << "error: sharing input and output embeddings requires that input and output embeddings have same dimension" << endl; + exit(1); + } + + cerr << num_hidden.getDescription() << sep << num_hidden.getValue() << endl; + + if (string_to_activation_function(activation_function.getValue()) == InvalidFunction) + { + cerr << "error: invalid activation function: " << activation_function.getValue() << endl; + exit(1); + } + cerr << activation_function.getDescription() << sep << activation_function.getValue() << endl; + + if (string_to_loss_function(loss_function.getValue()) == InvalidLoss) + { + cerr << "error: invalid loss function: " << loss_function.getValue() << endl; + exit(1); + } + cerr << loss_function.getDescription() << sep << loss_function.getValue() << endl; + + cerr << init_normal.getDescription() << sep << init_normal.getValue() << endl; + cerr << init_range.getDescription() << sep << init_range.getValue() << endl; + + cerr << num_epochs.getDescription() << sep << num_epochs.getValue() << endl; + cerr << minibatch_size.getDescription() << sep << minibatch_size.getValue() << endl; + if (myParam.validation_file != "") + cerr << validation_minibatch_size.getDescription() << sep << validation_minibatch_size.getValue() << endl; + cerr << learning_rate.getDescription() << sep << learning_rate.getValue() << endl; + cerr << L2_reg.getDescription() << sep << L2_reg.getValue() << endl; + + cerr << num_noise_samples.getDescription() << sep << num_noise_samples.getValue() << endl; + + cerr << normalization.getDescription() << sep << normalization.getValue() << endl; + if (myParam.normalization) + cerr << normalization_init.getDescription() << sep << normalization_init.getValue() << endl; + + cerr << use_momentum.getDescription() << sep << use_momentum.getValue() << endl; + if (myParam.use_momentum) + { + cerr << initial_momentum.getDescription() << sep << initial_momentum.getValue() << endl; + cerr << final_momentum.getDescription() << sep << final_momentum.getValue() << endl; + } + + cerr << num_threads.getDescription() << sep << num_threads.getValue() << endl; + + if (unigram_probs_file.getValue() != "") + { + cerr << "Note: --unigram_probs_file is deprecated and ignored." << endl; + } + } + catch (TCLAP::ArgException &e) + { + cerr << "error: " << e.error() << " for arg " << e.argId() << endl; + exit(1); + } + + myParam.num_threads = setup_threads(myParam.num_threads); + int save_threads; + + //unsigned seed = std::time(0); + unsigned seed = 1234; //for testing only + mt19937 rng(seed); + + /////////////////////////READING IN THE TRAINING AND VALIDATION DATA/////////////////// + ///////////////////////////////////////////////////////////////////////////////////// + + // Read training data + vector<int> training_data_flat; + readDataFile(myParam.train_file, myParam.ngram_size, training_data_flat, myParam.minibatch_size); + data_size_t training_data_size = training_data_flat.size() / myParam.ngram_size; + cerr << "Number of training instances: "<< training_data_size << endl; + + Map< Matrix<int,Dynamic,Dynamic> > training_data(training_data_flat.data(), myParam.ngram_size, training_data_size); + + // If neither --input_vocab_size nor --input_words_file is given, set input_vocab_size to the maximum word index + if (myParam.input_vocab_size == 0 and myParam.input_words_file == "") + { + myParam.input_vocab_size = training_data.topRows(myParam.ngram_size-1).maxCoeff()+1; + } + + // If neither --output_vocab_size nor --output_words_file is given, set output_vocab_size to the maximum word index + if (myParam.output_vocab_size == 0 and myParam.words_file == "") + { + myParam.output_vocab_size = training_data.row(myParam.ngram_size-1).maxCoeff()+1; + } + + // Randomly shuffle training data to improve learning + for (data_size_t i=training_data_size-1; i>0; i--) + { + data_size_t j = uniform_int_distribution<data_size_t>(0, i-1)(rng); + training_data.col(i).swap(training_data.col(j)); + } + + // Read validation data + vector<int> validation_data_flat; + int validation_data_size = 0; + + if (myParam.validation_file != "") + { + readDataFile(myParam.validation_file, myParam.ngram_size, validation_data_flat); + validation_data_size = validation_data_flat.size() / myParam.ngram_size; + cerr << "Number of validation instances: " << validation_data_size << endl; + } + + Map< Matrix<int,Dynamic,Dynamic> > validation_data(validation_data_flat.data(), myParam.ngram_size, validation_data_size); + + ///// Read in vocabulary file. We don't actually use it; it just gets reproduced in the output file + + vector<string> input_words; + if (myParam.input_words_file != "") + { + readWordsFile(myParam.input_words_file, input_words); + if (myParam.input_vocab_size == 0) + myParam.input_vocab_size = input_words.size(); + } + + vector<string> output_words; + if (myParam.output_words_file != "") + { + readWordsFile(myParam.output_words_file, output_words); + if (myParam.output_vocab_size == 0) + myParam.output_vocab_size = output_words.size(); + } + + ///// Construct unigram model and sampler that will be used for NCE + + vector<data_size_t> unigram_counts(myParam.output_vocab_size); + for (data_size_t train_id=0; train_id < training_data_size; train_id++) + { + int output_word = training_data(myParam.ngram_size-1, train_id); + unigram_counts[output_word] += 1; + } + multinomial<data_size_t> unigram (unigram_counts); + + ///// Create and initialize the neural network and associated propagators. + + model nn(myParam.ngram_size, + myParam.input_vocab_size, + myParam.output_vocab_size, + myParam.input_embedding_dimension, + myParam.num_hidden, + myParam.output_embedding_dimension, + myParam.share_embeddings); + + nn.initialize(rng, myParam.init_normal, myParam.init_range, -log(myParam.output_vocab_size)); + nn.set_activation_function(string_to_activation_function(myParam.activation_function)); + loss_function_type loss_function = string_to_loss_function(myParam.loss_function); + + propagator prop(nn, myParam.minibatch_size); + propagator prop_validation(nn, myParam.validation_minibatch_size); + SoftmaxNCELoss<multinomial<data_size_t> > softmax_loss(unigram); + // normalization parameters + vector_map c_h, c_h_running_gradient; + + ///////////////////////TRAINING THE NEURAL NETWORK//////////////////////////////////// + ///////////////////////////////////////////////////////////////////////////////////// + + data_size_t num_batches = (training_data_size-1)/myParam.minibatch_size + 1; + cerr<<"Number of training minibatches: "<<num_batches<<endl; + + int num_validation_batches = 0; + if (validation_data_size > 0) + { + num_validation_batches = (validation_data_size-1)/myParam.validation_minibatch_size+1; + cerr<<"Number of validation minibatches: "<<num_validation_batches<<endl; + } + + double current_momentum = myParam.initial_momentum; + double momentum_delta = (myParam.final_momentum - myParam.initial_momentum)/(myParam.num_epochs-1); + double current_learning_rate = myParam.learning_rate; + double current_validation_ll = 0.0; + + int ngram_size = myParam.ngram_size; + int input_vocab_size = myParam.input_vocab_size; + int output_vocab_size = myParam.output_vocab_size; + int minibatch_size = myParam.minibatch_size; + int validation_minibatch_size = myParam.validation_minibatch_size; + int num_noise_samples = myParam.num_noise_samples; + + if (myParam.normalization) + { + for (data_size_t i=0;i<training_data_size;i++) + { + Matrix<int,Dynamic,1> context = training_data.block(0,i,ngram_size-1,1); + if (c_h.find(context) == c_h.end()) + { + c_h[context] = -myParam.normalization_init; + } + } + } + + for (int epoch=0; epoch<myParam.num_epochs; epoch++) + { + cerr << "Epoch " << epoch+1 << endl; + cerr << "Current learning rate: " << current_learning_rate << endl; + + if (myParam.use_momentum) + cerr << "Current momentum: " << current_momentum << endl; + else + current_momentum = -1; + + cerr << "Training minibatches: "; + + double log_likelihood = 0.0; + + int num_samples = 0; + if (loss_function == LogLoss) + num_samples = output_vocab_size; + else if (loss_function == NCELoss) + num_samples = 1+num_noise_samples; + + Matrix<double,Dynamic,Dynamic> minibatch_weights(num_samples, minibatch_size); + Matrix<int,Dynamic,Dynamic> minibatch_samples(num_samples, minibatch_size); + Matrix<double,Dynamic,Dynamic> scores(num_samples, minibatch_size); + Matrix<double,Dynamic,Dynamic> probs(num_samples, minibatch_size); + + for(data_size_t batch=0;batch<num_batches;batch++) + { + if (batch > 0 && batch % 10000 == 0) + { + cerr << batch <<"..."; + } + + data_size_t minibatch_start_index = minibatch_size * batch; + int current_minibatch_size = min(static_cast<data_size_t>(minibatch_size), training_data_size - minibatch_start_index); + Matrix<int,Dynamic,Dynamic> minibatch = training_data.middleCols(minibatch_start_index, current_minibatch_size); + + double adjusted_learning_rate = current_learning_rate/current_minibatch_size; + //cerr<<"Adjusted learning rate: "<<adjusted_learning_rate<<endl; + + /* + if (batch == rand() % num_batches) + { + cerr<<"we are checking the gradient in batch "<<batch<<endl; + /////////////////////////CHECKING GRADIENTS//////////////////////////////////////// + gradientChecking(myParam,minibatch_start_index,current_minibatch_size,word_nodes,context_nodes,hidden_layer_node,hidden_layer_to_output_node, + shuffled_training_data,c_h,unif_real_vector,eng_real_vector,unif_int_vector,eng_int_vector,unigram_probs_vector, + q_vector,J_vector,D_prime); + } + */ + + ///// Forward propagation + + prop.fProp(minibatch.topRows(ngram_size-1)); + + if (loss_function == NCELoss) + { + ///// Noise-contrastive estimation + + // Generate noise samples. Gather positive and negative samples into matrix. + + start_timer(3); + + minibatch_samples.block(0, 0, 1, current_minibatch_size) = minibatch.bottomRows(1); + + for (int sample_id = 1; sample_id < num_noise_samples+1; sample_id++) + for (int train_id = 0; train_id < current_minibatch_size; train_id++) + minibatch_samples(sample_id, train_id) = unigram.sample(rng); + + stop_timer(3); + + // Final forward propagation step (sparse) + start_timer(4); + prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, + minibatch_samples, scores); + stop_timer(4); + + // Apply normalization parameters + if (myParam.normalization) + { + for (int train_id = 0;train_id < current_minibatch_size;train_id++) + { + Matrix<int,Dynamic,1> context = minibatch.block(0, train_id, ngram_size-1, 1); + scores.col(train_id).array() += c_h[context]; + } + } + + double minibatch_log_likelihood; + start_timer(5); + softmax_loss.fProp(scores.leftCols(current_minibatch_size), + minibatch_samples, + probs, minibatch_log_likelihood); + stop_timer(5); + log_likelihood += minibatch_log_likelihood; + + ///// Backward propagation + + start_timer(6); + softmax_loss.bProp(probs, minibatch_weights); + stop_timer(6); + + // Update the normalization parameters + + if (myParam.normalization) + { + for (int train_id = 0;train_id < current_minibatch_size;train_id++) + { + Matrix<int,Dynamic,1> context = minibatch.block(0, train_id, ngram_size-1, 1); + c_h[context] += adjusted_learning_rate * minibatch_weights.col(train_id).sum(); + } + } + + // Be careful of short minibatch + prop.bProp(minibatch.topRows(ngram_size-1), + minibatch_samples.leftCols(current_minibatch_size), + minibatch_weights.leftCols(current_minibatch_size), + adjusted_learning_rate, current_momentum, myParam.L2_reg); + } + else if (loss_function == LogLoss) + { + ///// Standard log-likelihood + start_timer(4); + prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); + stop_timer(4); + + double minibatch_log_likelihood; + start_timer(5); + SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size), + minibatch.row(ngram_size-1), + probs, + minibatch_log_likelihood); + stop_timer(5); + log_likelihood += minibatch_log_likelihood; + + ///// Backward propagation + + start_timer(6); + SoftmaxLogLoss().bProp(minibatch.row(ngram_size-1).leftCols(current_minibatch_size), + probs.leftCols(current_minibatch_size), + minibatch_weights); + stop_timer(6); + + prop.bProp(minibatch.topRows(ngram_size-1).leftCols(current_minibatch_size), + minibatch_weights, + adjusted_learning_rate, current_momentum, myParam.L2_reg); + } + } + cerr << "done." << endl; + + if (loss_function == LogLoss) + { + cerr << "Training log-likelihood: " << log_likelihood << endl; + cerr << " perplexity: "<< exp(-log_likelihood/training_data_size) << endl; + } + else if (loss_function == NCELoss) + cerr << "Training NCE log-likelihood: " << log_likelihood << endl; + + current_momentum += momentum_delta; + + #ifdef USE_CHRONO + cerr << "Propagation times:"; + for (int i=0; i<timer.size(); i++) + cerr << " " << timer.get(i); + cerr << endl; + #endif + + if (myParam.model_prefix != "") + { + cerr << "Writing model" << endl; + if (myParam.input_words_file != "") + nn.write(myParam.model_prefix + "." + lexical_cast<string>(epoch+1), input_words, output_words); + else + nn.write(myParam.model_prefix + "." + lexical_cast<string>(epoch+1)); + } + + if (epoch % 1 == 0 && validation_data_size > 0) + { + //////COMPUTING VALIDATION SET PERPLEXITY/////////////////////// + //////////////////////////////////////////////////////////////// + + double log_likelihood = 0.0; + + Matrix<double,Dynamic,Dynamic> scores(output_vocab_size, validation_minibatch_size); + Matrix<double,Dynamic,Dynamic> output_probs(output_vocab_size, validation_minibatch_size); + Matrix<int,Dynamic,Dynamic> minibatch(ngram_size, validation_minibatch_size); + + for (int validation_batch =0;validation_batch < num_validation_batches;validation_batch++) + { + int validation_minibatch_start_index = validation_minibatch_size * validation_batch; + int current_minibatch_size = min(validation_minibatch_size, + validation_data_size - validation_minibatch_start_index); + minibatch.leftCols(current_minibatch_size) = validation_data.middleCols(validation_minibatch_start_index, + current_minibatch_size); + prop_validation.fProp(minibatch.topRows(ngram_size-1)); + + // Do full forward prop through output word embedding layer + start_timer(4); + prop_validation.output_layer_node.param->fProp(prop_validation.second_hidden_activation_node.fProp_matrix, scores); + stop_timer(4); + + // And softmax and loss. Be careful of short minibatch + double minibatch_log_likelihood; + start_timer(5); + SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size), + minibatch.row(ngram_size-1), + output_probs, + minibatch_log_likelihood); + stop_timer(5); + log_likelihood += minibatch_log_likelihood; + } + + cerr << "Validation log-likelihood: "<< log_likelihood << endl; + cerr << " perplexity: "<< exp(-log_likelihood/validation_data_size) << endl; + + // If the validation perplexity decreases, halve the learning rate. + if (epoch > 0 && log_likelihood < current_validation_ll) + { + current_learning_rate /= 2; + } + current_validation_ll = log_likelihood; + } + + } + return 0; +} diff --git a/src/util.cpp b/src/util.cpp new file mode 100644 index 0000000..fe022c9 --- /dev/null +++ b/src/util.cpp @@ -0,0 +1,213 @@ +#include <iostream> +#include <fstream> +#include <iomanip> +#include <cmath> + +#include <boost/unordered_map.hpp> +#include <boost/algorithm/string.hpp> + +#include "maybe_omp.h" +#ifdef EIGEN_USE_MKL_ALL +#include <mkl.h> +#endif + +#include "util.h" + +extern double drand48(); + +using namespace Eigen; +using namespace std; +using namespace boost::random; + +namespace nplm +{ + +void splitBySpace(const std::string &line, std::vector<std::string> &items) +{ + string copy(line); + boost::trim_if(copy, boost::is_any_of(" \t")); + if (copy == "") + { + items.clear(); + return; + } + boost::split(items, copy, boost::is_any_of(" \t"), boost::token_compress_on); +} + +void readWordsFile(ifstream &TRAININ, vector<string> &word_list) +{ + string line; + while (getline(TRAININ, line) && line != "") + { + vector<string> words; + splitBySpace(line, words); + if (words.size() != 1) + { + cerr << "Error: vocabulary file must have only one word per line" << endl; + exit(-1); + } + word_list.push_back(words[0]); + } +} + +void readWordsFile(const string &file, vector<string> &word_list) +{ + cerr << "Reading word list from: " << file<< endl; + + ifstream TRAININ; + TRAININ.open(file.c_str()); + if (! TRAININ) + { + cerr << "Error: can't read word list from file " << file<< endl; + exit(-1); + } + + readWordsFile(TRAININ, word_list); + TRAININ.close(); +} + +void writeWordsFile(const vector<string> &words, ofstream &file) +{ + for (int i=0; i<words.size(); i++) + { + file << words[i] << endl; + } +} + +void writeWordsFile(const vector<string> &words, const string &filename) +{ + ofstream OUT; + OUT.open(filename.c_str()); + if (! OUT) + { + cerr << "Error: can't write to file " << filename << endl; + exit(-1); + } + writeWordsFile(words, OUT); + OUT.close(); +} + +void readSentFile(const string &file, vector<vector<string> > &sentences) +{ + cerr << "Reading sentences from: " << file << endl; + + ifstream TRAININ; + TRAININ.open(file.c_str()); + if (! TRAININ) + { + cerr << "Error: can't read from file " << file<< endl; + exit(-1); + } + + string line; + while (getline(TRAININ, line)) + { + vector<string> words; + splitBySpace(line, words); + sentences.push_back(words); + } + + TRAININ.close(); +} + +// Read a data file of unknown size into a flat vector<int>. +// If this takes too much memory, we should create a vector of minibatches. +void readDataFile(const string &filename, int &ngram_size, vector<int> &data, int minibatch_size) +{ + cerr << "Reading minibatches from file " << filename << ": "; + + ifstream DATAIN(filename.c_str()); + if (!DATAIN) + { + cerr << "Error: can't read data from file " << filename<< endl; + exit(-1); + } + + vector<int> data_vector; + + string line; + long long int n_lines = 0; + while (getline(DATAIN, line)) + { + vector<string> ngram; + splitBySpace(line, ngram); + + if (ngram_size == 0) + ngram_size = ngram.size(); + + if (ngram.size() != ngram_size) + { + cerr << "Error: expected " << ngram_size << " fields in instance, found " << ngram.size() << endl; + exit(-1); + } + + for (int i=0;i<ngram_size;i++) + data.push_back(boost::lexical_cast<int>(ngram[i])); + + n_lines++; + if (minibatch_size && n_lines % (minibatch_size * 10000) == 0) + cerr << n_lines/minibatch_size << "..."; + } + cerr << "done." << endl; + DATAIN.close(); +} + +double logadd(double x, double y) +{ + if (x > y) + return x + log1p(std::exp(y-x)); + else + return y + log1p(std::exp(x-y)); +} + +#ifdef USE_CHRONO +void Timer::start(int i) +{ + m_start[i] = clock_type::now(); +} + +void Timer::stop(int i) +{ + m_total[i] += clock_type::now() - m_start[i]; +} + +void Timer::reset(int i) { m_total[i] = duration_type(); } + +double Timer::get(int i) const +{ + return boost::chrono::duration<double>(m_total[i]).count(); +} + +Timer timer(20); +#endif + +int setup_threads(int n_threads) +{ + #ifdef _OPENMP + if (n_threads) + omp_set_num_threads(n_threads); + n_threads = omp_get_max_threads(); + if (n_threads > 1) + cerr << "Using " << n_threads << " threads" << endl; + + Eigen::initParallel(); + Eigen::setNbThreads(n_threads); + + #ifdef __INTEL_MKL__ + /* + // Set the threading layer to match the compiler. + // This lets MKL automatically go single-threaded in parallel regions. + #ifdef __INTEL_COMPILER + mkl_set_threading_layer(MKL_THREADING_INTEL); + #elif defined __GNUC__ + mkl_set_threading_layer(MKL_THREADING_GNU); + #endif + */ + mkl_set_num_threads(n_threads); + #endif + #endif + + return n_threads; +} + +} // namespace nplm diff --git a/src/util.h b/src/util.h new file mode 100644 index 0000000..c774a72 --- /dev/null +++ b/src/util.h @@ -0,0 +1,219 @@ +#pragma once + +#include <iostream> +#include <fstream> +#include <sstream> +#include <vector> +#include <string> + +#include <boost/random/mersenne_twister.hpp> +#include <boost/random/uniform_real_distribution.hpp> +#include <boost/random/normal_distribution.hpp> +#include <boost/lexical_cast.hpp> +#include <boost/functional/hash.hpp> +#ifdef USE_CHRONO +#include <boost/chrono.hpp> +#endif + +#include <Eigen/Dense> + +#include "maybe_omp.h" + +// Make matrices hashable + +namespace Eigen { + template <typename Derived> + size_t hash_value(const DenseBase<Derived> &m) + { + size_t h=0; + for (int i=0; i<m.rows(); i++) + for (int j=0; j<m.cols(); j++) + boost::hash_combine(h, m(i,j)); + return h; + } +} + +namespace nplm +{ + +void splitBySpace(const std::string &line, std::vector<std::string> &items); +void readWordsFile(std::ifstream &TRAININ, std::vector<std::string> &word_list); +void readWordsFile(const std::string &file, std::vector<std::string> &word_list); +void writeWordsFile(const std::vector<std::string> &words, std::ofstream &file); +void writeWordsFile(const std::vector<std::string> &words, const std::string &filename); +void readDataFile(const std::string &filename, int &ngram_size, std::vector<int> &data, int minibatch_size=0); +void readUnigramProbs(const std::string &unigram_probs_file, std::vector<double> &unigram_probs); +void readSentFile(const std::string &file, std::vector<std::vector<std::string> > &sentences); + +// Functions that take non-const matrices as arguments +// are supposed to declare them const and then use this +// to cast away constness. +#define UNCONST(t,c,uc) Eigen::MatrixBase<t> &uc = const_cast<Eigen::MatrixBase<t>&>(c); + +template <typename Derived> +void initMatrix(boost::random::mt19937 &engine, + const Eigen::MatrixBase<Derived> &p_const, + bool init_normal, double range) +{ + UNCONST(Derived, p_const, p); + if (init_normal == 0) + // initialize with uniform distribution in [-range, range] + { + boost::random::uniform_real_distribution<> unif_real(-range, range); + for (int i = 0; i < p.rows(); i++) + { + for (int j = 0; j< p.cols(); j++) + { + p(i,j) = unif_real(engine); + } + } + + } + else + // initialize with gaussian distribution with mean 0 and stdev range + { + boost::random::normal_distribution<double> unif_normal(0., range); + for (int i = 0; i < p.rows(); i++) + { + for (int j = 0; j < p.cols(); j++) + { + p(i,j) = unif_normal(engine); + } + } + } +} + +template <typename Derived> +void readMatrix(std::ifstream &TRAININ, Eigen::MatrixBase<Derived> ¶m_const) +{ + UNCONST(Derived, param_const, param); + + int i = 0; + std::string line; + std::vector<std::string> fields; + + while (std::getline(TRAININ, line) && line != "") + { + splitBySpace(line, fields); + if (fields.size() != param.cols()) + { + std::ostringstream err; + err << "error: wrong number of columns (expected " << param.cols() << ", found " << fields.size() << ")"; + throw std::runtime_error(err.str()); + } + + if (i >= param.rows()) + { + std::ostringstream err; + err << "error: wrong number of rows (expected " << param.rows() << ", found " << i << ")"; + throw std::runtime_error(err.str()); + } + + for (int j=0; j<fields.size(); j++) + { + param(i,j) = boost::lexical_cast<typename Derived::Scalar>(fields[j]); + } + i++; + } + + if (i != param.rows()) + { + std::ostringstream err; + err << "error: wrong number of rows (expected " << param.rows() << ", found more)"; + throw std::runtime_error(err.str()); + } +} + +template <typename Derived> +void readMatrix(const std::string ¶m_file, const Eigen::MatrixBase<Derived> ¶m_const) +{ + UNCONST(Derived, param_const, param); + std::cerr << "Reading data from file: " << param_file << std::endl; + + std::ifstream TRAININ(param_file.c_str()); + if (!TRAININ) + { + std::cerr << "Error: can't read training data from file " << param_file << std::endl; + exit(-1); + } + readMatrix(TRAININ, param); + TRAININ.close(); +} + +template <typename Derived> +void writeMatrix(const Eigen::MatrixBase<Derived> ¶m, const std::string &filename) +{ + std::cerr << "Writing parameters to " << filename << std::endl; + + std::ofstream OUT; + OUT.precision(16); + OUT.open(filename.c_str()); + if (! OUT) + { + std::cerr << "Error: can't write to file " << filename<< std::endl; + exit(-1); + } + writeMatrix(param, OUT); + OUT.close(); +} + +template <typename Derived> +void writeMatrix(const Eigen::MatrixBase<Derived> ¶m, std::ofstream &OUT) +{ + for (int row = 0;row < param.rows();row++) + { + int col; + for (col = 0;col < param.cols()-1;col++) + { + OUT<<param(row,col)<<"\t"; + } + //dont want an extra tab at the end + OUT<<param(row,col)<<std::endl; + } +} + +template <typename Derived> +double logsum(const Eigen::MatrixBase<Derived> &v) +{ + int mi; + double m = v.maxCoeff(&mi); + double logz = 0.0; + for (int i=0; i<v.rows(); i++) + if (i != mi) + logz += std::exp(v(i) - m); + logz = log1p(logz) + m; + return logz; +} + +double logadd(double x, double y); + +#ifdef USE_CHRONO +class Timer +{ + typedef boost::chrono::high_resolution_clock clock_type; + typedef clock_type::time_point time_type; + typedef clock_type::duration duration_type; + std::vector<time_type> m_start; + std::vector<duration_type> m_total; +public: + Timer() { } + Timer(int n) { resize(n); } + void resize(int n) { m_start.resize(n); m_total.resize(n); } + int size() const { return m_start.size(); } + void start(int i); + void stop(int i); + void reset(int i); + double get(int i) const; +}; + +extern Timer timer; +#define start_timer(x) timer.start(x) +#define stop_timer(x) timer.stop(x) +#else +#define start_timer(x) 0 +#define stop_timer(x) 0 +#endif + +int setup_threads(int n_threads); + +} // namespace nplm diff --git a/src/vocabulary.h b/src/vocabulary.h new file mode 100644 index 0000000..fee76f6 --- /dev/null +++ b/src/vocabulary.h @@ -0,0 +1,84 @@ +#ifndef VOCABULARY_H +#define VOCABULARY_H + +#include <vector> +#include <string> +#include <queue> +#include <boost/unordered_map.hpp> + +namespace nplm +{ + +template <typename T> +struct compare_second +{ + bool operator()(const T &lhs, const T &rhs) const { return lhs.second < rhs.second; } +}; + +class vocabulary { + std::vector<std::string> m_words; + boost::unordered_map<std::string, int> m_index; + int unk; +public: + vocabulary() + { + unk = insert_word("<unk>"); + } + + vocabulary(const std::vector<std::string> &words) + : + m_words(words) + { + for (int i=0; i<words.size(); i++) + m_index[words[i]] = i; + unk = m_index["<unk>"]; + } + + int lookup_word(const std::string &word) const + { + boost::unordered_map<std::string, int>::const_iterator pos = m_index.find(word); + if (pos != m_index.end()) + return pos->second; + else + return unk; + } + + int insert_word(const std::string &word) + { + int i = size(); + bool inserted = m_index.insert(make_pair(word, i)).second; + if (inserted) + { + m_words.push_back(word); + } + return i; + } + + int size() const { return m_words.size(); } + + // Inserts the most-frequent words from counts until vocab_size words are reached. + // counts is a collection of pair<string,int> + template <typename Map> + int insert_most_frequent(const Map &counts, int vocab_size) + { + typedef std::pair<std::string,int> stringint; + + std::priority_queue<stringint,std::vector<stringint>,compare_second<stringint> > + q(compare_second<stringint>(), std::vector<stringint>(counts.begin(), counts.end())); + + int inserted = 0; + while (size() < vocab_size && !q.empty()) + { + insert_word(q.top().first); + q.pop(); + inserted++; + } + return inserted; + } + + const std::vector<std::string> &words() const { return m_words; } +}; + +} // namespace nplm + +#endif |