diff options
author | Rico Sennrich <rico.sennrich@gmx.ch> | 2014-11-17 13:39:49 +0300 |
---|---|---|
committer | Rico Sennrich <rico.sennrich@gmx.ch> | 2014-11-17 13:51:09 +0300 |
commit | 7eb6ea415c1a10d27d36182bf00c01d05e137325 (patch) | |
tree | 4d927fdb05a4f106ad6d921b7227062208610e2f /src | |
parent | ba48d701c70e03fe1c1e96ecf5e06591ad4d3e27 (diff) |
upstream 0.3 (reverting all changes from this fork)
Diffstat (limited to 'src')
-rw-r--r-- | src/Activation_function.h | 9 | ||||
-rw-r--r-- | src/Makefile | 92 | ||||
-rw-r--r-- | src/SoftmaxLoss.h | 4 | ||||
-rw-r--r-- | src/USCMatrix.h | 2 | ||||
-rw-r--r-- | src/clipper.h | 16 | ||||
-rw-r--r-- | src/graphClasses.h | 2 | ||||
-rw-r--r-- | src/model.cpp | 102 | ||||
-rw-r--r-- | src/model.h | 9 | ||||
-rw-r--r-- | src/multinomial.h | 5 | ||||
-rw-r--r-- | src/neuralClasses.h | 766 | ||||
-rw-r--r-- | src/neuralLM.h | 278 | ||||
-rw-r--r-- | src/neuralNetwork.h | 188 | ||||
-rw-r--r-- | src/neuralTM.cpp | 1 | ||||
-rw-r--r-- | src/neuralTM.h | 133 | ||||
-rw-r--r-- | src/param.h | 8 | ||||
-rw-r--r-- | src/prepareNeuralLM.cpp | 491 | ||||
-rw-r--r-- | src/prepareNeuralTM.cpp | 243 | ||||
-rw-r--r-- | src/propagator.h | 303 | ||||
-rw-r--r-- | src/python/nplm.pyx | 2 | ||||
-rw-r--r-- | src/python/nptm.pxd | 25 | ||||
-rw-r--r-- | src/python/nptm.pyx | 46 | ||||
-rw-r--r-- | src/testNeuralLM.cpp | 110 | ||||
-rw-r--r-- | src/testNeuralNetwork.cpp | 7 | ||||
-rw-r--r-- | src/trainNeuralNetwork.cpp | 508 | ||||
-rw-r--r-- | src/types.hpp | 41 | ||||
-rw-r--r-- | src/util.cpp | 43 | ||||
-rw-r--r-- | src/util.h | 64 | ||||
-rw-r--r-- | src/vocabulary.h | 15 |
28 files changed, 2500 insertions, 1013 deletions
diff --git a/src/Activation_function.h b/src/Activation_function.h index 0264cd1..dcd8651 100644 --- a/src/Activation_function.h +++ b/src/Activation_function.h @@ -3,7 +3,7 @@ #include <cmath> #include <string> -#include "../3rdparty/Eigen/Dense" +#include <Eigen/Dense> #include "util.h" @@ -68,7 +68,6 @@ struct drectifier_functor { class Activation_function { - private: int size; activation_function_type f; @@ -99,8 +98,10 @@ class Activation_function } template <typename DerivedGOut, typename DerivedGIn, typename DerivedIn, typename DerivedOut> - void bProp(const MatrixBase<DerivedGOut> &input, MatrixBase<DerivedGIn> &output, - const MatrixBase<DerivedIn> &finput, const MatrixBase<DerivedOut> &foutput) const + void bProp(const MatrixBase<DerivedGOut> &input, + MatrixBase<DerivedGIn> &output, + const MatrixBase<DerivedIn> &finput, + const MatrixBase<DerivedOut> &foutput) const { UNCONST(DerivedGIn, output, my_output); diff --git a/src/Makefile b/src/Makefile index 1da279c..9e8f1b7 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,7 +1,8 @@ ### Compilation options. # C++ compiler. Tested with g++ and Intel icpc. -CXX=g++ +#CXX=/usr/bin/g++ +CXX=/opt/local/bin/g++-mp-4.7 #CXX=icpc # Compiler options. Note that -DEIGEN_NO_DEBUG is essential for good performance! @@ -19,22 +20,26 @@ OS:=$(shell uname -s) ### Required libraries. You must install these prior to building. # Set this to the root directory of Boost (should have a subdirectory named boost): -BOOST=/home/hieu/workspace/boost/boost_1_55_0.gcc + +#BOOST=/usr/usc/boost/1.51.0 #BOOST=/usr -#BOOST=/opt/local +BOOST=/opt/local # Where to find Boost header files BOOST_INC=$(BOOST)/include # Set this to the root directory of Eigen (should have a subdirectory named Eigen): -EIGEN=/home/hieu/workspace/eigen-3 +EIGEN=../3rdparty ### Optional libraries. 
# To disable multithreading, comment out the line below: -#OMP=1 +OMP=1 # To use the MKL library, uncomment the line below and set it to the MKL root: #MKL=/usr/usc/intel/12.1.1/mkl +# Set to 1 if you want to use the Single Dynamic Library; comment out otherwise. +# This is required for building the Python extensions, but doesn't work with building a static binary. +MKL_SINGLE=1 # For Python bindings, set the following and run 'make python/nplm.so'. PYTHON_VERSION=2.7 @@ -53,11 +58,13 @@ TCLAP=../3rdparty/tclap/include # Where to find Boost libraries BOOST_LIB=$(BOOST)/lib # On some systems, a suffix is appended for the multithreaded version. -BOOST_LIB_SUFFIX= -#BOOST_LIB_SUFFIX=-mt +#BOOST_LIB_SUFFIX= +BOOST_LIB_SUFFIX=-mt BOOST_CFLAGS=-I$(BOOST_INC) BOOST_LDFLAGS= +BOOST_LDLIBS=-lboost_iostreams$(BOOST_LIB_SUFFIX) -lboost_system$(BOOST_LIB_SUFFIX) -lboost_filesystem$(BOOST_LIB_SUFFIX) +#BOOST_LDLIBS=-lboost_system$(BOOST_LIB_SUFFIX) -lboost_thread$(BOOST_LIB_SUFFIX) ifdef USE_CHRONO BOOST_CFLAGS+=-DUSE_CHRONO BOOST_LDLIBS+=-lboost_system$(BOOST_LIB_SUFFIX) -lboost_chrono$(BOOST_LIB_SUFFIX) @@ -78,28 +85,41 @@ ifdef OMP endif ifdef MKL - MKL_CFLAGS=-I$(MKL)/include -DEIGEN_USE_MKL_ALL - MKL_LDLIBS=-Wl,--start-group - ifeq ($(ARCH),x86_64) - MKL_LDFLAGS=-L$(MKL)/lib/intel64 -Wl,-rpath -Wl,$(MKL)/lib/intel64 - MKL_LDLIBS+=-lmkl_intel_lp64 - endif - ifeq ($(ARCH),i686) - MKL_LDFLAGS=-L$(MKL)/lib/ia32 -Wl,-rpath -Wl,$(MKL)/lib/ia32 - MKL_LDLIBS+=-lmkl_intel - endif - - ifneq (,$(findstring g++,$(CXX))) - MKL_LDLIBS+=-lmkl_gnu_thread - endif - ifneq (,$(findstring icpc,$(CXX))) - MKL_LDLIBS+=-lmkl_intel_thread - endif - - #MKL_LDLIBS=-lmkl_rt - MKL_LDLIBS+=-lmkl_core -Wl,--end-group + ifdef MKL_SINGLE + ifeq ($(ARCH),x86_64) + MKL_LDFLAGS=-L$(MKL)/lib/intel64 -Wl,-rpath -Wl,$(MKL)/lib/intel64 + endif + ifeq ($(ARCH),i686) + MKL_LDFLAGS=-L$(MKL)/lib/ia32 -Wl,-rpath -Wl,$(MKL)/lib/ia32 + endif + MKL_CFLAGS=-I$(MKL)/include -DEIGEN_USE_MKL_ALL -DMKL_SINGLE + 
MKL_LDLIBS=-lmkl_rt + + else + + MKL_CFLAGS=-I$(MKL)/include -DEIGEN_USE_MKL_ALL + MKL_LDLIBS=-Wl,--start-group + ifeq ($(ARCH),x86_64) + MKL_LDFLAGS=-L$(MKL)/lib/intel64 -Wl,-rpath -Wl,$(MKL)/lib/intel64 + MKL_LDLIBS+=-lmkl_intel_lp64 + endif + ifeq ($(ARCH),i686) + MKL_LDFLAGS=-L$(MKL)/lib/ia32 -Wl,-rpath -Wl,$(MKL)/lib/ia32 + MKL_LDLIBS+=-lmkl_intel + endif + + ifneq (,$(findstring g++,$(CXX))) + MKL_LDLIBS+=-lmkl_gnu_thread + endif + ifneq (,$(findstring icpc,$(CXX))) + MKL_LDLIBS+=-lmkl_intel_thread + endif + + MKL_LDLIBS+=-lmkl_core -Wl,--end-group +endif endif + ifdef STATIC LDFLAGS+=-static endif @@ -122,13 +142,13 @@ RANLIB=ranlib # Rules BINS=trainNeuralNetwork testNeuralNetwork prepareNeuralLM testNeuralLM prepareNeuralTM -LIBS=libneuralLM.a libneuralLM.so +LIBS=libnplm.a libnplm.so OBJS=util.o model.o all: $(BINS) $(LIBS) clean: - rm -f *.o shared/*.o python/*.o $(BINS) $(LIBS) python/nplm.{cpp,so} + rm -f *.o shared/*.o python/*.o $(BINS) $(LIBS) python/nplm.{cpp,so} python/nptm.{cpp,so} install: all mkdir -p ../bin @@ -157,19 +177,25 @@ testNeuralLM: testNeuralLM.o $(OBJS) prepareNeuralTM: prepareNeuralTM.o $(OBJS) $(CXX) $(ALL_LDFLAGS) $^ $(ALL_LDLIBS) -o $@ -libneuralLM.a: neuralLM.o $(OBJS) +libnplm.a: neuralLM.o $(OBJS) rm -f $@ $(AR) rv $@ $^ $(RANLIB) $@ -libneuralLM.so: $(addprefix shared/,neuralLM.o $(OBJS)) +libnplm.so: $(addprefix shared/,neuralLM.o $(OBJS)) $(CXX) -shared $(ALL_LDFLAGS) $^ $(ALL_LDLIBS) -o $@ -python/nplm.cpp: python/nplm.pyx - $(CYTHON) --cplus $^ +%.cpp: %.pyx + $(CYTHON) --cplus $^ -o $@ python/nplm.o: python/nplm.cpp $(CXX) -c -fPIC -I. $(ALL_CFLAGS) $(PYTHON_CFLAGS) $< -o $@ python/nplm.so: python/nplm.o $(addprefix shared/,neuralLM.o $(OBJS)) $(CXX) -shared $(ALL_LDFLAGS) $(PYTHON_LDFLAGS) $^ $(ALL_LDLIBS) $(PYTHON_LDLIBS) -o $@ + +python/nptm.o: python/nptm.cpp + $(CXX) -c -fPIC -I. 
$(ALL_CFLAGS) $(PYTHON_CFLAGS) $< -o $@ + +python/nptm.so: python/nptm.o $(addprefix shared/,neuralTM.o $(OBJS)) + $(CXX) -shared $(ALL_LDFLAGS) $(PYTHON_LDFLAGS) $^ $(ALL_LDLIBS) $(PYTHON_LDLIBS) -o $@ diff --git a/src/SoftmaxLoss.h b/src/SoftmaxLoss.h index de5e043..24f59f5 100644 --- a/src/SoftmaxLoss.h +++ b/src/SoftmaxLoss.h @@ -1,7 +1,7 @@ -#ifndef SOFTMAXLOSS_H + #ifndef SOFTMAXLOSS_H #define SOFTMAXLOSS_H -#include "../3rdparty/Eigen/Dense" +#include <Eigen/Dense> #include "multinomial.h" #include "util.h" diff --git a/src/USCMatrix.h b/src/USCMatrix.h index 092bc4e..caa9553 100644 --- a/src/USCMatrix.h +++ b/src/USCMatrix.h @@ -1,7 +1,7 @@ #ifndef USCMATRIX_H #define USCMATRIX_H -#include "../3rdparty/Eigen/Dense" +#include <Eigen/Dense> #include "maybe_omp.h" #include "util.h" diff --git a/src/clipper.h b/src/clipper.h new file mode 100644 index 0000000..dda5c4d --- /dev/null +++ b/src/clipper.h @@ -0,0 +1,16 @@ +#ifndef CLIPPER_H +#define CLIPPER_H + +namespace nplm { + struct Clipper{ + double operator() (double x) const { + return std::min(0.5, std::max(x,-0.5)); + //return(x); + } +}; + +} + +#endif + + diff --git a/src/graphClasses.h b/src/graphClasses.h index 9f9e27c..da5f1af 100644 --- a/src/graphClasses.h +++ b/src/graphClasses.h @@ -3,7 +3,7 @@ #include <cstdlib> #include "neuralClasses.h" -#include "../3rdparty/Eigen/Dense" +#include <Eigen/Dense> namespace nplm { diff --git a/src/model.cpp b/src/model.cpp index 589a52e..262490f 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -12,26 +12,18 @@ using namespace boost::random; namespace nplm { - void model::resize(int ngram_size, - int input_vocab_size, - int output_vocab_size, - int input_embedding_dimension, - int num_hidden, - int output_embedding_dimension) +void model::resize(int ngram_size, + int input_vocab_size, + int output_vocab_size, + int input_embedding_dimension, + int num_hidden, + int output_embedding_dimension) { input_layer.resize(input_vocab_size, input_embedding_dimension, 
ngram_size-1); - if (num_hidden == 0) { - first_hidden_linear.resize(output_embedding_dimension, input_embedding_dimension*(ngram_size-1)); - first_hidden_activation.resize(output_embedding_dimension); - second_hidden_linear.resize(1,1); - second_hidden_activation.resize(1); - } - else { - first_hidden_linear.resize(num_hidden, input_embedding_dimension*(ngram_size-1)); - first_hidden_activation.resize(num_hidden); - second_hidden_linear.resize(output_embedding_dimension, num_hidden); - second_hidden_activation.resize(output_embedding_dimension); - } + first_hidden_linear.resize(num_hidden, input_embedding_dimension*(ngram_size-1)); + first_hidden_activation.resize(num_hidden); + second_hidden_linear.resize(output_embedding_dimension, num_hidden); + second_hidden_activation.resize(output_embedding_dimension); output_layer.resize(output_vocab_size, output_embedding_dimension); this->ngram_size = ngram_size; this->input_vocab_size = input_vocab_size; @@ -42,12 +34,34 @@ namespace nplm premultiplied = false; } -void model::initialize(mt19937 &init_engine, bool init_normal, double init_range, double init_bias) +void model::initialize(mt19937 &init_engine, + bool init_normal, + double init_range, + double init_bias, + string ¶meter_update, + double adagrad_epsilon) { - input_layer.initialize(init_engine, init_normal, init_range); - output_layer.initialize(init_engine, init_normal, init_range, init_bias); - first_hidden_linear.initialize(init_engine, init_normal, init_range); - second_hidden_linear.initialize(init_engine, init_normal, init_range); + input_layer.initialize(init_engine, + init_normal, + init_range, + parameter_update, + adagrad_epsilon); + output_layer.initialize(init_engine, + init_normal, + init_range, + init_bias, + parameter_update, + adagrad_epsilon); + first_hidden_linear.initialize(init_engine, + init_normal, + init_range, + parameter_update, + adagrad_epsilon); + second_hidden_linear.initialize(init_engine, + init_normal, + init_range, + 
parameter_update, + adagrad_epsilon); } void model::premultiply() @@ -56,12 +70,7 @@ void model::premultiply() // we can multiply them into a single linear layer *if* we are not training int context_size = ngram_size-1; Matrix<double,Dynamic,Dynamic> U = first_hidden_linear.U; - if (num_hidden == 0) { - first_hidden_linear.U.resize(output_embedding_dimension, input_vocab_size * context_size); - } - else { - first_hidden_linear.U.resize(num_hidden, input_vocab_size * context_size); - } + first_hidden_linear.U.resize(num_hidden, input_vocab_size * context_size); for (int i=0; i<context_size; i++) first_hidden_linear.U.middleCols(i*input_vocab_size, input_vocab_size) = U.middleCols(i*input_embedding_dimension, input_embedding_dimension) * input_layer.W->transpose(); input_layer.W->resize(1,1); // try to save some memory @@ -133,6 +142,12 @@ void model::read(const string &filename) read(filename, input_words, output_words); } +void model::read(const string &filename, vector<string> &words) +{ + vector<string> output_words; + read(filename, words, output_words); +} + void model::read(const string &filename, vector<string> &input_words, vector<string> &output_words) { ifstream file(filename.c_str()); @@ -170,9 +185,13 @@ void model::read(const string &filename, vector<string> &input_words, vector<str else if (line == "\\input_embeddings") input_layer.read(file); else if (line == "\\hidden_weights 1") - first_hidden_linear.read(file); + first_hidden_linear.read_weights(file); + else if (line == "\\hidden_biases 1") + first_hidden_linear.read_biases (file); else if (line == "\\hidden_weights 2") - second_hidden_linear.read(file); + second_hidden_linear.read_weights(file); + else if (line == "\\hidden_biases 2") + second_hidden_linear.read_biases (file); else if (line == "\\output_weights") output_layer.read_weights(file); else if (line == "\\output_biases") @@ -191,17 +210,22 @@ void model::read(const string &filename, vector<string> &input_words, vector<str file.close(); 
} - void model::write(const string &filename, const vector<string> &input_words, const vector<string> &output_words) +void model::write(const string &filename, const vector<string> &input_words, const vector<string> &output_words) { write(filename, &input_words, &output_words); } +void model::write(const string &filename, const vector<string> &words) +{ + write(filename, &words, NULL); +} + void model::write(const string &filename) { write(filename, NULL, NULL); } - void model::write(const string &filename, const vector<string> *input_pwords, const vector<string> *output_pwords) +void model::write(const string &filename, const vector<string> *input_pwords, const vector<string> *output_pwords) { ofstream file(filename.c_str()); if (!file) throw runtime_error("Could not open file " + filename); @@ -236,11 +260,19 @@ void model::write(const string &filename) file << endl; file << "\\hidden_weights 1" << endl; - first_hidden_linear.write(file); + first_hidden_linear.write_weights(file); file << endl; + + file << "\\hidden_biases 1" << endl; + first_hidden_linear.write_biases(file); + file <<endl; file << "\\hidden_weights 2" << endl; - second_hidden_linear.write(file); + second_hidden_linear.write_weights(file); + file << endl; + + file << "\\hidden_biases 2" << endl; + second_hidden_linear.write_biases(file); file << endl; file << "\\output_weights" << endl; diff --git a/src/model.h b/src/model.h index 271b22f..3cce06a 100644 --- a/src/model.h +++ b/src/model.h @@ -74,7 +74,10 @@ public: void initialize(boost::random::mt19937 &init_engine, bool init_normal, double init_range, - double init_bias); + double init_bias, + string ¶meter_udpate, + double adagrad_epsilon); + void set_activation_function(activation_function_type f) { activation_function = f; @@ -90,9 +93,11 @@ public: // a better solution is needed void read(const std::string &filename); + void read(const std::string &filename, std::vector<std::string> &words); void read(const std::string &filename, 
std::vector<std::string> &input_words, std::vector<std::string> &output_words); - void write(const std::string &filename, const std::vector<std::string> &input_words, const std::vector<std::string> &output_words); void write(const std::string &filename); + void write(const std::string &filename, const std::vector<std::string> &words); + void write(const std::string &filename, const std::vector<std::string> &input_words, const std::vector<std::string> &output_words); private: void readConfig(std::ifstream &config_file); diff --git a/src/multinomial.h b/src/multinomial.h index 1314fcb..8fccdf4 100644 --- a/src/multinomial.h +++ b/src/multinomial.h @@ -52,9 +52,9 @@ public: double p = unif_real(eng); int s; if (q[m] > p) - s = m; + s = m; else - s = J[m]; + s = J[m]; assert (s >= 0); return s; } @@ -125,6 +125,7 @@ private: { std::cerr << "warning: multinomial: probability differs from one by " << std::fabs(q[*l_it]-1) << std::endl; } + q[*l_it] = 1.0; } } diff --git a/src/neuralClasses.h b/src/neuralClasses.h index 1b57763..949e445 100644 --- a/src/neuralClasses.h +++ b/src/neuralClasses.h @@ -7,7 +7,7 @@ #include <vector> #include <boost/unordered_map.hpp> -#include "../3rdparty/Eigen/Dense" +#include <Eigen/Dense> #include "maybe_omp.h" #include "util.h" @@ -21,16 +21,26 @@ //#define EIGEN_DONT_PARALLELIZE //#define EIGEN_DEFAULT_TO_ROW_MAJOR +using namespace std; namespace nplm { // is this cheating? 
using Eigen::Matrix; +using Eigen::Array; using Eigen::MatrixBase; using Eigen::Dynamic; typedef boost::unordered_map<int,bool> int_map; +struct Clipper{ + double operator() (double x) const { + return std::min(0.5, std::max(x,-0.5)); + //return(x); + } +}; + + class Linear_layer { private: @@ -38,6 +48,13 @@ class Linear_layer Matrix<double,Dynamic,Dynamic> U_gradient; Matrix<double,Dynamic,Dynamic> U_velocity; Matrix<double,Dynamic,Dynamic> U_running_gradient; + Matrix<double,Dynamic,Dynamic> U_running_parameter_update; + // Biases + Matrix<double,Dynamic,1> b; + Matrix<double,Dynamic,1> b_velocity; + Matrix<double,Dynamic,1> b_running_gradient; + Matrix<double,Dynamic,1> b_running_parameter_update; + Matrix<double,Dynamic,1> b_gradient; friend class model; @@ -49,94 +66,222 @@ class Linear_layer { U.setZero(rows, cols); U_gradient.setZero(rows, cols); - U_running_gradient.setZero(rows, cols); - U_velocity.setZero(rows, cols); + //U_running_gradient.setZero(rows, cols); + //U_running_parameter_updates.setZero(rows, cols); + //U_velocity.setZero(rows, cols); + b.resize(rows); + b_gradient.setZero(rows); + //b_running_gradient.resize(rows); + //b_velocity.resize(rows); } - void read(std::ifstream &U_file) { readMatrix(U_file, U); } - void write(std::ofstream &U_file) { writeMatrix(U, U_file); } + void read_weights(std::ifstream &U_file) { readMatrix(U_file, U); } + void write_weights(std::ofstream &U_file) { writeMatrix(U, U_file); } + void read_biases(std::ifstream &b_file) { readMatrix(b_file, b); } + void write_biases(std::ofstream &b_file) { writeMatrix(b, b_file); } + template <typename Engine> - void initialize(Engine &engine, bool init_normal, double init_range) + void initialize(Engine &engine, + bool init_normal, + double init_range, + string ¶meter_update, + double adagrad_epsilon) { + if (parameter_update == "ADA") { + U_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(U.rows(),U.cols())*adagrad_epsilon; + b_running_gradient = 
Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon; + } + if (parameter_update == "ADAD") { + U_running_gradient.setZero(U.rows(),U.cols()); + b_running_gradient.setZero(b.size()); + U_running_parameter_update.setZero(U.rows(),U.cols()); + b_running_parameter_update.setZero(b.size()); + } + initMatrix(engine, U, init_normal, init_range); + initBias(engine, b, init_normal, init_range); } int n_inputs () const { return U.cols(); } int n_outputs () const { return U.rows(); } - template <typename DerivedIn, typename DerivedOut> - void fProp(const MatrixBase<DerivedIn> &input, const MatrixBase<DerivedOut> &output) const - { - UNCONST(DerivedOut, output, my_output); - my_output.leftCols(input.cols()).noalias() = U*input; - } + template <typename DerivedIn, typename DerivedOut> + void fProp(const MatrixBase<DerivedIn> &input, + const MatrixBase<DerivedOut> &output) const + { + UNCONST(DerivedOut, output, my_output); + my_output.leftCols(input.cols()).noalias() = U*input; + int num_examples = input.cols(); + for (int example = 0;example < num_examples;example++) + { + my_output.leftCols(input.cols()).col(example) += b; + } + } // Sparse input template <typename ScalarIn, typename DerivedOut> - void fProp(const USCMatrix<ScalarIn> &input, const MatrixBase<DerivedOut> &output_const) const + void fProp(const USCMatrix<ScalarIn> &input, + const MatrixBase<DerivedOut> &output_const) const { UNCONST(DerivedOut, output_const, output); output.setZero(); uscgemm(1.0, U, input, output.leftCols(input.cols())); + // Each column corresponds to a training example. We + // parallelize the adding of biases per dimension. 
+ int num_examples = input.cols(); + for (int example = 0;example < num_examples;example++) + { + output.leftCols(input.cols()).col(example) += b; + } } - template <typename DerivedGOut, typename DerivedGIn> - void bProp(const MatrixBase<DerivedGOut> &input, MatrixBase<DerivedGIn> &output) const - { + template <typename DerivedGOut, typename DerivedGIn> + void bProp(const MatrixBase<DerivedGOut> &input, + MatrixBase<DerivedGIn> &output) const + { UNCONST(DerivedGIn, output, my_output); my_output.noalias() = U.transpose()*input; } - template <typename DerivedGOut, typename DerivedIn> - void computeGradient(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &fProp_input, - double learning_rate, double momentum, double L2_reg) + template <typename DerivedGOut, typename DerivedIn> + void computeGradient( const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &fProp_input, + double learning_rate, double momentum, double L2_reg) + { + U_gradient.noalias() = bProp_input*fProp_input.transpose(); + + // get the bias gradient for all dimensions in parallel + int size = b.size(); + b_gradient = bProp_input.rowwise().sum(); + // This used to be multithreaded, but there was no measureable difference + if (L2_reg > 0.0) { - U_gradient.noalias() = bProp_input*fProp_input.transpose(); - - // This used to be multithreaded, but there was no measureable difference - if (L2_reg > 0.0) - { - U_gradient *= 1 - 2*L2_reg; - } - if (momentum > 0.0) - { - U_velocity = momentum*U_velocity + U_gradient; - U += learning_rate * U_velocity; - } - else - { - U += learning_rate * U_gradient; - } + U_gradient -= 2*L2_reg*U; + b_gradient -= 2*L2_reg*b; + } + if (momentum > 0.0) + { + U_velocity = momentum*U_velocity + U_gradient; + U += learning_rate * U_velocity; + b_velocity = momentum*b_velocity + b_gradient; + b += learning_rate * b_velocity; + } + else + { + U += learning_rate * U_gradient; + b += learning_rate * b_gradient; + /* + //UPDATE CLIPPING + U 
+= (learning_rate*U_gradient).array().unaryExpr(Clipper()).matrix(); + b += (learning_rate*b_gradient).array().unaryExpr(Clipper()).matrix(); + //GRADIENT CLIPPING + //U += learning_rate*(U_gradient.array().unaryExpr(Clipper())).matrix(); + //b += learning_rate*(b_gradient.array().unaryExpr(Clipper())).matrix(); + */ + } } - template <typename DerivedGOut, typename DerivedIn> - void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &fProp_input, - double learning_rate, double momentum, double L2_reg) - { - U_gradient.noalias() = bProp_input*fProp_input.transpose(); + template <typename DerivedGOut, typename DerivedIn> + void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &fProp_input, + double learning_rate, + double L2_reg) + { + U_gradient.noalias() = bProp_input*fProp_input.transpose(); - if (L2_reg != 0) - { - U_gradient *= 1 - 2*L2_reg; - } + + // get the bias gradient for all dimensions in parallel + int size = b.size(); + b_gradient.noalias() = bProp_input.rowwise().sum(); - // ignore momentum? + if (L2_reg != 0) + { + U_gradient -= 2*L2_reg*U; + b_gradient -= 2*L2_reg*b; + } + + // ignore momentum? + #pragma omp parallel for + for (int col=0; col<U.cols(); col++) { + U_running_gradient.col(col) += U_gradient.col(col).array().square().matrix(); + U.col(col) += learning_rate * (U_gradient.col(col).array() / + U_running_gradient.col(col).array().sqrt()).matrix(); + /* + //UPDATE CLIPPING + U.col(col) += (learning_rate * (U_gradient.col(col).array() / U_running_gradient.col(col).array().sqrt())). 
+ unaryExpr(Clipper()).matrix(); + */ + } + b_running_gradient += b_gradient.array().square().matrix(); + b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix(); + /* + //UPDATE CLIPPING + b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); + */ + } - U_running_gradient.array() += U_gradient.array().square(); - U.array() += learning_rate * U_gradient.array() / U_running_gradient.array().sqrt(); - } + template <typename DerivedGOut, typename DerivedIn> + void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &fProp_input, + double learning_rate, + double L2_reg, + double conditioning_constant, + double decay) + { + //cerr<<"decay is "<<decay<<" and conditioning constant is "<<conditioning_constant<<endl; + U_gradient.noalias() = bProp_input*fProp_input.transpose(); - template <typename DerivedGOut, typename DerivedIn, typename DerivedGW> - void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &fProp_input, - const MatrixBase<DerivedGW> &gradient) const - { - UNCONST(DerivedGW, gradient, my_gradient); - my_gradient.noalias() = bProp_input*fProp_input.transpose(); - } + Array<double,Dynamic,1> b_current_parameter_update; + + // get the bias gradient for all dimensions in parallel + int size = b.size(); + b_gradient.noalias() = bProp_input.rowwise().sum(); + + if (L2_reg != 0) + { + U_gradient -= 2*L2_reg*U; + b_gradient -= 2*L2_reg*b; + } + + // ignore momentum? 
+ #pragma omp parallel for + //cerr<<"U gradient is "<<U_gradient<<endl; + for (int col=0; col<U.cols(); col++) { + Array<double,Dynamic,1> U_current_parameter_update; + U_running_gradient.col(col) = decay*U_running_gradient.col(col) + + (1-decay)*U_gradient.col(col).array().square().matrix(); + //cerr<<"U running gradient is "<<U_running_gradient.col(col)<<endl; + //getchar(); + U_current_parameter_update = ((U_running_parameter_update.col(col).array()+conditioning_constant).sqrt()/ + (U_running_gradient.col(col).array()+conditioning_constant).sqrt()) * + U_gradient.col(col).array(); + //cerr<<"U current parameter update is "<<U_current_parameter_update<<endl; + //getchar(); + //update the running parameter update + U_running_parameter_update.col(col) = decay*U_running_parameter_update.col(col) + + (1.-decay)*U_current_parameter_update.square().matrix(); + U.col(col) += learning_rate*U_current_parameter_update.matrix(); + } + b_running_gradient = decay*b_running_gradient + + (1.-decay)*b_gradient.array().square().matrix(); + b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/ + (b_running_gradient.array()+conditioning_constant).sqrt()) * + b_gradient.array(); + b_running_parameter_update = decay*(b_running_parameter_update) + + (1.-decay)*b_current_parameter_update.square().matrix(); + b += learning_rate*b_current_parameter_update.matrix(); + } + + + template <typename DerivedGOut, typename DerivedIn, typename DerivedGW> + void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &fProp_input, + const MatrixBase<DerivedGW> &gradient) const + { + UNCONST(DerivedGW, gradient, my_gradient); + my_gradient.noalias() = bProp_input*fProp_input.transpose(); + } }; class Output_word_embeddings @@ -149,10 +294,12 @@ class Output_word_embeddings Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W; std::vector<double> W_data; Matrix<double,Dynamic,1> b; - Matrix<double,Dynamic,Dynamic> 
W_running_gradient; - Matrix<double,Dynamic,Dynamic> W_gradient; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update; Matrix<double,Dynamic,1> b_running_gradient; Matrix<double,Dynamic,1> b_gradient; + Matrix<double,Dynamic,1> b_running_parameter_update; public: Output_word_embeddings() { } @@ -160,8 +307,8 @@ class Output_word_embeddings void resize(int rows, int cols) { - W->setZero(rows, cols); - b.setZero(rows); + W->setZero(rows, cols); + b.setZero(rows); } void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) { W = input_W; @@ -172,8 +319,31 @@ class Output_word_embeddings void write_biases(std::ofstream &b_file) { writeMatrix(b, b_file); } template <typename Engine> - void initialize(Engine &engine, bool init_normal, double init_range, double init_bias) + void initialize(Engine &engine, + bool init_normal, + double init_range, + double init_bias, + string ¶meter_update, + double adagrad_epsilon) { + + W_gradient.setZero(W->rows(),W->cols()); + b_gradient.setZero(b.size()); + if (parameter_update == "ADA") { + W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon; + b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon; + //W_gradient.setZero(W->rows(),W->cols()); + //b_gradient.setZero(b.size()); + } + if (parameter_update == "ADAD") { + W_running_gradient.setZero(W->rows(),W->cols()); + b_running_gradient.setZero(b.size()); + W_gradient.setZero(W->rows(),W->cols()); + //b_gradient.setZero(b.size()); + //W_running_parameter_update.setZero(W->rows(),W->cols()); + b_running_parameter_update.setZero(b.size()); + } + initMatrix(engine, *W, init_normal, init_range); b.fill(init_bias); } @@ -198,8 +368,12 @@ class Output_word_embeddings UNCONST(DerivedOutV, output, my_output); #pragma omp parallel for for (int instance_id = 0; 
instance_id < samples.cols(); instance_id++) - for (int sample_id = 0; sample_id < samples.rows(); sample_id++) - my_output(sample_id, instance_id) = b(samples(sample_id, instance_id)); + { + for (int sample_id = 0; sample_id < samples.rows(); sample_id++) + { + my_output(sample_id, instance_id) = b(samples(sample_id, instance_id)); + } + } USCMatrix<double> sparse_output(W->rows(), samples, my_output); uscgemm_masked(1.0, *W, input, sparse_output); my_output = sparse_output.values; // too bad, so much copying @@ -232,15 +406,86 @@ class Output_word_embeddings void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings, const MatrixBase<DerivedGOut> &bProp_input, double learning_rate, - double momentum) //not sure if we want to use momentum here + double momentum) //not sure if we want to use momentum here { // W is vocab_size x output_embedding_dimension // b is vocab_size x 1 // predicted_embeddings is output_embedding_dimension x minibatch_size // bProp_input is vocab_size x minibatch_size - W->noalias() += learning_rate * bProp_input * predicted_embeddings.transpose(); b += learning_rate * bProp_input.rowwise().sum(); + + /* + //GRADIENT CLIPPING + W->noalias() += learning_rate * + ((bProp_input * predicted_embeddings.transpose()).array().unaryExpr(Clipper())).matrix(); + b += learning_rate * (bProp_input.rowwise().sum().array().unaryExpr(Clipper())).matrix(); + //UPDATE CLIPPING + W->noalias() += (learning_rate * + (bProp_input * predicted_embeddings.transpose())).array().unaryExpr(Clipper()).matrix(); + b += (learning_rate * (bProp_input.rowwise().sum())).array().unaryExpr(Clipper()).matrix(); + */ + } + + template <typename DerivedIn, typename DerivedGOut> + void computeGradientAdagrad( + const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOut> &bProp_input, + double learning_rate) //not sure if we want to use momentum here + { + // W is vocab_size x output_embedding_dimension + // b is vocab_size x 1 + // 
predicted_embeddings is output_embedding_dimension x minibatch_size + // bProp_input is vocab_size x minibatch_sizea + W_gradient.setZero(W->rows(), W->cols()); + b_gradient.setZero(b.size()); + W_gradient.noalias() = bProp_input * predicted_embeddings.transpose(); + b_gradient.noalias() = bProp_input.rowwise().sum(); + W_running_gradient += W_gradient.array().square().matrix(); + b_running_gradient += b_gradient.array().square().matrix(); + W->noalias() += learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt()).matrix(); + b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix(); + /* + //UPDATE CLIPPING + *W += (learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); + b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); + */ + } + + template <typename DerivedIn, typename DerivedGOut> + void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOut> &bProp_input, + double learning_rate, + double conditioning_constant, + double decay) //not sure if we want to use momentum here + { + // W is vocab_size x output_embedding_dimension + // b is vocab_size x 1 + // predicted_embeddings is output_embedding_dimension x minibatch_size + // bProp_input is vocab_size x minibatch_size + Array<double,Dynamic,Dynamic> W_current_parameter_update; + Array<double,Dynamic,1> b_current_parameter_update; + W_gradient.setZero(W->rows(), W->cols()); + b_gradient.setZero(b.size()); + W_gradient.noalias() = bProp_input * predicted_embeddings.transpose(); + b_gradient.noalias() = bProp_input.rowwise().sum(); + W_running_gradient = decay*W_running_gradient + + (1.-decay)*W_gradient.array().square().matrix(); + b_running_gradient = decay*b_running_gradient+ + (1.-decay)*b_gradient.array().square().matrix(); + W_current_parameter_update = 
((W_running_parameter_update.array()+conditioning_constant).sqrt()/ + (W_running_gradient.array()+conditioning_constant).sqrt())* + W_gradient.array(); + b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/ + (b_running_gradient.array()+conditioning_constant).sqrt())* + b_gradient.array(); + W_running_parameter_update = decay*W_running_parameter_update + + (1.-decay)*W_current_parameter_update.square().matrix(); + b_running_parameter_update = decay*b_running_parameter_update + + (1.-decay)*b_current_parameter_update.square().matrix(); + + *W += learning_rate*W_current_parameter_update.matrix(); + b += learning_rate*b_current_parameter_update.matrix(); } // Sparse versions @@ -264,6 +509,7 @@ class Output_word_embeddings const MatrixBase<DerivedGOutV> &weights, double learning_rate, double momentum) //not sure if we want to use momentum here { + //cerr<<"in gradient"<<endl; USCMatrix<double> gradient_output(W->rows(), samples, weights); uscgemm(learning_rate, gradient_output, @@ -273,27 +519,64 @@ class Output_word_embeddings gradient_output, Matrix<double,Dynamic,1>::Ones(gradient_output.cols()), b); + /* + //IN ORDER TO IMPLEMENT CLIPPING, WE HAVE TO COMPUTE THE GRADIENT + //FIRST + USCMatrix<double> gradient_output(W->rows(), samples, weights); + uscgemm(1.0, + gradient_output, + predicted_embeddings.leftCols(samples.cols()).transpose(), + W_gradient); + uscgemv(1.0, + gradient_output, + Matrix<double,Dynamic,1>::Ones(weights.cols()), + b_gradient); + + int_map update_map; //stores all the parameters that have been updated + for (int sample_id=0; sample_id<samples.rows(); sample_id++) + for (int train_id=0; train_id<samples.cols(); train_id++) + update_map[samples(sample_id, train_id)] = 1; + + // Convert to std::vector for parallelization + std::vector<int> update_items; + for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) + update_items.push_back(it->first); + int num_items = 
update_items.size(); + + //#pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) + { + int update_item = update_items[item_id]; + //W->row(update_item) += learning_rate * W_gradient.row(update_item); + //b(update_item) += learning_rate * b_gradient(update_item); + //UPDATE CLIPPING + W->row(update_item) += (learning_rate * W_gradient.row(update_item)).array().unaryExpr(Clipper()).matrix(); + double update = learning_rate * b_gradient(update_item); + b(update_item) += std::min(0.5, std::max(update,-0.5)); + //GRADIENT CLIPPING + W_gradient.row(update_item).setZero(); + b_gradient(update_item) = 0.; + } + */ + //cerr<<"Finished gradient"<<endl; } template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV> void computeGradientAdagrad(const MatrixBase<DerivedIn> &predicted_embeddings, const MatrixBase<DerivedGOutI> &samples, const MatrixBase<DerivedGOutV> &weights, - double learning_rate, double momentum) //not sure if we want to use momentum here + double learning_rate) //not sure if we want to use momentum here { - W_gradient.setZero(W->rows(), W->cols()); - b_gradient.setZero(b.size()); - if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols()) - W_running_gradient.setZero(W->rows(), W->cols()); - if (b_running_gradient.size() != b.size()) - b_running_gradient.setZero(b.size()); - + //W_gradient.setZero(W->rows(), W->cols()); + //b_gradient.setZero(b.size()); + //FOR CLIPPING, WE DO NOT MULTIPLY THE GRADIENT WITH THE LEARNING RATE USCMatrix<double> gradient_output(W->rows(), samples, weights); - uscgemm(learning_rate, + uscgemm(1.0, gradient_output, predicted_embeddings.leftCols(samples.cols()).transpose(), W_gradient); - uscgemv(learning_rate, gradient_output, + uscgemv(1.0, + gradient_output, Matrix<double,Dynamic,1>::Ones(weights.cols()), b_gradient); @@ -308,16 +591,98 @@ class Output_word_embeddings update_items.push_back(it->first); int num_items = update_items.size(); - #pragma omp parallel 
for + //#pragma omp parallel for for (int item_id=0; item_id<num_items; item_id++) { int update_item = update_items[item_id]; - W_running_gradient.row(update_item).array() += W_gradient.row(update_item).array().square(); + W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix(); b_running_gradient(update_item) += b_gradient(update_item) * b_gradient(update_item); - W->row(update_item).array() += learning_rate * W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt(); + W->row(update_item) += learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix(); b(update_item) += learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item)); + /* + //UPDATE CLIPPING + W->row(update_item) += (learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt())).unaryExpr(Clipper()).matrix(); + double update = learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item)); + b(update_item) += Clipper(update);//std::min(0.5, std::max(update,-0.5)); + */ + W_gradient.row(update_item).setZero(); + b_gradient(update_item) = 0.; } + } + + template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV> + void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOutI> &samples, + const MatrixBase<DerivedGOutV> &weights, + double learning_rate, + double conditioning_constant, + double decay) //not sure if we want to use momentum here + { + //cerr<<"decay is "<<decay<<" and constant is "<<conditioning_constant<<endl; + //W_gradient.setZero(W->rows(), W->cols()); + //b_gradient.setZero(b.size()); + + USCMatrix<double> gradient_output(W->rows(), samples, weights); + uscgemm(1.0, + gradient_output, + predicted_embeddings.leftCols(samples.cols()).transpose(), + W_gradient); + uscgemv(1.0, + gradient_output, + 
Matrix<double,Dynamic,1>::Ones(weights.cols()), + b_gradient); + + int_map update_map; //stores all the parameters that have been updated + for (int sample_id=0; sample_id<samples.rows(); sample_id++) + for (int train_id=0; train_id<samples.cols(); train_id++) + update_map[samples(sample_id, train_id)] = 1; + + // Convert to std::vector for parallelization + std::vector<int> update_items; + for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) + update_items.push_back(it->first); + int num_items = update_items.size(); + + #pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) + { + Array<double,1,Dynamic> W_current_parameter_update; + double b_current_parameter_update; + + int update_item = update_items[item_id]; + W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+ + (1.-decay)*W_gradient.row(update_item).array().square().matrix(); + b_running_gradient(update_item) = decay*b_running_gradient(update_item)+ + (1.-decay)*b_gradient(update_item)*b_gradient(update_item); + //cerr<<"Output: W gradient is "<<W_gradient.row(update_item)<<endl; + //getchar(); + + //cerr<<"Output: W running gradient is "<<W_running_gradient.row(update_item)<<endl; + //getchar(); + W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/ + (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())* + W_gradient.row(update_item).array(); + b_current_parameter_update = (sqrt(b_running_parameter_update(update_item)+conditioning_constant)/ + sqrt(b_running_gradient(update_item)+conditioning_constant))* + b_gradient(update_item); + //cerr<<"Output: W current parameter update is "<<W_current_parameter_update<<endl; + //getchar(); + //cerr<<"Output: W running parameter update before is "<<W_running_parameter_update.row(update_item)<<endl; + //getchar(); + //cerr<<"the second term is "<<(1.-decay)*W_current_parameter_update.square().matrix()<<endl; + 
W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+ + (1.-decay)*(W_current_parameter_update.square().matrix()); + b_running_parameter_update(update_item) = decay*b_running_parameter_update(update_item)+ + (1.-decay)*b_current_parameter_update*b_current_parameter_update; + //cerr<<"Output: W running parameter update is "<<W_running_parameter_update.row(update_item)<<endl; + //getchar(); + W->row(update_item) += learning_rate*W_current_parameter_update.matrix(); + b(update_item) += learning_rate*b_current_parameter_update; + W_gradient.row(update_item).setZero(); + b_gradient(update_item) = 0.; } + } + template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV, typename DerivedGW, typename DerivedGb> void computeGradientCheck(const MatrixBase<DerivedIn> &predicted_embeddings, @@ -345,8 +710,9 @@ class Input_word_embeddings private: Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W; int context_size, vocab_size; - Matrix<double,Dynamic,Dynamic> W_running_gradient; - Matrix<double,Dynamic,Dynamic> W_gradient; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient; friend class model; @@ -354,29 +720,44 @@ class Input_word_embeddings Input_word_embeddings() : context_size(0), vocab_size(0) { } Input_word_embeddings(int rows, int cols, int context) { resize(rows, cols, context); } - void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) { - W = input_W; - } + void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) { + W = input_W; + } - void resize(int rows, int cols, int context) - { - context_size = context; - vocab_size = rows; - W->setZero(rows, cols); - } + void resize(int rows, int cols, int context) + { + context_size = context; + vocab_size = rows; + W->setZero(rows, cols); + } void read(std::ifstream &W_file) { 
readMatrix(W_file, *W); } void write(std::ofstream &W_file) { writeMatrix(*W, W_file); } - template <typename Engine> - void initialize(Engine &engine, bool init_normal, double init_range) - { - initMatrix(engine, - *W, - init_normal, - init_range); + template <typename Engine> + void initialize(Engine &engine, + bool init_normal, + double init_range, + string &parameter_update, + double adagrad_epsilon) + { + W_gradient.setZero(W->rows(),W->cols()); + + if (parameter_update == "ADA") { + W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon; + //W_gradient.setZero(W->rows(),W->cols()); + } + if (parameter_update == "ADAD") { + W_running_gradient.setZero(W->rows(),W->cols()); + //W_gradient.setZero(W->rows(),W->cols()); + W_running_parameter_update.setZero(W->rows(),W->cols()); } - + initMatrix(engine, + *W, + init_normal, + init_range); + } + int n_inputs() const { return -1; } int n_outputs() const { return W->cols() * context_size; } @@ -436,7 +817,7 @@ class Input_word_embeddings const MatrixBase<DerivedIn> &input_words, double learning_rate, double momentum, double L2_reg) { - int embedding_dimension = W->cols(); + int embedding_dimension = W->cols(); // W is vocab_size x embedding_dimension // input is ngram_size*vocab_size x minibatch_size @@ -453,59 +834,177 @@ class Input_word_embeddings uscgemm(learning_rate, USCMatrix<double>(W->rows(), input_words.middleRows(ngram, 1), Matrix<double,1,Dynamic>::Ones(input_words.cols())), bProp_input.block(ngram*embedding_dimension,0,embedding_dimension,input_words.cols()).transpose(), - *W); + *W); + } + + /* + //IF WE WANT TO DO GRADIENT CLIPPING, THEN WE FIRST COMPUTE THE GRADIENT AND THEN + //PERFORM CLIPPING WHILE UPDATING + + for (int ngram=0; ngram<context_size; ngram++) + { + uscgemm(1.0, + USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension,
input_words.cols()).transpose(), + W_gradient); } + int_map update_map; //stores all the parameters that have been updated + for (int ngram=0; ngram<context_size; ngram++) + { + for (int train_id=0; train_id<input_words.cols(); train_id++) + { + update_map[input_words(ngram,train_id)] = 1; + } + } + + // Convert to std::vector for parallelization + std::vector<int> update_items; + for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) + { + update_items.push_back(it->first); + } + int num_items = update_items.size(); + + #pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) + { + int update_item = update_items[item_id]; + //UPDATE CLIPPING + W->row(update_item) += (learning_rate* + W_gradient.row(update_item).array().unaryExpr(Clipper())).matrix(); + //GRADIENT CLIPPING + //W->row(update_item) += learning_rate* + // W_gradient.row(update_item).array().unaryExpr(Clipper()).matrix(); + //SETTING THE GRADIENT TO ZERO + W_gradient.row(update_item).setZero(); + } + */ } template <typename DerivedGOut, typename DerivedIn> void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input, const MatrixBase<DerivedIn> &input_words, - double learning_rate, double momentum, double L2_reg) + double learning_rate, + double L2_reg) { int embedding_dimension = W->cols(); - - W_gradient.setZero(W->rows(), W->cols()); - if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols()) - W_running_gradient.setZero(W->rows(), W->cols()); - + //W_gradient.setZero(W->rows(), W->cols()); + /* + if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols()) + W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon; + */ for (int ngram=0; ngram<context_size; ngram++) { - uscgemm(learning_rate, + uscgemm(1.0, USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, 
input_words.cols()).transpose(), W_gradient); } + int_map update_map; //stores all the parameters that have been updated + for (int ngram=0; ngram<context_size; ngram++) + { + for (int train_id=0; train_id<input_words.cols(); train_id++) + { + update_map[input_words(ngram,train_id)] = 1; + } + } + + // Convert to std::vector for parallelization + std::vector<int> update_items; + for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) + { + update_items.push_back(it->first); + } + int num_items = update_items.size(); + + #pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) + { + int update_item = update_items[item_id]; + W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix(); + W->row(update_item) += learning_rate * + (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix(); + /* + //UPDATE CLIPPING + W->row(update_item) += (learning_rate * + (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt())) + .unaryExpr(Clipper()).matrix(); + */ + W_gradient.row(update_item).setZero(); + } + } - int_map update_map; //stores all the parameters that have been updated + template <typename DerivedGOut, typename DerivedIn> + void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &input_words, + double learning_rate, + double L2_reg, + double conditioning_constant, + double decay) + { + int embedding_dimension = W->cols(); - for (int train_id=0; train_id<input_words.cols(); train_id++) - { - update_map[input_words(train_id)] = 1; - } + //W_gradient.setZero(W->rows(), W->cols()); + /* + if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols()) + W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon; + */ + for (int ngram=0; ngram<context_size; ngram++) + { + uscgemm(1.0, + USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 
1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), + W_gradient); + } + int_map update_map; //stores all the parameters that have been updated + for (int ngram=0; ngram<context_size; ngram++) + { + for (int train_id=0; train_id<input_words.cols(); train_id++) + { + update_map[input_words(ngram,train_id)] = 1; + } + } // Convert to std::vector for parallelization - std::vector<int> update_items; - for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) - { - update_items.push_back(it->first); - } - int num_items = update_items.size(); - - #pragma omp parallel for - for (int item_id=0; item_id<num_items; item_id++) - { - int update_item = update_items[item_id]; - W_running_gradient.row(update_item).array() += W_gradient.row(update_item).array().square(); - W->row(update_item).array() += learning_rate * W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt(); - } + std::vector<int> update_items; + for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) + { + update_items.push_back(it->first); } + int num_items = update_items.size(); - template <typename DerivedGOut, typename DerivedIn, typename DerivedGW> - void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &input_words, - int x, int minibatch_size, - const MatrixBase<DerivedGW> &gradient) const //not sure if we want to use momentum here + #pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) { + + Array<double,1,Dynamic> W_current_parameter_update; + int update_item = update_items[item_id]; + W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+ + (1.-decay)*W_gradient.row(update_item).array().square().matrix(); + + W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/ + 
(W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())* + W_gradient.row(update_item).array(); + + //cerr<<"Input: W current parameter update is "<<W_current_parameter_update<<endl; + //getchar(); + W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+ + (1.-decay)*W_current_parameter_update.square().matrix(); + + W->row(update_item) += learning_rate*W_current_parameter_update.matrix(); + //cerr<<"Input: After update, W is "<<W->row(update_item)<<endl; + //getchar(); + W_gradient.row(update_item).setZero(); + } + + } + + template <typename DerivedGOut, typename DerivedIn, typename DerivedGW> + void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &input_words, + int x, int minibatch_size, + const MatrixBase<DerivedGW> &gradient) const //not sure if we want to use momentum here + { UNCONST(DerivedGW, gradient, my_gradient); int embedding_dimension = W->cols(); my_gradient.setZero(); @@ -514,7 +1013,8 @@ class Input_word_embeddings USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), my_gradient); - } + } }; } // namespace nplm + diff --git a/src/neuralLM.h b/src/neuralLM.h index f451e8a..dc66206 100644 --- a/src/neuralLM.h +++ b/src/neuralLM.h @@ -2,124 +2,51 @@ #define NEURALLM_H #include <vector> -#include <iostream> -#include <fstream> -#include <memory> -#include <stdexcept> #include <cctype> #include <cstdlib> -#include <boost/lexical_cast.hpp> #include <boost/shared_ptr.hpp> -#include "../3rdparty/Eigen/Dense" +#include <Eigen/Dense> -#include "param.h" #include "util.h" -#include "model.h" -#include "propagator.h" -#include "neuralClasses.h" #include "vocabulary.h" +#include "neuralNetwork.h" + +/* + To do: + - move digit mapping into vocabulary.h + */ namespace nplm { -class neuralLMShared 
{ - public: - vocabulary input_vocab, output_vocab; - model nn; - - explicit neuralLMShared(const std::string &filename, bool premultiply = false) { - std::vector<std::string> input_words, output_words; - nn.read(filename, input_words, output_words); - input_vocab = vocabulary(input_words); - output_vocab = vocabulary(output_words); - // this is faster but takes more memory - if (premultiply) { - nn.premultiply(); - } - } -}; - -class neuralLM +class neuralLM : public neuralNetwork { - // Big stuff shared across instances. - boost::shared_ptr<neuralLMShared> shared; - - bool normalization; char map_digits; - - propagator prop; - - int ngram_size; - int width; - - double weight; - - - std::size_t cache_size; - Eigen::Matrix<int,Dynamic,Dynamic> cache_keys; - std::vector<double> cache_values; - int cache_lookups, cache_hits; - - Eigen::Matrix<int,Eigen::Dynamic,1> ngram; // buffer for lookup_ngram + boost::shared_ptr<vocabulary> vocab; int start, null; public: - neuralLM(const std::string &filename, bool premultiply = false) - : shared(new neuralLMShared(filename, premultiply)), - ngram_size(shared->nn.ngram_size), - normalization(false), - weight(1.), - map_digits(0), - width(1), - prop(shared->nn, 1), - cache_size(0), - start(shared->input_vocab.lookup_word("<s>")), - null(shared->input_vocab.lookup_word("<null>")) - { - ngram.setZero(ngram_size); - if (cache_size) - { - cache_keys.resize(ngram_size, cache_size); - cache_keys.fill(-1); - } - prop.resize(); + neuralLM() + : neuralNetwork(), + vocab(new vocabulary()), + map_digits(0) + { } - void set_normalization(bool value) { normalization = value; } - void set_log_base(double value) { weight = 1./std::log(value); } void set_map_digits(char value) { map_digits = value; } - void set_width(int width) + void set_vocabulary(const vocabulary &vocab) { - this->width = width; - prop.resize(width); + *(this->vocab) = vocab; + start = vocab.lookup_word("<s>"); + null = vocab.lookup_word("<null>"); } - const vocabulary 
&get_vocabulary() const { return shared->input_vocab; } - - int lookup_input_word(const std::string &word) const - { - if (map_digits) - for (int i=0; i<word.length(); i++) - if (isdigit(word[i])) - { - std::string mapped_word(word); - for (; i<word.length(); i++) - if (isdigit(word[i])) - mapped_word[i] = map_digits; - return shared->input_vocab.lookup_word(mapped_word); - } - return shared->input_vocab.lookup_word(word); - } + const vocabulary &get_vocabulary() const { return *(this->vocab); } int lookup_word(const std::string &word) const { - return lookup_input_word(word); - } - - int lookup_output_word(const std::string &word) const - { if (map_digits) for (int i=0; i<word.length(); i++) if (isdigit(word[i])) @@ -128,133 +55,17 @@ public: for (; i<word.length(); i++) if (isdigit(word[i])) mapped_word[i] = map_digits; - return shared->output_vocab.lookup_word(mapped_word); + return vocab->lookup_word(mapped_word); } - return shared->output_vocab.lookup_word(word); - } - - Eigen::Matrix<int,Eigen::Dynamic,1> &staging_ngram() { return ngram; } - double lookup_from_staging() { - return lookup_ngram(ngram); - } - - template <typename Derived> - double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram) - { - assert (ngram.rows() == ngram_size); - assert (ngram.cols() == 1); - - std::size_t hash; - if (cache_size) - { - // First look in cache - hash = Eigen::hash_value(ngram) % cache_size; // defined in util.h - cache_lookups++; - if (cache_keys.col(hash) == ngram) - { - cache_hits++; - return cache_values[hash]; - } - } - - // Make sure that we're single threaded. 
Multithreading doesn't help, - // and in some cases can hurt quite a lot - int save_threads = omp_get_max_threads(); - omp_set_num_threads(1); - int save_eigen_threads = Eigen::nbThreads(); - Eigen::setNbThreads(1); - #ifdef __INTEL_MKL__ - int save_mkl_threads = mkl_get_max_threads(); - mkl_set_num_threads(1); - #endif - - prop.fProp(ngram.col(0)); - - int output = ngram(ngram_size-1, 0); - double log_prob; - - start_timer(3); - if (normalization) - { - Eigen::Matrix<double,Eigen::Dynamic,1> scores(shared->output_vocab.size()); - if (prop.skip_hidden) - prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores); - else - prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); - double logz = logsum(scores.col(0)); - log_prob = weight * (scores(output, 0) - logz); - } - else - { - if (prop.skip_hidden) - log_prob = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, 0); - else - log_prob = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, 0); - } - stop_timer(3); - - if (cache_size) - { - // Update cache - cache_keys.col(hash) = ngram; - cache_values[hash] = log_prob; - } - - #ifdef __INTEL_MKL__ - mkl_set_num_threads(save_mkl_threads); - #endif - Eigen::setNbThreads(save_eigen_threads); - omp_set_num_threads(save_threads); - - return log_prob; - } - - // Look up many n-grams in parallel. 
- template <typename DerivedA, typename DerivedB> - void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const) - { - UNCONST(DerivedB, log_probs_const, log_probs); - assert (ngram.rows() == ngram_size); - assert (ngram.cols() <= width); - - prop.fProp(ngram); - - if (normalization) - { - Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> scores(shared->output_vocab.size(), ngram.cols()); - if (prop.skip_hidden) - prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores); - else - prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); - - // And softmax and loss - Matrix<double,Dynamic,Dynamic> output_probs(shared->nn.output_vocab_size, ngram.cols()); - double minibatch_log_likelihood; - SoftmaxLogLoss().fProp(scores.leftCols(ngram.cols()), ngram.row(shared->nn.ngram_size-1), output_probs, minibatch_log_likelihood); - for (int j=0; j<ngram.cols(); j++) - { - int output = ngram(ngram_size-1, j); - log_probs(0, j) = weight * output_probs(output, j); - } - } - else - { - for (int j=0; j<ngram.cols(); j++) - { - int output = ngram(ngram_size-1, j); - if (prop.skip_hidden) - log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, j); - else - log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, j); - } - } + return vocab->lookup_word(word); } double lookup_ngram(const int *ngram_a, int n) { - for (int i=0; i<ngram_size; i++) + Eigen::Matrix<int,Eigen::Dynamic,1> ngram(m->ngram_size); + for (int i=0; i<m->ngram_size; i++) { - if (i-ngram_size+n < 0) + if (i-m->ngram_size+n < 0) { if (ngram_a[0] == start) ngram(i) = start; @@ -263,10 +74,10 @@ public: } else { - ngram(i) = ngram_a[i-ngram_size+n]; + ngram(i) = ngram_a[i-m->ngram_size+n]; } } - return lookup_ngram(ngram); + return neuralNetwork::lookup_ngram(ngram); } 
double lookup_ngram(const std::vector<int> &ngram_v) @@ -274,20 +85,26 @@ public: return lookup_ngram(ngram_v.data(), ngram_v.size()); } - int get_order() const { return ngram_size; } - - void set_cache(std::size_t cache_size) + template <typename Derived> + double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram) { - this->cache_size = cache_size; - cache_keys.resize(ngram_size, cache_size); - cache_keys.fill(-1); // clears cache - cache_values.resize(cache_size); - cache_lookups = cache_hits = 0; + return neuralNetwork::lookup_ngram(ngram); + } + + template <typename DerivedA, typename DerivedB> + void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const) + { + return neuralNetwork::lookup_ngram(ngram, log_probs_const); } - double cache_hit_rate() + void read(const std::string &filename) { - return static_cast<double>(cache_hits)/cache_lookups; + std::vector<std::string> words; + m->read(filename, words); + set_vocabulary(vocabulary(words)); + resize(); + // this is faster but takes more memory + //m->premultiply(); } }; @@ -314,10 +131,13 @@ void makeNgrams(const std::vector<T> &input, std::vector<std::vector<T> > &outpu } } -inline void preprocessWords(const std::vector<std::string> &words, std::vector< std::vector<int> > &ngrams, - int ngram_size, const vocabulary &vocab, - bool numberize, bool add_start_stop, bool ngramize) -{ +inline void preprocessWords(const std::vector<std::string> &words, + std::vector< std::vector<int> > &ngrams, + int ngram_size, + const vocabulary &vocab, + bool numberize, + bool add_start_stop, + bool ngramize) { int start = vocab.lookup_word("<s>"); int stop = vocab.lookup_word("</s>"); diff --git a/src/neuralNetwork.h b/src/neuralNetwork.h new file mode 100644 index 0000000..021a425 --- /dev/null +++ b/src/neuralNetwork.h @@ -0,0 +1,188 @@ +#ifndef NEURALNETWORK_H +#define NEURALNETWORK_H + +#include <vector> +#include <boost/shared_ptr.hpp> +#include <Eigen/Dense> + 
+#include "util.h" +#include "model.h" +#include "propagator.h" +#include "neuralClasses.h" + +namespace nplm +{ + +class neuralNetwork +{ +protected: + boost::shared_ptr<model> m; + +private: + bool normalization; + double weight; + + propagator prop; + + std::size_t cache_size; + Eigen::Matrix<int,Dynamic,Dynamic> cache_keys; + std::vector<double> cache_values; + int cache_lookups, cache_hits; + +public: + neuralNetwork() + : m(new model()), + normalization(false), + weight(1.), + prop(*m, 1), + cache_size(0) + { + } + + void set_normalization(bool value) { normalization = value; } + void set_log_base(double value) { weight = 1./std::log(value); } + + // This must be called if the underlying model is resized. + void resize() { + if (cache_size) + { + cache_keys.resize(m->ngram_size, cache_size); + cache_keys.fill(-1); + } + prop.resize(); + } + + void set_width(int width) + { + prop.resize(width); + } + + template <typename Derived> + double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram) + { + assert (ngram.rows() == m->ngram_size); + assert (ngram.cols() == 1); + + std::size_t hash; + if (cache_size) + { + // First look in cache + hash = Eigen::hash_value(ngram) % cache_size; // defined in util.h + cache_lookups++; + if (cache_keys.col(hash) == ngram) + { + cache_hits++; + return cache_values[hash]; + } + } + + // Make sure that we're single threaded. 
Multithreading doesn't help, + // and in some cases can hurt quite a lot + int save_threads = omp_get_max_threads(); + omp_set_num_threads(1); + int save_eigen_threads = Eigen::nbThreads(); + Eigen::setNbThreads(1); + #ifdef __INTEL_MKL__ + int save_mkl_threads = mkl_get_max_threads(); + mkl_set_num_threads(1); + #endif + + prop.fProp(ngram.col(0)); + + int output = ngram(m->ngram_size-1, 0); + double log_prob; + + start_timer(3); + if (normalization) + { + Eigen::Matrix<double,Eigen::Dynamic,1> scores(m->output_vocab_size); + prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); + double logz = logsum(scores.col(0)); + log_prob = weight * (scores(output, 0) - logz); + } + else + { + log_prob = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, 0); + } + stop_timer(3); + + if (cache_size) + { + // Update cache + cache_keys.col(hash) = ngram; + cache_values[hash] = log_prob; + } + + #ifdef __INTEL_MKL__ + mkl_set_num_threads(save_mkl_threads); + #endif + Eigen::setNbThreads(save_eigen_threads); + omp_set_num_threads(save_threads); + + return log_prob; + } + + // Look up many n-grams in parallel. 
+ template <typename DerivedA, typename DerivedB> + void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const) + { + UNCONST(DerivedB, log_probs_const, log_probs); + assert (ngram.rows() == m->ngram_size); + //assert (ngram.cols() <= prop.get_minibatch_size()); + + prop.fProp(ngram); + + if (normalization) + { + Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> scores(m->output_vocab_size, ngram.cols()); + prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); + + // And softmax and loss + Matrix<double,Dynamic,Dynamic> output_probs(m->output_vocab_size, ngram.cols()); + double minibatch_log_likelihood; + SoftmaxLogLoss().fProp(scores.leftCols(ngram.cols()), ngram.row(m->ngram_size-1), output_probs, minibatch_log_likelihood); + for (int j=0; j<ngram.cols(); j++) + { + int output = ngram(m->ngram_size-1, j); + log_probs(0, j) = weight * output_probs(output, j); + } + } + else + { + for (int j=0; j<ngram.cols(); j++) + { + int output = ngram(m->ngram_size-1, j); + log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, j); + } + } + } + + int get_order() const { return m->ngram_size; } + + void read(const std::string &filename) + { + m->read(filename); + resize(); + // this is faster but takes more memory + //m->premultiply(); + } + + void set_cache(std::size_t cache_size) + { + this->cache_size = cache_size; + cache_keys.resize(m->ngram_size, cache_size); + cache_keys.fill(-1); // clears cache + cache_values.resize(cache_size); + cache_lookups = cache_hits = 0; + } + + double cache_hit_rate() + { + return static_cast<double>(cache_hits)/cache_lookups; + } + +}; + +} // namespace nplm + +#endif diff --git a/src/neuralTM.cpp b/src/neuralTM.cpp new file mode 100644 index 0000000..630ef58 --- /dev/null +++ b/src/neuralTM.cpp @@ -0,0 +1 @@ +#include "neuralTM.h" diff --git a/src/neuralTM.h b/src/neuralTM.h new file 
mode 100644 index 0000000..7476d91 --- /dev/null +++ b/src/neuralTM.h @@ -0,0 +1,133 @@ +#ifndef NEURALTM_H +#define NEURALTM_H + +#include <vector> +#include <cctype> +#include <cstdlib> +#include <boost/shared_ptr.hpp> + +#include <Eigen/Dense> + +#include "util.h" +#include "vocabulary.h" +#include "neuralNetwork.h" + +namespace nplm +{ + +class neuralTM : public neuralNetwork +{ + char map_digits; + boost::shared_ptr<vocabulary> input_vocab, output_vocab; + int start, null; + +public: + neuralTM() + : neuralNetwork(), + map_digits(0), + input_vocab(new vocabulary()), + output_vocab(new vocabulary()) + { + } + + void set_map_digits(char value) { map_digits = value; } + + void set_input_vocabulary(const vocabulary &vocab) + { + *(this->input_vocab) = vocab; + start = vocab.lookup_word("<s>"); + null = vocab.lookup_word("<null>"); + } + + void set_output_vocabulary(const vocabulary &vocab) + { + *(this->output_vocab) = vocab; + } + + const vocabulary &get_input_vocabulary() const { return *(this->input_vocab); } + const vocabulary &get_output_vocabulary() const { return *(this->input_vocab); } + + int lookup_input_word(const std::string &word) const + { + if (map_digits) + for (int i=0; i<word.length(); i++) + if (isdigit(word[i])) + { + std::string mapped_word(word); + for (; i<word.length(); i++) + if (isdigit(word[i])) + mapped_word[i] = map_digits; + return input_vocab->lookup_word(mapped_word); + } + return input_vocab->lookup_word(word); + } + + int lookup_output_word(const std::string &word) const + { + if (map_digits) + for (int i=0; i<word.length(); i++) + if (isdigit(word[i])) + { + std::string mapped_word(word); + for (; i<word.length(); i++) + if (isdigit(word[i])) + mapped_word[i] = map_digits; + return output_vocab->lookup_word(mapped_word); + } + return output_vocab->lookup_word(word); + } + + double lookup_ngram(const int *ngram_a, int n) + { + Eigen::Matrix<int,Eigen::Dynamic,1> ngram(m->ngram_size); + for (int i=0; i<m->ngram_size; i++) + { + if 
(i-m->ngram_size+n < 0) + { + if (ngram_a[0] == start) + ngram(i) = start; + else + ngram(i) = null; + } + else + { + ngram(i) = ngram_a[i-m->ngram_size+n]; + } + } + return neuralNetwork::lookup_ngram(ngram); + } + + double lookup_ngram(const std::vector<int> &ngram_v) + { + return lookup_ngram(ngram_v.data(), ngram_v.size()); + } + + template <typename Derived> + double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram) + { + return neuralNetwork::lookup_ngram(ngram); + } + + template <typename DerivedA, typename DerivedB> + void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const) + { + return neuralNetwork::lookup_ngram(ngram, log_probs_const); + } + + void read(const std::string &filename) + { + std::vector<std::string> input_words; + std::vector<std::string> output_words; + m->read(filename, input_words, output_words); + set_input_vocabulary(vocabulary(input_words)); + set_output_vocabulary(vocabulary(output_words)); + resize(); + // this is faster but takes more memory + //m->premultiply(); + } + +}; + +} // namespace nplm + +#endif diff --git a/src/param.h b/src/param.h index 8e42853..0615690 100644 --- a/src/param.h +++ b/src/param.h @@ -1,3 +1,4 @@ +//The framework for obtaining user arguments has been inspired by Sittichai Jiampojamarn's Many-to-Many alignment model (m2m-aligner). 
https://code.google.com/p/m2m-aligner/ #pragma once #include <string> @@ -18,7 +19,6 @@ struct param std::string input_words_file; std::string output_words_file; std::string model_prefix; - std::string init_model; int ngram_size; int vocab_size; @@ -30,12 +30,15 @@ struct param int output_embedding_dimension; std::string activation_function; std::string loss_function; + std::string parameter_update; int minibatch_size; int validation_minibatch_size; int num_epochs; double learning_rate; - + double conditioning_constant; + double decay; + double adagrad_epsilon; bool init_normal; double init_range; @@ -57,3 +60,4 @@ struct param }; } // namespace nplm + diff --git a/src/prepareNeuralLM.cpp b/src/prepareNeuralLM.cpp index 94482d0..13a534a 100644 --- a/src/prepareNeuralLM.cpp +++ b/src/prepareNeuralLM.cpp @@ -1,9 +1,20 @@ #include <iostream> #include <vector> #include <queue> -#include <boost/unordered_map.hpp> -#include <tclap/CmdLine.h> -#include <boost/algorithm/string/join.hpp> +#include <deque> +# include <fstream> +# include <iterator> + +# include <boost/unordered_map.hpp> +# include <boost/algorithm/string/join.hpp> +# include <boost/interprocess/managed_shared_memory.hpp> +# include <boost/interprocess/allocators/allocator.hpp> +# include <boost/interprocess/managed_mapped_file.hpp> +#include <boost/interprocess/containers/vector.hpp> +#include <boost/random/mersenne_twister.hpp> +#include <boost/random/uniform_int_distribution.hpp> + +# include <tclap/CmdLine.h> #include "neuralLM.h" #include "util.h" @@ -12,12 +23,27 @@ using namespace std; using namespace TCLAP; using namespace boost; using namespace nplm; +using namespace boost::random; +namespace ip = boost::interprocess; + +typedef ip::allocator<int, ip::managed_mapped_file::segment_manager> intAllocator; +typedef ip::vector<int, intAllocator> vec; +typedef ip::allocator<vec, ip::managed_mapped_file::segment_manager> vecAllocator; +//typedef allocator<ValueType, managed_shared_memory::segment_manager> 
ShmemAllocator; +//typedef multimap<int, vec, std::less<int>, ShmemAllocator> MyMap; +typedef std::vector<vec,vecAllocator> vecvec; -void writeNgrams(const vector<vector<string> > &data, - int ngram_size, const vocabulary &vocab, - bool numberize, bool add_start_stop, bool ngramize, +typedef long long int data_size_t; // training data can easily exceed 2G instances + +template<typename T> +void writeNgrams(const T &data, + int ngram_size, + const vocabulary &vocab, + bool numberize, + bool add_start_stop, + bool ngramize, const string &filename) -{ + { ofstream file(filename.c_str()); if (!file) { @@ -26,6 +52,7 @@ void writeNgrams(const vector<vector<string> > &data, } vector<vector<int> > ngrams; + for (int i=0; i<data.size(); i++) { preprocessWords(data[i], ngrams, ngram_size, vocab, numberize, add_start_stop, ngramize); // write out n-grams @@ -41,11 +68,233 @@ void writeNgrams(const vector<vector<string> > &data, file.close(); } +// Space efficient version for writing the n-grams. +// They are not read into memory. +void writeNgrams(const string &input_filename, + int ngram_size, + const vocabulary &vocab, + bool numberize, + bool add_start_stop, + bool ngramize, + const string &filename, + int train_data_size, + vector<float> &sent_weights, + const string &sent_weights_filename) +{ + ofstream file(filename.c_str()); + ofstream output_sent_weights_file(sent_weights_filename.c_str()); + if (!file) + { + cerr << "error: could not open " << filename << endl; + exit(1); + } + + ifstream input_file(input_filename.c_str()); + vector<vector<int> > ngrams; + //for (int i=0; i<train_data.size(); i++) { + string line; + int counter = 0; + cerr<<"Processed ... "; + while (getline(input_file,line) && train_data_size-- > 0) { + counter++; + if ((counter % 100000) == 0) { + cerr<<counter<<" training lines ... 
"; + } + //stringstream lstr(line); + vector<string> lstr_items; + splitBySpace(line,lstr_items); + + //for (int i=0; i<data.size(); i++) { + preprocessWords(lstr_items, + ngrams, + ngram_size, + vocab, + numberize, + add_start_stop, + ngramize); + + // write out n-grams + for (int j=0; j<ngrams.size(); j++) + { + if (sent_weights.size() != 0) { + output_sent_weights_file <<sent_weights[counter-1]<<endl; + } + for (int k=0; k<ngram_size; k++) + { + file << ngrams[j][k] << " "; + } + file << endl; + } + } + cerr<<endl; + input_file.close(); + file.close(); + output_sent_weights_file.close(); +} + +// Space efficient version for writing the n-grams. +// They are not read into memory. +void writeMmapNgrams(const string &input_filename, + int ngram_size, + const vocabulary &vocab, + bool numberize, + bool add_start_stop, + bool ngramize, + const string &filename, + unsigned long train_data_size, + data_size_t num_tokens, + bool randomize) +{ + cerr<<"Num tokens is "<<num_tokens<<endl; + cerr<<"Training data size is "<<train_data_size<<endl; + // Open the memory mapped file and create the allocators + ip::managed_mapped_file mfile(ip::create_only, + filename.c_str(), + num_tokens*ngram_size*sizeof(int)+1024UL*1024UL); + intAllocator ialloc(mfile.get_segment_manager()); + vecAllocator valloc (mfile.get_segment_manager()); + //vecvec *mMapVecVec= mfile.construct<vecvec>("data")(num_tokens,vec(ialloc),valloc); + + vec *mMapVec= mfile.construct<vec>("vector")(num_tokens*ngram_size,0,ialloc); + + cerr<<"The size of mmaped vec is "<<mMapVec->size()<<endl; + // Going over every line in the input file and + // printing the memory mapped ngrams into the + // output file + ifstream input_file(input_filename.c_str()); + //for (int i=0; i<train_data.size(); i++) { + string line; + int counter = 0; + cerr<<"Processed ... 
"; + long int train_ngram_counter = 0; + vector<vector<int> > ngrams; + while (getline(input_file,line) && train_data_size-- > 0) { + counter++; + if ((counter % 100000) ==0) { + //cerr<<"counter is "<<counter<<endl; + cerr<<counter<<" training lines ... "; + } + //stringstream lstr(line); + vector<string> lstr_items; + splitBySpace(line,lstr_items); + + //for (int i=0; i<data.size(); i++) { + preprocessWords(lstr_items, ngrams, + ngram_size, + vocab, + numberize, + add_start_stop, + ngramize); + /* + cerr<<"line is "<<endl; + cerr<<line<<endl; + cerr<<"Number of ngrams is "<<ngrams.size()<<endl; + if (ngrams.size() ==1 ){ + cerr<<"The line number was "<<counter<<endl; + cerr<<line<<endl; + } + */ + // write out n-grams in mmapped file + for (int j=0; j<ngrams.size(); j++) + { + /* + for (int k=0; k<ngram_size; k++) + { + cerr << ngrams[j][k] << " "; + } + cerr<< endl; + */ + for (int k=0; k<ngram_size; k++) { + mMapVec->at(train_ngram_counter*ngram_size+k) = ngrams[j][k]; + } + train_ngram_counter++; + //cerr<<"Train ngram counter is "<<train_ngram_counter<<endl; + } + } + cerr<<endl; + input_file.close(); + + // Shrink the file if it was overused + ip::managed_mapped_file::shrink_to_fit(filename.c_str()); + //now to randomize the items if the randomize flag was set + if (randomize == true) { + unsigned seed = 1234; //for testing only + mt19937 rng(seed); + cerr<<"Randomly shuffling data..."; + data_size_t counter =0; + while (counter < num_tokens) { + data_size_t upper_limit = counter+5000000; + long int vector_size = 5000000; + if (counter + 10000000 >= num_tokens) { + upper_limit = num_tokens; + vector_size = num_tokens - counter; + } + vector<int> temp(vector_size*ngram_size,0); + for (int i=0;i<vector_size;i++){ + for (int k=0;k<ngram_size;k++) { + temp[i*ngram_size+k] = mMapVec->at((i+counter)*ngram_size+k); + } + } + for (data_size_t i=vector_size-1; i>0; i--) + { + if (i %500000 == 0) { + cerr<<"Shuffled "<<num_tokens-1<<" instances..."; + } + data_size_t 
j = uniform_int_distribution<data_size_t>(0, i-1)(rng); + for (int k=0;k<ngram_size;k++) { + int temp_val = temp.at(i*ngram_size+k); + temp.at(i*ngram_size+k) = + temp.at(j*ngram_size+k); + temp.at(j*ngram_size+k) = temp_val; + } + } + //Putting it back + for (int i=0;i<vector_size;i++){ + for (int k=0;k<ngram_size;k++) { + mMapVec->at((i+counter)*ngram_size+k) = temp[i*ngram_size+k]; + } + } + counter = upper_limit; + } + + /* + for (data_size_t i=num_tokens-1; i>0; i--) + { + if (i %500000 == 0) { + cerr<<"Shuffled "<<num_tokens-1<<" instances..."; + } + data_size_t j = uniform_int_distribution<data_size_t>(0, i-1)(rng); + for (int k=0;k<ngram_size;k++) { + int temp_val = mMapVec->at(i*ngram_size+k); + mMapVec->at(i*ngram_size+k) = + mMapVec->at(j*ngram_size+k); + mMapVec->at(j*ngram_size+k) = temp_val; + } + } + */ + cerr<<endl; + } +} + + int main(int argc, char *argv[]) { + ios::sync_with_stdio(false); int ngram_size, vocab_size, validation_size; - bool numberize, ngramize, add_start_stop; - string train_text, train_file, validation_text, validation_file, words_file, write_words_file; + bool numberize, + ngramize, + add_start_stop, + mmap_file, + randomize; + + string train_text, + train_file, + validation_text, + validation_file, + words_file, + write_words_file, + sent_weights_text, + output_sent_weights_text; try { @@ -56,6 +305,10 @@ int main(int argc, char *argv[]) ValueArg<bool> arg_ngramize("", "ngramize", "If true, convert lines to ngrams. Default: true.", false, true, "bool", cmd); ValueArg<bool> arg_numberize("", "numberize", "If true, convert words to numbers. Default: true.", false, true, "bool", cmd); ValueArg<bool> arg_add_start_stop("", "add_start_stop", "If true, prepend <s> and append </s>. Default: true.", false, true, "bool", cmd); + ValueArg<bool> arg_mmap_file("", "mmap_file", "If true, the training file will be a memory mapped file. \n This is " + "needed if the entire training data cannot fit in memory. 
Default: false.", false, false, "bool", cmd); + + ValueArg<bool> arg_randomize("", "randomize", "If true, Randomly shuffle the training ngrams. It can only be used with mmap_file =1 . Default: false.", false, false, "bool", cmd); ValueArg<int> arg_vocab_size("", "vocab_size", "Vocabulary size.", false, -1, "int", cmd); ValueArg<string> arg_words_file("", "words_file", "File specifying words that should be included in vocabulary; all other words will be replaced by <unk>.", false, "", "string", cmd); @@ -66,6 +319,10 @@ int main(int argc, char *argv[]) ValueArg<string> arg_validation_text("", "validation_text", "Input validation data (tokenized). Overrides --validation_size. Default: none.", false, "", "string", cmd); ValueArg<string> arg_train_file("", "train_file", "Output training data (numberized n-grams).", false, "", "string", cmd); ValueArg<string> arg_train_text("", "train_text", "Input training data (tokenized).", true, "", "string", cmd); + //ValueArg<string> arg_sent_weights_text("", "sent_weights_text", "The sentence weights text", false, "", "string", cmd); + //ValueArg<string> arg_sent_weights_file("", "sent_weights_file", "The file to write the per ngram weights", false, "", "string", cmd); + + cmd.parse(argc, argv); @@ -81,6 +338,13 @@ int main(int argc, char *argv[]) numberize = arg_numberize.getValue(); ngramize = arg_ngramize.getValue(); add_start_stop = arg_add_start_stop.getValue(); + mmap_file = arg_mmap_file.getValue(); + randomize = arg_randomize.getValue(); + //sent_weights_text = arg_sent_weights_text.getValue(); + //output_sent_weights_text = arg_sent_weights_file.getValue(); + sent_weights_text = ""; + output_sent_weights_text = ""; + // check command line arguments @@ -114,6 +378,8 @@ int main(int argc, char *argv[]) cerr << arg_numberize.getDescription() << sep << arg_numberize.getValue() << endl; cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl; cerr << arg_add_start_stop.getDescription() << sep << 
arg_add_start_stop.getValue() << endl; + cerr << arg_mmap_file.getDescription() << sep << arg_mmap_file.getValue() << endl; + //cerr << arg_sent_weights_text.getDescription() << sep << arg_sent_weights_text.getValue() << endl; } catch (TCLAP::ArgException &e) { @@ -130,24 +396,123 @@ int main(int argc, char *argv[]) // } // Read in training data and validation data - vector<vector<string> > train_data; - readSentFile(train_text, train_data); - for (int i=0; i<train_data.size(); i++) { - // if data is already ngramized, set/check ngram_size - if (!ngramize) { - if (ngram_size > 0) { - if (ngram_size != train_data[i].size()) { - cerr << "Error: size of training ngrams does not match specified value of --ngram_size!" << endl; - } - } - // else if --ngram_size has not been specified, set it now - else { - ngram_size=train_data[i].size(); - } + // vector<vector<string> > train_data; + // readSentFile(train_text, train_data); + // @vaswani: No more reading the entire training file into memory + // Reading it per line with file io + + //for (int i=0; i<train_data.size(); i++) { + // Go over every line in the file and + // 1. if the !ngramize then you should check if + // we have the correct number of items per line + // 2. build the vocabulary if the words file has not + // been specified. + // Construct vocabulary + vocabulary vocab; + int start, stop; + // Add start stop if the vocabulary has not been supplied + if (words_file == "") { + vocab.insert_word("<s>"); + vocab.insert_word("</s>"); + vocab.insert_word("<null>"); + // warn user that if --numberize is not set, there will be no vocabulary! + if (!numberize) { + cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" 
<< endl; + } + } + if (mmap_file == false && randomize == true) { + cerr<<"Randomize option can only be used with mmap_file = 1"<<endl; + exit(1); + } + unordered_map<string,int> count; // For keeping word counts if no supplied vocab + + deque<vector<string> > validation_data; + int train_data_size=0; + cerr<<"Processed ... "; + data_size_t num_tokens=0; + + ifstream training(train_text.c_str()); + + string line; + while (getline(training,line)) { + train_data_size++; + //stringstream lstr(line); + vector<string> lstr_items; + splitBySpace(line,lstr_items); + // if data is already ngramized, set/check ngram_size + if (!ngramize) { + if (ngram_size > 0) { + if (ngram_size != lstr_items.size()) { + cerr << "Error: size of training ngrams does not match specified value of --ngram_size!" << endl; + } + } + // else if --ngram_size has not been specified, set it now + else { + ngram_size=lstr_items.size(); + } + } + if ((train_data_size%100000)==0){ + cerr<<train_data_size<<" lines ... "; + } + //break; + /* + if (lstr_items.size() ==1) { + cerr<<"line :"<<endl; + cerr<<line<<endl; + cerr<<"The number of items was 1"<<endl; + getchar(); + } + */ + num_tokens += lstr_items.size()+1; + if (words_file == "") { + for (int j=0; j<lstr_items.size(); j++) { + count[lstr_items[j]] += 1; + } + } + // Add to validation set if the validation size + // has not been specified + if (validation_text == "" && validation_size > 0) { + //cerr<<"validation size is "<<validation_data.size()<<endl; + if (validation_data.size() == validation_size) { + //validation_data.erase(validation_data.begin()); + validation_data.pop_front(); } + validation_data.push_back(lstr_items); + } + } + cerr<<endl; + training.close(); + //cerr<<"validation size is "<<validation_data.size()<<endl; + //getchar(); + if (validation_data.size() < validation_size) { + cerr<<"validation size is "<<validation_data.size()<<endl; + cerr << "error: requested validation size is greater than training data size" << endl; + 
exit(1); } - vector<vector<string> > validation_data; + train_data_size -= validation_size; + cerr<<"Training data size is "<<train_data_size<<endl; + + // The items in the validation data have already been counted + // Decrementing the counts of those words before building the vocabulary + for(int i=0; i<validation_data.size(); i++){ + num_tokens -= (validation_data[i].size() +1); + for (int j=0; j<validation_data[i].size();j++){ + count[validation_data[i][j]] -= 1; + if (count[validation_data[i][j]] == 0) { + count.erase(validation_data[i][j]); + } + } + } + + // Getting the top n frequent words for the vocabulary + if (words_file == "") { + vocab.insert_most_frequent(count, vocab_size); + if (vocab.size() < vocab_size) { + cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl; + } + } + //vector<vector<string> > validation_data; if (validation_text != "") { readSentFile(validation_text, validation_data); for (int i=0; i<validation_data.size(); i++) { @@ -166,22 +531,37 @@ int main(int argc, char *argv[]) } } } + //READING SENTENCE WEIGHTS IF THERE ARE ANY + vector<float> sent_weights; + if (sent_weights_text != "") { + cerr<<"Reading sentence weights from "<<sent_weights_text<<endl; + ifstream sent_weights_file(sent_weights_text.c_str()); + string line; + readWeightsFile(sent_weights_file,sent_weights); + sent_weights_file.close(); + if (sent_weights_text.size() != train_data_size) { + cerr<<"The number of sentence weights does not match the number of training sentences"<<endl; + } + } + + /* else if (validation_size > 0) { - // Create validation data - if (validation_size > train_data.size()) - { - cerr << "error: requested validation size is greater than training data size" << endl; - exit(1); - } - validation_data.insert(validation_data.end(), train_data.end()-validation_size, train_data.end()); - train_data.resize(train_data.size() - validation_size); + // Create validation data + if 
(validation_size > train_data.size()) + { + cerr << "error: requested validation size is greater than training data size" << endl; + exit(1); + } + validation_data.insert(validation_data.end(), train_data.end()-validation_size, train_data.end()); + train_data.resize(train_data.size() - validation_size); } + */ // Construct vocabulary - vocabulary vocab; - int start, stop; - + //vocabulary vocab; + //int start, stop; + // read vocabulary from file if (words_file != "") { vector<string> words; @@ -202,12 +582,12 @@ int main(int argc, char *argv[]) } } - + /* // construct vocabulary to contain top <vocab_size> most frequent words; all other words replaced by <unk> else { - vocab.insert_word("<s>"); - vocab.insert_word("</s>"); - vocab.insert_word("<null>"); + vocab.insert_word("<s>"); + vocab.insert_word("</s>"); + vocab.insert_word("<null>"); // warn user that if --numberize is not set, there will be no vocabulary! if (!numberize) { @@ -225,6 +605,7 @@ int main(int argc, char *argv[]) cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl; } } + */ // write vocabulary to file if (write_words_file != "") { @@ -236,11 +617,39 @@ int main(int argc, char *argv[]) if (train_file != "") { cerr << "Writing training data to " << train_file << endl; - writeNgrams(train_data, ngram_size, vocab, numberize, add_start_stop, ngramize, train_file); + if (mmap_file == true) { + writeMmapNgrams(train_text, + ngram_size, + vocab, + numberize, + add_start_stop, + ngramize, + train_file, + train_data_size, + num_tokens, + randomize); + } else { + writeNgrams(train_text, + ngram_size, + vocab, + numberize, + add_start_stop, + ngramize, + train_file, + train_data_size, + sent_weights, + output_sent_weights_text); + } } if (validation_file != "") { cerr << "Writing validation data to " << validation_file << endl; - writeNgrams(validation_data, ngram_size, vocab, numberize, add_start_stop, ngramize, validation_file); + 
writeNgrams(validation_data, + ngram_size, + vocab, + numberize, + add_start_stop, + ngramize, + validation_file); } } diff --git a/src/prepareNeuralTM.cpp b/src/prepareNeuralTM.cpp index 8d7cbf8..0c30fd0 100644 --- a/src/prepareNeuralTM.cpp +++ b/src/prepareNeuralTM.cpp @@ -14,7 +14,7 @@ using namespace TCLAP; using namespace boost; using namespace nplm; -void writeNgrams(const vector<vector<string> > &input_data, const vector<vector<string> > &output_data, int ngram_size, const vocabulary &input_vocab, const vocabulary &output_vocab, bool numberize, bool ngramize, const string &filename) +void writeNgrams(const vector<vector<string> > &data, int source_context_size, int target_context_size, const vocabulary &input_vocab, int source_unk, const vocabulary &output_vocab, bool numberize, const string &filename) { ofstream file(filename.c_str()); if (!file) @@ -23,107 +23,37 @@ void writeNgrams(const vector<vector<string> > &input_data, const vector<vector< exit(1); } - // check that input and output data have the same number of sentences - if (input_data.size() != output_data.size()) { - cerr << "Error: input and output data files have different number of lines" << endl; - exit(1); - } + int ngram_size = source_context_size + target_context_size + 1; // for each input and output line - int lines=input_data.size(); - if (numberize) { - for (int i=0; i<lines; i++) { - // convert each line to a set of ngrams - vector<vector<int> > input_ngrams; - vector<int> input_nums; - for (int j=0; j<input_data[i].size(); j++) { - input_nums.push_back(input_vocab.lookup_word(input_data[i][j])); + for (int i=0; i<data.size(); i++) { + vector<int> nums; + if (numberize) { + for (int j=0; j<source_context_size; j++) { + nums.push_back(input_vocab.lookup_word(data[i][j], source_unk)); } - makeNgrams(input_nums, input_ngrams, ngram_size-1); - - vector<vector<int> > output_ngrams; - vector<int> output_nums; - for (int j=0; j<output_data[i].size(); j++) { - 
output_nums.push_back(output_vocab.lookup_word(output_data[i][j])); - } - makeNgrams(output_nums, output_ngrams, 1); - - // print out cross product of input and output ngrams - for (int j=0; j < input_ngrams.size(); j++) { - for (int k=0; k < output_ngrams.size(); k++) { - int j_prime; - for (j_prime=0; j_prime < input_ngrams[j].size()-1; j_prime++) { - file << input_ngrams[j][j_prime] << " "; - } - file << input_ngrams[j][j_prime]; - int k_prime; - for (k_prime=0; k_prime < output_ngrams[k].size(); k_prime++) { - file << " " << output_ngrams[k][k_prime]; - } - file << endl; - } + for (int j=source_context_size; j<ngram_size-1; j++) { + nums.push_back(input_vocab.lookup_word(data[i][j])); } - } - } - - else { - for (int i=0; i<lines; i++) { - // convert each line to a set of ngrams - vector<vector<string> > input_ngrams; - vector<string> input_words; - for (int j=0; j<input_data[i].size(); j++) { - int unk = input_vocab.lookup_word("<unk>"); - // if word is unknown - if (input_vocab.lookup_word(input_data[i][j]) == unk) { - input_words.push_back("<unk>"); - } - // if word is known - else { - input_words.push_back(input_data[i][j]); - } + nums.push_back(output_vocab.lookup_word(data[i][ngram_size-1])); + } else { + for (int j=0; j<ngram_size-1; j++) { + nums.push_back(lexical_cast<int>(data[i][j])); } - makeNgrams(input_words, input_ngrams, ngram_size-1); - - vector<vector<string> > output_ngrams; - vector<string> output_words; - for (int j=0; j<output_data[i].size(); j++) { - int unk = output_vocab.lookup_word("<unk>"); - // if word is unknown - if (output_vocab.lookup_word(output_data[i][j]) == unk) { - output_words.push_back("<unk>"); - } - // if word is known - else { - output_words.push_back(output_data[i][j]); - } - } - makeNgrams(output_words, output_ngrams, 1); - - // print out cross product of input and output ngrams - for (int j=0; j < input_ngrams.size(); j++) { - for (int k=0; k < output_ngrams.size(); k++) { - int j_prime; - for (j_prime=0; j_prime < 
input_ngrams[j].size()-1; j_prime++) { - file << input_ngrams[j][j_prime] << " "; - } - file << input_ngrams[j][j_prime]; - int k_prime; - for (k_prime=0; k_prime < output_ngrams[k].size(); k_prime++) { - file << " " << output_ngrams[k][k_prime]; - } - file << endl; - } - } - } + nums.push_back(lexical_cast<int>(data[i][ngram_size-1])); + } + for (int k=0; k<nums.size(); k++) + file << nums[k] << " "; + file << endl; } file.close(); } int main(int argc, char *argv[]) { - int ngram_size, input_vocab_size, output_vocab_size, validation_size; - bool add_start_stop, numberize, ngramize; - string input_train_text, output_train_text, train_file, input_validation_text, output_validation_text, validation_file, write_input_words_file, write_output_words_file, input_words_file, output_words_file; + int source_context_size, target_context_size, input_vocab_size, output_vocab_size, validation_size; + bool numberize; + string train_text, train_file, validation_text, validation_file, write_input_words_file, write_output_words_file, input_words_file, output_words_file; try { @@ -131,45 +61,37 @@ int main(int argc, char *argv[]) // The options are printed in reverse order - ValueArg<bool> arg_ngramize("", "ngramize", "If true, convert lines to ngrams. Default: true.", false, true, "bool", cmd); ValueArg<bool> arg_numberize("", "numberize", "If true, convert words to numbers. Default: true.", false, true, "bool", cmd); - ValueArg<bool> arg_add_start_stop("", "add_start_stop", "If true, prepend (ngram_size-1) start symbols and postpend 1 stop symbol. 
Default: true.", false, true, "bool", cmd); ValueArg<int> arg_input_vocab_size("", "input_vocab_size", "Vocabulary size.", false, -1, "int", cmd); ValueArg<int> arg_output_vocab_size("", "output_vocab_size", "Vocabulary size.", false, -1, "int", cmd); ValueArg<string> arg_input_words_file("", "input_words_file", "File specifying words that should be included in vocabulary; all other words will be replaced by <unk>.", false, "", "string", cmd); ValueArg<string> arg_output_words_file("", "output_words_file", "File specifying words that should be included in vocabulary; all other words will be replaced by <unk>.", false, "", "string", cmd); - ValueArg<int> arg_ngram_size("", "ngram_size", "Size of n-grams.", true, -1, "int", cmd); + ValueArg<int> arg_source_context_size("", "source_context_size", "Size of input context.", true, -1, "int", cmd); + ValueArg<int> arg_target_context_size("", "target_context_size", "Size of output context.", true, -1, "int", cmd); ValueArg<string> arg_write_input_words_file("", "write_input_words_file", "Output vocabulary.", false, "", "string", cmd); ValueArg<string> arg_write_output_words_file("", "write_output_words_file", "Output vocabulary.", false, "", "string", cmd); ValueArg<int> arg_validation_size("", "validation_size", "How many lines from training data to hold out for validation. Default: 0.", false, 0, "int", cmd); ValueArg<string> arg_validation_file("", "validation_file", "Output validation data (numberized n-grams).", false, "", "string", cmd); - ValueArg<string> arg_input_validation_text("", "input_validation_text", "Input validation data (tokenized). Overrides --validation_size. Default: none.", false, "", "string", cmd); - ValueArg<string> arg_output_validation_text("", "output_validation_text", "Input validation data (tokenized). Overrides --validation_size. Default: none.", false, "", "string", cmd); + ValueArg<string> arg_validation_text("", "validation_text", "Input validation data (tokenized). 
Overrides --validation_size. Default: none.", false, "", "string", cmd); ValueArg<string> arg_train_file("", "train_file", "Output training data (numberized n-grams).", false, "", "string", cmd); - ValueArg<string> arg_input_train_text("", "input_train_text", "Input training data (tokenized).", true, "", "string", cmd); - ValueArg<string> arg_output_train_text("", "output_train_text", "Input training data (tokenized).", true, "", "string", cmd); + ValueArg<string> arg_train_text("", "train_text", "Input training data (tokenized).", true, "", "string", cmd); cmd.parse(argc, argv); - input_train_text = arg_input_train_text.getValue(); - output_train_text = arg_output_train_text.getValue(); + train_text = arg_train_text.getValue(); train_file = arg_train_file.getValue(); validation_file = arg_validation_file.getValue(); - input_validation_text = arg_input_validation_text.getValue(); - output_validation_text = arg_output_validation_text.getValue(); - input_validation_text = arg_input_validation_text.getValue(); - output_validation_text = arg_output_validation_text.getValue(); + validation_text = arg_validation_text.getValue(); validation_size = arg_validation_size.getValue(); write_input_words_file = arg_write_input_words_file.getValue(); write_output_words_file = arg_write_output_words_file.getValue(); - ngram_size = arg_ngram_size.getValue(); + source_context_size = arg_source_context_size.getValue(); + target_context_size = arg_target_context_size.getValue(); input_vocab_size = arg_input_vocab_size.getValue(); output_vocab_size = arg_output_vocab_size.getValue(); input_words_file = arg_input_words_file.getValue(); output_words_file = arg_output_words_file.getValue(); numberize = arg_numberize.getValue(); - ngramize = arg_ngramize.getValue(); - add_start_stop = arg_add_start_stop.getValue(); // check command line arguments @@ -188,34 +110,24 @@ int main(int argc, char *argv[]) exit(1); } - // Notes: - // - if --ngramize 0 is set, then - // - if --ngram_size is not 
set, it is inferred from the training file (different from current) - // - if --ngram_size is set, it is an error if the training file has a different n-gram size - // - if neither --validation_file or --validation_size is set, validation will not be performed. - // - if --numberize 0 is set, then --validation_size cannot be used. - cerr << "Command line: " << endl; cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl; const string sep(" Value: "); - cerr << arg_input_train_text.getDescription() << sep << arg_input_train_text.getValue() << endl; - cerr << arg_output_train_text.getDescription() << sep << arg_output_train_text.getValue() << endl; + cerr << arg_train_text.getDescription() << sep << arg_train_text.getValue() << endl; cerr << arg_train_file.getDescription() << sep << arg_train_file.getValue() << endl; - cerr << arg_input_validation_text.getDescription() << sep << arg_input_validation_text.getValue() << endl; - cerr << arg_output_validation_text.getDescription() << sep << arg_output_validation_text.getValue() << endl; + cerr << arg_validation_text.getDescription() << sep << arg_validation_text.getValue() << endl; cerr << arg_validation_file.getDescription() << sep << arg_validation_file.getValue() << endl; cerr << arg_validation_size.getDescription() << sep << arg_validation_size.getValue() << endl; cerr << arg_write_input_words_file.getDescription() << sep << arg_write_input_words_file.getValue() << endl; cerr << arg_write_output_words_file.getDescription() << sep << arg_write_output_words_file.getValue() << endl; - cerr << arg_ngram_size.getDescription() << sep << arg_ngram_size.getValue() << endl; + cerr << arg_source_context_size.getDescription() << sep << arg_source_context_size.getValue() << endl; + cerr << arg_target_context_size.getDescription() << sep << arg_target_context_size.getValue() << endl; cerr << arg_input_vocab_size.getDescription() << sep << arg_input_vocab_size.getValue() << endl; cerr << 
arg_output_vocab_size.getDescription() << sep << arg_output_vocab_size.getValue() << endl; cerr << arg_input_words_file.getDescription() << sep << arg_input_words_file.getValue() << endl; cerr << arg_output_words_file.getDescription() << sep << arg_output_words_file.getValue() << endl; cerr << arg_numberize.getDescription() << sep << arg_numberize.getValue() << endl; - cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl; - cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl; } catch (TCLAP::ArgException &e) { @@ -223,74 +135,32 @@ int main(int argc, char *argv[]) exit(1); } + string start(string("<s>")), stop(string("</s>")); + // Read in input training data and validation data - vector<vector<string> > input_train_data; - readSentFile(input_train_text, input_train_data); - if (add_start_stop) { - for (int i=0; i<input_train_data.size(); i++) { - vector<string> input_train_data_start_stop; - addStartStop<string>(input_train_data[i], input_train_data_start_stop, ngram_size, "<s>", "</s>"); - input_train_data[i]=input_train_data_start_stop; - } - } + vector<vector<string> > train_data; + readSentFile(train_text, train_data); - vector<vector<string> > input_validation_data; - if (input_validation_text != "") { - readSentFile(input_validation_text, input_validation_data); - if (add_start_stop) { - for (int i=0; i<input_validation_data.size(); i++) { - vector<string> input_validation_data_start_stop; - addStartStop<string>(input_validation_data[i], input_validation_data_start_stop, ngram_size, "<s>", "</s>"); - input_validation_data[i]=input_validation_data_start_stop; - } - } + vector<vector<string> > validation_data; + if (validation_text != "") { + readSentFile(validation_text, validation_data); } else if (validation_size > 0) { - if (validation_size > input_train_data.size()) + if (validation_size > train_data.size()) { - cerr << "error: requested input_validation size is greater than training data 
size" << endl; + cerr << "error: requested validation size is greater than training data size" << endl; exit(1); } - input_validation_data.insert(input_validation_data.end(), input_train_data.end()-validation_size, input_train_data.end()); - input_train_data.resize(input_train_data.size() - validation_size); + validation_data.insert(validation_data.end(), train_data.end() - validation_size, train_data.end()); + train_data.resize(train_data.size() - validation_size); } - // Read in output training data and validation data - vector<vector<string> > output_train_data; - readSentFile(output_train_text, output_train_data); - if (add_start_stop) { - for (int i=0; i<output_train_data.size(); i++) { - vector<string> output_train_data_start_stop; - addStartStop<string>(output_train_data[i], output_train_data_start_stop, 1, "<s>", "</s>"); - output_train_data[i]=output_train_data_start_stop; - } - } - - vector<vector<string> > output_validation_data; - if (output_validation_text != "") { - readSentFile(output_validation_text, output_validation_data); - if (add_start_stop) { - for (int i=0; i<output_validation_data.size(); i++) { - vector<string> output_validation_data_start_stop; - addStartStop<string>(output_validation_data[i], output_validation_data_start_stop, 1, "<s>", "</s>"); - output_validation_data[i]=output_validation_data_start_stop; - } - } - } - else if (validation_size > 0) - { - if (validation_size > output_train_data.size()) - { - cerr << "error: requested output_validation size is greater than training data size" << endl; - exit(1); - } - output_validation_data.insert(output_validation_data.end(), output_train_data.end()-validation_size, output_train_data.end()); - output_train_data.resize(output_train_data.size() - validation_size); - } + int ngram_size = source_context_size + target_context_size + 1; // Construct input vocabulary vocabulary input_vocab; + int source_unk = input_vocab.insert_word("<source_unk>"); int input_start = 
input_vocab.insert_word("<s>"); int input_stop = input_vocab.insert_word("</s>"); input_vocab.insert_word("<null>"); @@ -298,7 +168,7 @@ int main(int argc, char *argv[]) // read input vocabulary from file if (input_words_file != "") { vector<string> words; - readWordsFile(input_words_file,words); + readWordsFile(input_words_file, words); for(vector<string>::iterator it = words.begin(); it != words.end(); ++it) { input_vocab.insert_word(*it); } @@ -317,9 +187,9 @@ int main(int argc, char *argv[]) // or construct input vocabulary to contain top <input_vocab_size> most frequent words; all other words replaced by <unk> else { unordered_map<string,int> count; - for (int i=0; i<input_train_data.size(); i++) { - for (int j=0; j<input_train_data[i].size(); j++) { - count[input_train_data[i][j]] += 1; + for (int i=0; i<train_data.size(); i++) { + for (int j=0; j<ngram_size-1; j++) { + count[train_data[i][j]] += 1; } } @@ -333,12 +203,11 @@ int main(int argc, char *argv[]) vocabulary output_vocab; int output_start = output_vocab.insert_word("<s>"); int output_stop = output_vocab.insert_word("</s>"); - output_vocab.insert_word("<null>"); // read output vocabulary from file if (output_words_file != "") { vector<string> words; - readWordsFile(output_words_file,words); + readWordsFile(output_words_file, words); for(vector<string>::iterator it = words.begin(); it != words.end(); ++it) { output_vocab.insert_word(*it); } @@ -357,10 +226,8 @@ int main(int argc, char *argv[]) // or construct output vocabulary to contain top <output_vocab_size> most frequent words; all other words replaced by <unk> else { unordered_map<string,int> count; - for (int i=0; i<output_train_data.size(); i++) { - for (int j=0; j<output_train_data[i].size(); j++) { - count[output_train_data[i][j]] += 1; - } + for (int i=0; i<train_data.size(); i++) { + count[train_data[i][ngram_size-1]] += 1; } output_vocab.insert_most_frequent(count, output_vocab_size); @@ -385,12 +252,12 @@ int main(int argc, char *argv[]) 
if (train_file != "") { cerr << "Writing training data to " << train_file << endl; - writeNgrams(input_train_data, output_train_data, ngram_size, input_vocab, output_vocab, numberize, ngramize, train_file); + writeNgrams(train_data, source_context_size, target_context_size, input_vocab, source_unk, output_vocab, numberize, train_file); } if (validation_file != "") { cerr << "Writing validation data to " << validation_file << endl; - writeNgrams(input_validation_data, output_validation_data, ngram_size, input_vocab, output_vocab, numberize, ngramize, validation_file); + writeNgrams(validation_data, source_context_size, target_context_size, input_vocab, source_unk, output_vocab, numberize, validation_file); } } diff --git a/src/propagator.h b/src/propagator.h index c52a6a9..df8a7c2 100644 --- a/src/propagator.h +++ b/src/propagator.h @@ -14,7 +14,7 @@ using Eigen::Dynamic; class propagator { int minibatch_size; - const model *pnn; + model *pnn; public: Node<Input_word_embeddings> input_layer_node; @@ -23,24 +23,21 @@ public: Node<Linear_layer> second_hidden_linear_node; Node<Activation_function> second_hidden_activation_node; Node<Output_word_embeddings> output_layer_node; - bool skip_hidden; public: propagator () : minibatch_size(0), pnn(0) { } - propagator (const model &nn, int minibatch_size) + propagator (model &nn, int minibatch_size) : pnn(&nn), - // These are const for purposes of querying. The issue is that it's also used non-const for purposes of training, so X* only takes mutable classes. 
- input_layer_node(const_cast<Input_word_embeddings*>(&nn.input_layer), minibatch_size), - first_hidden_linear_node(const_cast<Linear_layer*>(&nn.first_hidden_linear), minibatch_size), - first_hidden_activation_node(const_cast<Activation_function*>(&nn.first_hidden_activation), minibatch_size), - second_hidden_linear_node(const_cast<Linear_layer*>(&nn.second_hidden_linear), minibatch_size), - second_hidden_activation_node(const_cast<Activation_function*>(&nn.second_hidden_activation), minibatch_size), - output_layer_node(const_cast<Output_word_embeddings*>(&nn.output_layer), minibatch_size), + input_layer_node(&nn.input_layer, minibatch_size), + first_hidden_linear_node(&nn.first_hidden_linear, minibatch_size), + first_hidden_activation_node(&nn.first_hidden_activation, minibatch_size), + second_hidden_linear_node(&nn.second_hidden_linear, minibatch_size), + second_hidden_activation_node(&nn.second_hidden_activation, minibatch_size), + output_layer_node(&nn.output_layer, minibatch_size), minibatch_size(minibatch_size) { - skip_hidden = (nn.num_hidden == 0); } // This must be called if the underlying model is resized. @@ -81,17 +78,17 @@ public: } first_hidden_activation_node.param->fProp(first_hidden_linear_node.fProp_matrix, first_hidden_activation_node.fProp_matrix); + //std::cerr<<"in fprop first hidden activation node fprop is "<<first_hidden_activation_node.fProp_matrix<<std::endl; + //std::getchar(); stop_timer(1); - if (!skip_hidden) { start_timer(2); second_hidden_linear_node.param->fProp(first_hidden_activation_node.fProp_matrix, second_hidden_linear_node.fProp_matrix); second_hidden_activation_node.param->fProp(second_hidden_linear_node.fProp_matrix, second_hidden_activation_node.fProp_matrix); stop_timer(2); - } // The propagation stops here because the last layer is very expensive. 
} @@ -100,7 +97,12 @@ public: template <typename DerivedIn, typename DerivedOut> void bProp(const MatrixBase<DerivedIn> &data, const MatrixBase<DerivedOut> &output, - double learning_rate, double momentum, double L2_reg) + double learning_rate, + double momentum, + double L2_reg, + std::string &parameter_update, + double conditioning_constant, + double decay) { // Output embedding layer @@ -110,113 +112,225 @@ public: stop_timer(7); start_timer(8); - if (skip_hidden) { - output_layer_node.param->computeGradient(first_hidden_activation_node.fProp_matrix, - output, - learning_rate, momentum); - } - else { - output_layer_node.param->computeGradient(second_hidden_activation_node.fProp_matrix, - output, - learning_rate, momentum); - } + if (parameter_update == "SGD") { + output_layer_node.param->computeGradient(second_hidden_activation_node.fProp_matrix, + output, + learning_rate, + momentum); + } else if (parameter_update == "ADA") { + output_layer_node.param->computeGradientAdagrad(second_hidden_activation_node.fProp_matrix, + output, + learning_rate); + } else if (parameter_update == "ADAD") { + //std::cerr<<"Adadelta gradient"<<endl; + int current_minibatch_size = second_hidden_activation_node.fProp_matrix.cols(); + output_layer_node.param->computeGradientAdadelta(second_hidden_activation_node.fProp_matrix, + output, + 1.0/current_minibatch_size, + conditioning_constant, + decay); + } else { + std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl; + } stop_timer(8); - bPropRest(data, learning_rate, momentum, L2_reg); + bPropRest(data, + learning_rate, + momentum, + L2_reg, + parameter_update, + conditioning_constant, + decay); } // Sparse version (for NCE log-likelihood) template <typename DerivedIn, typename DerivedOutI, typename DerivedOutV> void bProp(const MatrixBase<DerivedIn> &data, - const MatrixBase<DerivedOutI> &samples, const MatrixBase<DerivedOutV> &weights, - double learning_rate, double momentum, double L2_reg) + const
MatrixBase<DerivedOutI> &samples, + const MatrixBase<DerivedOutV> &weights, + double learning_rate, + double momentum, + double L2_reg, + std::string &parameter_update, + double conditioning_constant, + double decay) { // Output embedding layer start_timer(7); - output_layer_node.param->bProp(samples, weights, - output_layer_node.bProp_matrix); + output_layer_node.param->bProp(samples, + weights, + output_layer_node.bProp_matrix); stop_timer(7); start_timer(8); - if (skip_hidden) { - output_layer_node.param->computeGradient(first_hidden_activation_node.fProp_matrix, - samples, weights, - learning_rate, momentum); - } - else { - output_layer_node.param->computeGradient(second_hidden_activation_node.fProp_matrix, - samples, weights, - learning_rate, momentum); - } + if (parameter_update == "SGD") { + output_layer_node.param->computeGradient(second_hidden_activation_node.fProp_matrix, + samples, + weights, + learning_rate, + momentum); + } else if (parameter_update == "ADA") { + output_layer_node.param->computeGradientAdagrad(second_hidden_activation_node.fProp_matrix, + samples, + weights, + learning_rate); + } else if (parameter_update == "ADAD") { + int current_minibatch_size = second_hidden_activation_node.fProp_matrix.cols(); + //std::cerr<<"Adadelta gradient"<<endl; + output_layer_node.param->computeGradientAdadelta(second_hidden_activation_node.fProp_matrix, + samples, + weights, + 1.0/current_minibatch_size, + conditioning_constant, + decay); + } else { + std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl; + } + stop_timer(8); - bPropRest(data, learning_rate, momentum, L2_reg); + bPropRest(data, + learning_rate, + momentum, + L2_reg, + parameter_update, + conditioning_constant, + decay); } private: template <typename DerivedIn> void bPropRest(const MatrixBase<DerivedIn> &data, - double learning_rate, double momentum, double L2_reg) + double learning_rate, double momentum, double L2_reg, + std::string &parameter_update, + double
conditioning_constant, + double decay) { // Second hidden layer - if (skip_hidden) { - start_timer(9); - first_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix, - first_hidden_activation_node.bProp_matrix, - first_hidden_linear_node.fProp_matrix, - first_hidden_activation_node.fProp_matrix); - - first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix, - first_hidden_linear_node.bProp_matrix); - stop_timer(9); - } - else { - start_timer(9); - second_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix, - second_hidden_activation_node.bProp_matrix, - second_hidden_linear_node.fProp_matrix, - second_hidden_activation_node.fProp_matrix); - - second_hidden_linear_node.param->bProp(second_hidden_activation_node.bProp_matrix, - second_hidden_linear_node.bProp_matrix); - stop_timer(9); - - start_timer(10); - second_hidden_linear_node.param->computeGradient(second_hidden_activation_node.bProp_matrix, - first_hidden_activation_node.fProp_matrix, - learning_rate, momentum, L2_reg); - stop_timer(10); - - // First hidden layer - - start_timer(11); - first_hidden_activation_node.param->bProp(second_hidden_linear_node.bProp_matrix, - first_hidden_activation_node.bProp_matrix, - first_hidden_linear_node.fProp_matrix, - first_hidden_activation_node.fProp_matrix); - - first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix, - first_hidden_linear_node.bProp_matrix); - stop_timer(11); - } - - start_timer(12); - first_hidden_linear_node.param->computeGradient(first_hidden_activation_node.bProp_matrix, - input_layer_node.fProp_matrix, - learning_rate, momentum, L2_reg); - stop_timer(12); - // Input word embeddings - - start_timer(13); - input_layer_node.param->computeGradient(first_hidden_linear_node.bProp_matrix, - data, - learning_rate, momentum, L2_reg); - stop_timer(13); + + // All the compute gradient functions are together and the backprop + // functions are together + ////////BACKPROP//////////// + 
start_timer(9); + second_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix, + second_hidden_activation_node.bProp_matrix, + second_hidden_linear_node.fProp_matrix, + second_hidden_activation_node.fProp_matrix); + + + second_hidden_linear_node.param->bProp(second_hidden_activation_node.bProp_matrix, + second_hidden_linear_node.bProp_matrix); + stop_timer(9); + + start_timer(11); + first_hidden_activation_node.param->bProp(second_hidden_linear_node.bProp_matrix, + first_hidden_activation_node.bProp_matrix, + first_hidden_linear_node.fProp_matrix, + first_hidden_activation_node.fProp_matrix); + + first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix, + first_hidden_linear_node.bProp_matrix); + stop_timer(11); + //std::cerr<<"First hidden layer node backprop matrix is"<<first_hidden_linear_node.bProp_matrix<<std::endl; + //std::getchar(); + ////COMPUTE GRADIENT///////// + if (parameter_update == "SGD") { + start_timer(10); + second_hidden_linear_node.param->computeGradient(second_hidden_activation_node.bProp_matrix, + first_hidden_activation_node.fProp_matrix, + learning_rate, + momentum, + L2_reg); + stop_timer(10); + + // First hidden layer + + + start_timer(12); + first_hidden_linear_node.param->computeGradient(first_hidden_activation_node.bProp_matrix, + input_layer_node.fProp_matrix, + learning_rate, momentum, L2_reg); + stop_timer(12); + + // Input word embeddings + + start_timer(13); + input_layer_node.param->computeGradient(first_hidden_linear_node.bProp_matrix, + data, + learning_rate, momentum, L2_reg); + stop_timer(13); + } else if (parameter_update == "ADA") { + start_timer(10); + second_hidden_linear_node.param->computeGradientAdagrad(second_hidden_activation_node.bProp_matrix, + first_hidden_activation_node.fProp_matrix, + learning_rate, + L2_reg); + stop_timer(10); + + // First hidden layer + + + start_timer(12); + first_hidden_linear_node.param->computeGradientAdagrad(first_hidden_activation_node.bProp_matrix, 
+ input_layer_node.fProp_matrix, + learning_rate, + L2_reg); + stop_timer(12); + + // Input word embeddings + + start_timer(13); + input_layer_node.param->computeGradientAdagrad(first_hidden_linear_node.bProp_matrix, + data, + learning_rate, + L2_reg); + stop_timer(13); + } else if (parameter_update == "ADAD") { + int current_minibatch_size = first_hidden_activation_node.fProp_matrix.cols(); + //std::cerr<<"Adadelta gradient"<<endl; + start_timer(10); + second_hidden_linear_node.param->computeGradientAdadelta(second_hidden_activation_node.bProp_matrix, + first_hidden_activation_node.fProp_matrix, + 1.0/current_minibatch_size, + L2_reg, + conditioning_constant, + decay); + stop_timer(10); + //std::cerr<<"Finished gradient for second hidden linear layer"<<std::endl; + + // First hidden layer + + + start_timer(12); + first_hidden_linear_node.param->computeGradientAdadelta(first_hidden_activation_node.bProp_matrix, + input_layer_node.fProp_matrix, + 1.0/current_minibatch_size, + L2_reg, + conditioning_constant, + decay); + stop_timer(12); + + //std::cerr<<"Finished gradient for first hidden linear layer"<<std::endl; + // Input word embeddings + + start_timer(13); + input_layer_node.param->computeGradientAdadelta(first_hidden_linear_node.bProp_matrix, + data, + 1.0/current_minibatch_size, + L2_reg, + conditioning_constant, + decay); + stop_timer(13); + + //std::cerr<<"Finished gradient for first input layer"<<std::endl; + } else { + std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl; + } } }; @@ -224,3 +338,4 @@ private: } // namespace nplm #endif + diff --git a/src/python/nplm.pyx b/src/python/nplm.pyx index 290d56c..9f882ae 100644 --- a/src/python/nplm.pyx +++ b/src/python/nplm.pyx @@ -6,7 +6,7 @@ cdef class NeuralLM: self.thisptr.set_normalization(normalization) self.thisptr.set_log_base(10.) 
if type(map_digits) is str and len(map_digits) == 1: - self.thisptr.set_map_digits(map_digits) + self.thisptr.set_map_digits((<char *>map_digits)[0]) if cache_size: self.thisptr.set_cache(cache_size) diff --git a/src/python/nptm.pxd b/src/python/nptm.pxd new file mode 100644 index 0000000..bb185ac --- /dev/null +++ b/src/python/nptm.pxd @@ -0,0 +1,25 @@ +from libcpp.string cimport string +from libcpp.vector cimport vector + +cdef extern from "neuralTM.h": + cdef cppclass c_neuralTM "nplm::neuralTM": + c_neuralTM() + void set_normalization(bint) + void set_map_digits(char) + void set_log_base(double) + void read(string filename) except + + int get_order() + int lookup_input_word(string) + int lookup_output_word(string) + float lookup_ngram(vector[int]) + float lookup_ngram(int *, int) + void set_cache(int) + double cache_hit_rate() + +cdef class NeuralTM: + cdef c_neuralTM *thisptr + cdef int c_lookup_input_word(self, char *s) + cdef int c_lookup_output_word(self, char *s) + cdef float c_lookup_ngram(self, int *words, int n) + cdef readonly int order + diff --git a/src/python/nptm.pyx b/src/python/nptm.pyx new file mode 100644 index 0000000..61338a1 --- /dev/null +++ b/src/python/nptm.pyx @@ -0,0 +1,46 @@ +# distutils: language = c++ + +cdef class NeuralTM: + def __cinit__(self, normalization=False, map_digits=None, cache_size=0): + self.thisptr = new c_neuralTM() + self.thisptr.set_normalization(normalization) + self.thisptr.set_log_base(10.) 
+ if type(map_digits) is str and len(map_digits) == 1: + self.thisptr.set_map_digits((<char *>map_digits)[0]) + if cache_size: + self.thisptr.set_cache(cache_size) + + def read(self, filename): + self.thisptr.read(filename) + self.order = self.thisptr.get_order() + + def get_order(self): + return self.thisptr.get_order() + + def lookup_input_word(self, s): + return self.thisptr.lookup_input_word(s) + + def lookup_output_word(self, s): + return self.thisptr.lookup_output_word(s) + + def lookup_ngram(self, words): + if len(words) == 0: + raise ValueError("ngram is empty") + return self.thisptr.lookup_ngram(words) + + def cache_hit_rate(self): + return self.thisptr.cache_hit_rate() + + # low-level interface that can be called by other Cython modules + cdef int c_lookup_input_word(self, char *s): + cdef string ss + ss.assign(s) + return self.thisptr.lookup_input_word(ss) + + cdef int c_lookup_output_word(self, char *s): + cdef string ss + ss.assign(s) + return self.thisptr.lookup_output_word(ss) + + cdef float c_lookup_ngram(self, int *words, int n): + return self.thisptr.lookup_ngram(words, n) diff --git a/src/testNeuralLM.cpp b/src/testNeuralLM.cpp index 5805022..834e253 100644 --- a/src/testNeuralLM.cpp +++ b/src/testNeuralLM.cpp @@ -2,10 +2,11 @@ #include <fstream> #include <boost/algorithm/string/join.hpp> +//#include <boost/thread/thread.hpp> #include <tclap/CmdLine.h> -#include "../3rdparty/Eigen/Core" -#include "../3rdparty/Eigen/Dense" +#include <Eigen/Core> +#include <Eigen/Dense> #include "param.h" @@ -18,6 +19,47 @@ using namespace Eigen; using namespace nplm; +void score(neuralLM &lm, int minibatch_size, vector<int>& start, vector< vector<int> > &ngrams, + vector<double> &out) { + if (ngrams.size() == 0) return; + int ngram_size = ngrams[0].size(); + + if (minibatch_size == 0) + { + // Score one n-gram at a time. This is how the LM would be queried from a decoder. 
+ for (int sent_id=0; sent_id<start.size()-1; sent_id++) + { + double sent_log_prob = 0.0; + for (int j=start[sent_id]; j<start[sent_id+1]; j++) + sent_log_prob += lm.lookup_ngram(ngrams[j]); + out.push_back(sent_log_prob); + } + } + else + { + // Score a whole minibatch at a time. + Matrix<double,1,Dynamic> log_probs(ngrams.size()); + + Matrix<int,Dynamic,Dynamic> minibatch(ngram_size, minibatch_size); + minibatch.setZero(); + for (int test_id = 0; test_id < ngrams.size(); test_id += minibatch_size) + { + int current_minibatch_size = minibatch_size<ngrams.size()-test_id ? minibatch_size : ngrams.size()-test_id; + for (int j=0; j<current_minibatch_size; j++) + minibatch.col(j) = Map< Matrix<int,Dynamic,1> > (ngrams[test_id+j].data(), ngram_size); + lm.lookup_ngram(minibatch.leftCols(current_minibatch_size), log_probs.middleCols(test_id, current_minibatch_size)); + } + + for (int sent_id=0; sent_id<start.size()-1; sent_id++) + { + double sent_log_prob = 0.0; + for (int j=start[sent_id]; j<start[sent_id+1]; j++) + sent_log_prob += log_probs[j]; + out.push_back(sent_log_prob); + } + } +} + int main (int argc, char *argv[]) { param myParam; @@ -78,7 +120,8 @@ int main (int argc, char *argv[]) ///// Create language model - neuralLM lm(myParam.model_file); + neuralLM lm; + lm.read(myParam.model_file); lm.set_normalization(normalization); lm.set_log_base(10); lm.set_cache(1048576); @@ -89,8 +132,6 @@ int main (int argc, char *argv[]) ///// Read test data - double log_likelihood = 0.0; - ifstream test_file(myParam.test_file.c_str()); if (!test_file) { @@ -115,44 +156,33 @@ int main (int argc, char *argv[]) } start.push_back(ngrams.size()); - if (minibatch_size == 0) - { - // Score one n-gram at a time. This is how the LM would be queried from a decoder. 
- for (int sent_id=0; sent_id<start.size()-1; sent_id++) - { - double sent_log_prob = 0.0; - for (int j=start[sent_id]; j<start[sent_id+1]; j++) - sent_log_prob += lm.lookup_ngram(ngrams[j]); - cout << sent_log_prob << endl; - log_likelihood += sent_log_prob; - } - } - else - { - // Score a whole minibatch at a time. - Matrix<double,1,Dynamic> log_probs(ngrams.size()); - - Matrix<int,Dynamic,Dynamic> minibatch(ngram_size, minibatch_size); - minibatch.setZero(); - for (int test_id = 0; test_id < ngrams.size(); test_id += minibatch_size) - { - int current_minibatch_size = minibatch_size<ngrams.size()-test_id ? minibatch_size : ngrams.size()-test_id; - for (int j=0; j<current_minibatch_size; j++) - minibatch.col(j) = Map< Matrix<int,Dynamic,1> > (ngrams[test_id+j].data(), ngram_size); - lm.lookup_ngram(minibatch.leftCols(current_minibatch_size), log_probs.middleCols(test_id, current_minibatch_size)); - } + int num_threads = 1; + vector< vector<double> > sent_log_probs(num_threads); - for (int sent_id=0; sent_id<start.size()-1; sent_id++) - { - double sent_log_prob = 0.0; - for (int j=start[sent_id]; j<start[sent_id+1]; j++) - sent_log_prob += log_probs[j]; - cout << sent_log_prob << endl; - log_likelihood += sent_log_prob; - } + /* + // Test thread safety + boost::thread_group tg; + for (int t=0; t < num_threads; t++) { + tg.create_thread(boost::bind(score, lm, minibatch_size, boost::ref(start), boost::ref(ngrams), boost::ref(sent_log_probs[t]))); // copy lm + } + tg.join_all(); + */ + score(lm, minibatch_size, start, ngrams, sent_log_probs[0]); + + vector<double> log_likelihood(num_threads); + std::fill(log_likelihood.begin(), log_likelihood.end(), 0.0); + for (int i=0; i<sent_log_probs[0].size(); i++) { + for (int t=0; t<num_threads; t++) + cout << sent_log_probs[t][i] << "\t"; + cout << endl; + for (int t=0; t<num_threads; t++) + log_likelihood[t] += sent_log_probs[t][i]; } - cerr << "Test log10-likelihood: " << log_likelihood << endl; + cerr << "Test 
log10-likelihood: "; + for (int t=0; t<num_threads; t++) + cerr << log_likelihood[t] << " "; + cerr << endl; #ifdef USE_CHRONO cerr << "Propagation times:"; for (int i=0; i<timer.size(); i++) diff --git a/src/testNeuralNetwork.cpp b/src/testNeuralNetwork.cpp index 1b4820e..3e1640e 100644 --- a/src/testNeuralNetwork.cpp +++ b/src/testNeuralNetwork.cpp @@ -80,7 +80,7 @@ int main (int argc, char *argv[]) cerr << "Number of test instances: " << test_data_size << endl; Map< Matrix<int,Dynamic,Dynamic> > test_data(test_data_flat.data(), myParam.ngram_size, test_data_size); - + ///// Score test data int num_batches = (test_data_size-1)/myParam.minibatch_size + 1; @@ -101,10 +101,7 @@ int main (int argc, char *argv[]) prop.fProp(minibatch.topRows(myParam.ngram_size-1)); // Do full forward prop through output word embedding layer - if (prop.skip_hidden) - prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores); - else - prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); + prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); // And softmax and loss double minibatch_log_likelihood; diff --git a/src/trainNeuralNetwork.cpp b/src/trainNeuralNetwork.cpp index 57323d9..e231c20 100644 --- a/src/trainNeuralNetwork.cpp +++ b/src/trainNeuralNetwork.cpp @@ -11,9 +11,13 @@ #include <boost/lexical_cast.hpp> #include <boost/random/mersenne_twister.hpp> #include <boost/algorithm/string/join.hpp> +# include <boost/interprocess/managed_shared_memory.hpp> +# include <boost/interprocess/allocators/allocator.hpp> +# include <boost/interprocess/managed_mapped_file.hpp> +#include <boost/interprocess/containers/vector.hpp> -#include "../3rdparty/Eigen/Dense" -#include "../3rdparty/Eigen/Sparse" +#include <Eigen/Dense> +#include <Eigen/Sparse> #include "maybe_omp.h" #include <tclap/CmdLine.h> @@ -36,16 +40,24 @@ using namespace boost::random; using namespace nplm; +namespace ip = 
boost::interprocess; typedef unordered_map<Matrix<int,Dynamic,1>, double> vector_map; +typedef ip::allocator<int, ip::managed_mapped_file::segment_manager> intAllocator; +typedef ip::vector<int, intAllocator> vec; +typedef ip::allocator<vec, ip::managed_mapped_file::segment_manager> vecAllocator; + + typedef long long int data_size_t; // training data can easily exceed 2G instances int main(int argc, char** argv) { + ios::sync_with_stdio(false); + bool use_mmap_file, randomize; param myParam; try { // program options // - CmdLine cmd("Trains a two-layer neural probabilistic language model.", ' ' , "0.1"); + CmdLine cmd("Trains a two-layer neural probabilistic language model.", ' ' , "0.3\n",""); // The options are printed in reverse order @@ -60,14 +72,23 @@ int main(int argc, char** argv) ValueArg<double> normalization_init("", "normalization_init", "Initial normalization parameter. Default: 0.", false, 0.0, "double", cmd); ValueArg<bool> normalization("", "normalization", "Learn individual normalization factors during training. 1 = yes, 0 = no. Default: 0.", false, 0, "bool", cmd); - ValueArg<int> num_noise_samples("", "num_noise_samples", "Number of noise samples for noise-contrastive estimation. Default: 25.", false, 25, "int", cmd); + ValueArg<bool> mmap_file("", "mmap_file", "Use memory mapped files. This is useful if the entire data cannot fit in memory. prepareNeuralLM can generate memory mapped files", false, 0, "bool", cmd); + + ValueArg<bool> arg_randomize("", "randomize", "Randomize training instances for better training. 1 = yes, 0 = no. Default: 1.", false, true, "bool", cmd); + + ValueArg<int> num_noise_samples("", "num_noise_samples", "Number of noise samples for noise-contrastive estimation. Default: 100.", false, 100, "int", cmd); ValueArg<double> L2_reg("", "L2_reg", "L2 regularization strength (hidden layer weights only). 
Default: 0.", false, 0.0, "double", cmd); - ValueArg<double> learning_rate("", "learning_rate", "Learning rate for stochastic gradient ascent. Default: 0.01.", false, 0.01, "double", cmd); + ValueArg<double> learning_rate("", "learning_rate", "Learning rate for stochastic gradient ascent. Default: 1.", false, 1., "double", cmd); + + ValueArg<double> conditioning_constant("", "conditioning_constant", "Constant to condition the RMS of the expected square of the gradient in ADADELTA. Default: 10E-3.", false, 10E-3, "double", cmd); + ValueArg<double> decay("", "decay", "Decay for ADADELTA. Default: 0.95", false, 0.95, "double", cmd); + ValueArg<double> adagrad_epsilon("", "adagrad_epsilon", "Constant to initialize the L2 squared norm of the gradients with.\ + Default: 10E-3", false, 10E-3, "double", cmd); ValueArg<int> validation_minibatch_size("", "validation_minibatch_size", "Minibatch size for validation. Default: 64.", false, 64, "int", cmd); - ValueArg<int> minibatch_size("", "minibatch_size", "Minibatch size (for training). Default: 64.", false, 64, "int", cmd); + ValueArg<int> minibatch_size("", "minibatch_size", "Minibatch size (for training). Default: 1000.", false, 1000, "int", cmd); ValueArg<int> num_epochs("", "num_epochs", "Number of epochs. Default: 10.", false, 10, "int", cmd); @@ -90,21 +111,29 @@ int main(int argc, char** argv) ValueArg<string> model_prefix("", "model_prefix", "Prefix for output model files." , false, "", "string", cmd); ValueArg<string> words_file("", "words_file", "Vocabulary." , false, "", "string", cmd); + ValueArg<string> parameter_update("", "parameter_update", "parameter update type.\n Stochastic Gradient Descent(SGD)\n \ + ADAGRAD(ADA)\n \ + ADADELTA(ADAD)" , false, "SGD", "string", cmd); ValueArg<string> input_words_file("", "input_words_file", "Vocabulary." , false, "", "string", cmd); ValueArg<string> output_words_file("", "output_words_file", "Vocabulary." 
, false, "", "string", cmd); ValueArg<string> validation_file("", "validation_file", "Validation data (one numberized example per line)." , false, "", "string", cmd); ValueArg<string> train_file("", "train_file", "Training data (one numberized example per line)." , true, "", "string", cmd); - ValueArg<string> init_model("", "init_model", "Initialize parameters from existing model (to continue interrupted training)", false, "", "string", cmd); + + ValueArg<string> model_file("", "model_file", "Model file.", false, "", "string", cmd); + cmd.parse(argc, argv); // define program parameters // + use_mmap_file = mmap_file.getValue(); + randomize = arg_randomize.getValue(); + myParam.model_file = model_file.getValue(); myParam.train_file = train_file.getValue(); myParam.validation_file = validation_file.getValue(); myParam.input_words_file = input_words_file.getValue(); myParam.output_words_file = output_words_file.getValue(); if (words_file.getValue() != "") - myParam.input_words_file = myParam.output_words_file = words_file.getValue(); + myParam.input_words_file = myParam.output_words_file = words_file.getValue(); myParam.model_prefix = model_prefix.getValue(); @@ -112,9 +141,9 @@ int main(int argc, char** argv) myParam.vocab_size = vocab_size.getValue(); myParam.input_vocab_size = input_vocab_size.getValue(); myParam.output_vocab_size = output_vocab_size.getValue(); - if (vocab_size.getValue() >= 0) - myParam.input_vocab_size = myParam.output_vocab_size = vocab_size.getValue(); - + if (vocab_size.getValue() >= 0) { + myParam.input_vocab_size = myParam.output_vocab_size = vocab_size.getValue(); + } myParam.num_hidden = num_hidden.getValue(); myParam.activation_function = activation_function.getValue(); myParam.loss_function = loss_function.getValue(); @@ -125,13 +154,17 @@ int main(int argc, char** argv) myParam.input_embedding_dimension = input_embedding_dimension.getValue(); myParam.output_embedding_dimension = output_embedding_dimension.getValue(); - if 
(embedding_dimension.getValue() >= 0) + if (embedding_dimension.getValue() >= 0) { myParam.input_embedding_dimension = myParam.output_embedding_dimension = embedding_dimension.getValue(); + } myParam.minibatch_size = minibatch_size.getValue(); myParam.validation_minibatch_size = validation_minibatch_size.getValue(); myParam.num_epochs= num_epochs.getValue(); myParam.learning_rate = learning_rate.getValue(); + myParam.conditioning_constant = conditioning_constant.getValue(); + myParam.decay = decay.getValue(); + myParam.adagrad_epsilon = adagrad_epsilon.getValue(); myParam.use_momentum = use_momentum.getValue(); myParam.share_embeddings = share_embeddings.getValue(); myParam.normalization = normalization.getValue(); @@ -140,8 +173,8 @@ int main(int argc, char** argv) myParam.L2_reg = L2_reg.getValue(); myParam.init_normal= init_normal.getValue(); myParam.init_range = init_range.getValue(); - myParam.init_model = init_model.getValue(); myParam.normalization_init = normalization_init.getValue(); + myParam.parameter_update = parameter_update.getValue(); cerr << "Command line: " << endl; cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl; @@ -156,72 +189,70 @@ int main(int argc, char** argv) cerr << ngram_size.getDescription() << sep << ngram_size.getValue() << endl; cerr << input_vocab_size.getDescription() << sep << input_vocab_size.getValue() << endl; cerr << output_vocab_size.getDescription() << sep << output_vocab_size.getValue() << endl; + cerr << mmap_file.getDescription() << sep << mmap_file.getValue() << endl; if (embedding_dimension.getValue() >= 0) { - cerr << embedding_dimension.getDescription() << sep << embedding_dimension.getValue() << endl; + cerr << embedding_dimension.getDescription() << sep << embedding_dimension.getValue() << endl; } else { - cerr << input_embedding_dimension.getDescription() << sep << input_embedding_dimension.getValue() << endl; - cerr << output_embedding_dimension.getDescription() << sep << 
output_embedding_dimension.getValue() << endl; + cerr << input_embedding_dimension.getDescription() << sep << input_embedding_dimension.getValue() << endl; + cerr << output_embedding_dimension.getDescription() << sep << output_embedding_dimension.getValue() << endl; } cerr << share_embeddings.getDescription() << sep << share_embeddings.getValue() << endl; if (share_embeddings.getValue() && input_embedding_dimension.getValue() != output_embedding_dimension.getValue()) { - cerr << "error: sharing input and output embeddings requires that input and output embeddings have same dimension" << endl; - exit(1); + cerr << "error: sharing input and output embeddings requires that input and output embeddings have same dimension" << endl; + exit(1); } cerr << num_hidden.getDescription() << sep << num_hidden.getValue() << endl; if (string_to_activation_function(activation_function.getValue()) == InvalidFunction) { - cerr << "error: invalid activation function: " << activation_function.getValue() << endl; - exit(1); + cerr << "error: invalid activation function: " << activation_function.getValue() << endl; + exit(1); } cerr << activation_function.getDescription() << sep << activation_function.getValue() << endl; if (string_to_loss_function(loss_function.getValue()) == InvalidLoss) { - cerr << "error: invalid loss function: " << loss_function.getValue() << endl; - exit(1); + cerr << "error: invalid loss function: " << loss_function.getValue() << endl; + exit(1); } cerr << loss_function.getDescription() << sep << loss_function.getValue() << endl; - if (init_model.getValue() != "") { - cerr << init_model.getDescription() << sep << init_model.getValue() << endl; - } - else { - cerr << init_normal.getDescription() << sep << init_normal.getValue() << endl; - cerr << init_range.getDescription() << sep << init_range.getValue() << endl; - } + cerr << init_normal.getDescription() << sep << init_normal.getValue() << endl; + cerr << init_range.getDescription() << sep << 
init_range.getValue() << endl; cerr << num_epochs.getDescription() << sep << num_epochs.getValue() << endl; cerr << minibatch_size.getDescription() << sep << minibatch_size.getValue() << endl; - if (myParam.validation_file != "") - cerr << validation_minibatch_size.getDescription() << sep << validation_minibatch_size.getValue() << endl; + if (myParam.validation_file != "") { + cerr << validation_minibatch_size.getDescription() << sep << validation_minibatch_size.getValue() << endl; + } cerr << learning_rate.getDescription() << sep << learning_rate.getValue() << endl; cerr << L2_reg.getDescription() << sep << L2_reg.getValue() << endl; cerr << num_noise_samples.getDescription() << sep << num_noise_samples.getValue() << endl; cerr << normalization.getDescription() << sep << normalization.getValue() << endl; - if (myParam.normalization) - cerr << normalization_init.getDescription() << sep << normalization_init.getValue() << endl; + if (myParam.normalization){ + cerr << normalization_init.getDescription() << sep << normalization_init.getValue() << endl; + } cerr << use_momentum.getDescription() << sep << use_momentum.getValue() << endl; if (myParam.use_momentum) { - cerr << initial_momentum.getDescription() << sep << initial_momentum.getValue() << endl; - cerr << final_momentum.getDescription() << sep << final_momentum.getValue() << endl; + cerr << initial_momentum.getDescription() << sep << initial_momentum.getValue() << endl; + cerr << final_momentum.getDescription() << sep << final_momentum.getValue() << endl; } cerr << num_threads.getDescription() << sep << num_threads.getValue() << endl; if (unigram_probs_file.getValue() != "") { - cerr << "Note: --unigram_probs_file is deprecated and ignored." << endl; + cerr << "Note: --unigram_probs_file is deprecated and ignored." 
<< endl; } } catch (TCLAP::ArgException &e) @@ -241,13 +272,114 @@ int main(int argc, char** argv) ///////////////////////////////////////////////////////////////////////////////////// // Read training data + vector<int> training_data_flat; - readDataFile(myParam.train_file, myParam.ngram_size, training_data_flat, myParam.minibatch_size); - data_size_t training_data_size = training_data_flat.size() / myParam.ngram_size; + vec * training_data_flat_mmap; + data_size_t training_data_size; //num_tokens; + ip::managed_mapped_file mmap_file; + if (use_mmap_file == false) { + cerr<<"Reading data from regular text file "<<endl; + readDataFile(myParam.train_file, myParam.ngram_size, training_data_flat, myParam.minibatch_size); + training_data_size = training_data_flat.size()/myParam.ngram_size; + } else { + cerr<<"Using mmaped file"<<endl; + mmap_file = ip::managed_mapped_file(ip::open_only,myParam.train_file.c_str()); + training_data_flat_mmap = mmap_file.find<vec>("vector").first; + cerr<<"Size of mmaped vector is "<<training_data_flat_mmap->size()<<endl; + training_data_size = training_data_flat_mmap->size()/myParam.ngram_size; + //randomly shuffle the data for better learning. 
The shuffling will + //be different for a standard stl vector + // Randomly shuffle training data to improve learning + if (randomize == true) { + cerr<<"Randomly shuffling data..."; + data_size_t counter =0; + while (counter < training_data_size) { + data_size_t upper_limit = counter+5000000; + long int vector_size = 5000000; + if (counter + 10000000 >= training_data_size) { + upper_limit = training_data_size; + vector_size = training_data_size - counter; + } + vector<int> temp(vector_size*myParam.ngram_size,0); + for (int i=0;i<vector_size;i++){ + for (int k=0;k<myParam.ngram_size;k++) { + temp[i*myParam.ngram_size+k] = training_data_flat_mmap->at((i+counter)*myParam.ngram_size+k); + } + } + /* + for (data_size_t i=upper_limit; i>counter; i--) + { + if (i %500000 == 0) { + cerr<<"Shuffled "<<training_data_size-1<<" instances..."; + } + data_size_t j = uniform_int_distribution<data_size_t>(0, i-1)(rng); + for (int k=0;k<myParam.ngram_size;k++) { + int temp_val = training_data_flat_mmap->at(i*myParam.ngram_size+k); + training_data_flat_mmap->at(i*myParam.ngram_size+k) = + training_data_flat_mmap->at(j*myParam.ngram_size+k); + training_data_flat_mmap->at(j*myParam.ngram_size+k) = temp_val; + } + } + */ + for (data_size_t i=vector_size-1; i>0; i--) + { + if (i %500000 == 0) { + cerr<<"Shuffled "<<training_data_size-1<<" instances..."; + } + data_size_t j = uniform_int_distribution<data_size_t>(0, i-1)(rng); + for (int k=0;k<myParam.ngram_size;k++) { + int temp_val = temp.at(i*myParam.ngram_size+k); + temp.at(i*myParam.ngram_size+k) = + temp.at(j*myParam.ngram_size+k); + temp.at(j*myParam.ngram_size+k) = temp_val; + } + } + //Putting it back + for (int i=0;i<vector_size;i++){ + for (int k=0;k<myParam.ngram_size;k++) { + training_data_flat_mmap->at((i+counter)*myParam.ngram_size+k) = temp[i*myParam.ngram_size+k]; + } + } + counter = upper_limit; + } + /* + for (data_size_t i=training_data_size-1; i>0; i--) + { + if (i %500000 == 0) { + cerr<<"Shuffled 
"<<training_data_size-1<<" instances..."; + } + data_size_t j = uniform_int_distribution<data_size_t>(0, i-1)(rng); + for (int k=0;k<myParam.ngram_size;k++) { + int temp_val = training_data_flat_mmap->at(i*myParam.ngram_size+k); + training_data_flat_mmap->at(i*myParam.ngram_size+k) = + training_data_flat_mmap->at(j*myParam.ngram_size+k); + training_data_flat_mmap->at(j*myParam.ngram_size+k) = temp_val; + } + } + */ + cerr<<endl; + } + } + //cerr<<"Num tokens "<<num_tokens<<endl; + //data_size_t training_data_size = num_tokens / myParam.ngram_size; cerr << "Number of training instances: "<< training_data_size << endl; - - Map< Matrix<int,Dynamic,Dynamic> > training_data(training_data_flat.data(), myParam.ngram_size, training_data_size); - + + Matrix<int,Dynamic,Dynamic> training_data; + //(training_data_flat.data(), myParam.ngram_size, training_data_size); + + #ifdef MAP + cerr<<"Setting up eigen map"<<endl; + if (use_mmap_file == false) { + training_data = Map< Matrix<int,Dynamic,Dynamic> >(training_data_flat.data(), myParam.ngram_size, training_data_size); + } else { + training_data = Map< Matrix<int,Dynamic,Dynamic> >(training_data_flat_mmap->data().get(), myParam.ngram_size, training_data_size); + } + cerr<<"Created eigen map"<<endl; + #else + if (use_mmap_file == false) { + training_data = Map< Matrix<int,Dynamic,Dynamic> >(training_data_flat.data(), myParam.ngram_size, training_data_size); + } + #endif // If neither --input_vocab_size nor --input_words_file is given, set input_vocab_size to the maximum word index if (myParam.input_vocab_size == 0 and myParam.input_words_file == "") { @@ -255,16 +387,18 @@ int main(int argc, char** argv) } // If neither --output_vocab_size nor --output_words_file is given, set output_vocab_size to the maximum word index - if (myParam.output_vocab_size == 0 and myParam.words_file == "") + if (myParam.output_vocab_size == 0 and myParam.output_words_file == "") { myParam.output_vocab_size = 
training_data.row(myParam.ngram_size-1).maxCoeff()+1; } - - // Randomly shuffle training data to improve learning - for (data_size_t i=training_data_size-1; i>0; i--) - { - data_size_t j = boost::random::uniform_int_distribution<data_size_t>(0, i-1)(rng); - training_data.col(i).swap(training_data.col(j)); + if (use_mmap_file == false && randomize == true) { + cerr<<"Randomly shuffling data..."<<endl; + // Randomly shuffle training data to improve learning + for (data_size_t i=training_data_size-1; i>0; i--) + { + data_size_t j = uniform_int_distribution<data_size_t>(0, i-1)(rng); + training_data.col(i).swap(training_data.col(j)); + } } // Read validation data @@ -273,9 +407,9 @@ int main(int argc, char** argv) if (myParam.validation_file != "") { - readDataFile(myParam.validation_file, myParam.ngram_size, validation_data_flat); - validation_data_size = validation_data_flat.size() / myParam.ngram_size; - cerr << "Number of validation instances: " << validation_data_size << endl; + readDataFile(myParam.validation_file, myParam.ngram_size, validation_data_flat); + validation_data_size = validation_data_flat.size() / myParam.ngram_size; + cerr << "Number of validation instances: " << validation_data_size << endl; } Map< Matrix<int,Dynamic,Dynamic> > validation_data(validation_data_flat.data(), myParam.ngram_size, validation_data_size); @@ -303,28 +437,41 @@ int main(int argc, char** argv) vector<data_size_t> unigram_counts(myParam.output_vocab_size); for (data_size_t train_id=0; train_id < training_data_size; train_id++) { - int output_word = training_data(myParam.ngram_size-1, train_id); - unigram_counts[output_word] += 1; + int output_word; + if (use_mmap_file == false) { + output_word = training_data(myParam.ngram_size-1, train_id); + } else { + //cerr<<"mmap word is "<<training_data_flat_mmap->at((train_id+1)*myParam.ngram_size - 1)<<endl; + output_word = training_data_flat_mmap->at((train_id+1)*myParam.ngram_size - 1); + } + //cerr<<"output word is 
"<<output_word<<endl; + unigram_counts[output_word] += 1; } multinomial<data_size_t> unigram (unigram_counts); ///// Create and initialize the neural network and associated propagators. - - model nn(myParam.ngram_size, - myParam.input_vocab_size, - myParam.output_vocab_size, - myParam.input_embedding_dimension, - myParam.num_hidden, - myParam.output_embedding_dimension, - myParam.share_embeddings); - - if (myParam.init_model != "") { - nn.read(myParam.init_model); - } - else { - nn.initialize(rng, myParam.init_normal, myParam.init_range, -log(myParam.output_vocab_size)); + model nn; + // IF THE MODEL FILE HAS BEEN DEFINED, THEN + // LOAD THE NEURAL NETWORK MODEL + if (myParam.model_file != ""){ + nn.read(myParam.model_file); + cerr<<"reading the model"<<endl; + } else { + nn.resize(myParam.ngram_size, + myParam.input_vocab_size, + myParam.output_vocab_size, + myParam.input_embedding_dimension, + myParam.num_hidden, + myParam.output_embedding_dimension); + + nn.initialize(rng, + myParam.init_normal, + myParam.init_range, + -log(myParam.output_vocab_size), + myParam.parameter_update, + myParam.adagrad_epsilon); + nn.set_activation_function(string_to_activation_function(myParam.activation_function)); } - nn.set_activation_function(string_to_activation_function(myParam.activation_function)); loss_function_type loss_function = string_to_loss_function(myParam.loss_function); propagator prop(nn, myParam.minibatch_size); @@ -360,14 +507,14 @@ int main(int argc, char** argv) if (myParam.normalization) { - for (data_size_t i=0;i<training_data_size;i++) - { - Matrix<int,Dynamic,1> context = training_data.block(0,i,ngram_size-1,1); - if (c_h.find(context) == c_h.end()) - { - c_h[context] = -myParam.normalization_init; - } - } + for (data_size_t i=0;i<training_data_size;i++) + { + Matrix<int,Dynamic,1> context = training_data.block(0,i,ngram_size-1,1); + if (c_h.find(context) == c_h.end()) + { + c_h[context] = -myParam.normalization_init; + } + } } for (int epoch=0; 
epoch<myParam.num_epochs; epoch++) @@ -403,9 +550,29 @@ int main(int argc, char** argv) } data_size_t minibatch_start_index = minibatch_size * batch; - int current_minibatch_size = min(static_cast<data_size_t>(minibatch_size), training_data_size - minibatch_start_index); - Matrix<int,Dynamic,Dynamic> minibatch = training_data.middleCols(minibatch_start_index, current_minibatch_size); + int current_minibatch_size = min(static_cast<data_size_t>(minibatch_size), training_data_size - minibatch_start_index); + #ifdef MAP + Matrix<int,Dynamic,Dynamic> minibatch = training_data.middleCols(minibatch_start_index, current_minibatch_size); + #else + //ALTERNATIVE OPTION IF YOU'RE NOT USING eigen map interface on the mmapped file + Matrix<int,Dynamic,Dynamic> minibatch;// = training_data.middleCols(minibatch_start_index, current_minibatch_size); + //cerr<<"Minibatch start index "<<minibatch_start_index<<endl; + //cerr<<"Minibatch size "<<current_minibatch_size<<endl; + if (use_mmap_file == true) { + minibatch.setZero(ngram_size,current_minibatch_size); + //now reading the ngrams from the mmaped file + for (int k=0; k<ngram_size; k++){ + for (data_size_t index = 0 ; index<current_minibatch_size; index++) { + data_size_t current_index = index + minibatch_start_index; + //cerr<<"the value in the mmap file "<<index<<" "<<k<<" is "<<training_data_flat_mmap->at(current_index*ngram_size+k)<<endl; + minibatch(k,index) = training_data_flat_mmap->at(current_index*ngram_size+k); + } + } + } else { + minibatch = training_data.middleCols(minibatch_start_index, current_minibatch_size); + } + #endif double adjusted_learning_rate = current_learning_rate/current_minibatch_size; //cerr<<"Adjusted learning rate: "<<adjusted_learning_rate<<endl; @@ -426,103 +593,106 @@ int main(int argc, char** argv) if (loss_function == NCELoss) { - ///// Noise-contrastive estimation + ///// Noise-contrastive estimation - // Generate noise samples. Gather positive and negative samples into matrix. 
+ // Generate noise samples. Gather positive and negative samples into matrix. - start_timer(3); + start_timer(3); - minibatch_samples.block(0, 0, 1, current_minibatch_size) = minibatch.bottomRows(1); - - for (int sample_id = 1; sample_id < num_noise_samples+1; sample_id++) - for (int train_id = 0; train_id < current_minibatch_size; train_id++) - minibatch_samples(sample_id, train_id) = unigram.sample(rng); - - stop_timer(3); + minibatch_samples.block(0, 0, 1, current_minibatch_size) = minibatch.bottomRows(1); + + for (int sample_id = 1; sample_id < num_noise_samples+1; sample_id++) + for (int train_id = 0; train_id < current_minibatch_size; train_id++) + minibatch_samples(sample_id, train_id) = unigram.sample(rng); + + stop_timer(3); - // Final forward propagation step (sparse) - start_timer(4); - if (prop.skip_hidden) - prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, - minibatch_samples, scores); - else - prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, - minibatch_samples, scores); - stop_timer(4); + // Final forward propagation step (sparse) + start_timer(4); + prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, + minibatch_samples, scores); + stop_timer(4); - // Apply normalization parameters - if (myParam.normalization) - { - for (int train_id = 0;train_id < current_minibatch_size;train_id++) - { - Matrix<int,Dynamic,1> context = minibatch.block(0, train_id, ngram_size-1, 1); - scores.col(train_id).array() += c_h[context]; - } - } + // Apply normalization parameters + if (myParam.normalization) + { + for (int train_id = 0;train_id < current_minibatch_size;train_id++) + { + Matrix<int,Dynamic,1> context = minibatch.block(0, train_id, ngram_size-1, 1); + scores.col(train_id).array() += c_h[context]; + } + } - double minibatch_log_likelihood; - start_timer(5); - softmax_loss.fProp(scores.leftCols(current_minibatch_size), - minibatch_samples, - probs, 
minibatch_log_likelihood); - stop_timer(5); - log_likelihood += minibatch_log_likelihood; + double minibatch_log_likelihood; + start_timer(5); + softmax_loss.fProp(scores.leftCols(current_minibatch_size), + minibatch_samples, + probs, minibatch_log_likelihood); + stop_timer(5); + log_likelihood += minibatch_log_likelihood; + + ///// Backward propagation + + start_timer(6); + softmax_loss.bProp(probs, minibatch_weights); + stop_timer(6); + + // Update the normalization parameters + + if (myParam.normalization) + { + for (int train_id = 0;train_id < current_minibatch_size;train_id++) + { + Matrix<int,Dynamic,1> context = minibatch.block(0, train_id, ngram_size-1, 1); + c_h[context] += adjusted_learning_rate * minibatch_weights.col(train_id).sum(); + } + } - ///// Backward propagation - - start_timer(6); - softmax_loss.bProp(probs, minibatch_weights); - stop_timer(6); - - // Update the normalization parameters - - if (myParam.normalization) - { - for (int train_id = 0;train_id < current_minibatch_size;train_id++) - { - Matrix<int,Dynamic,1> context = minibatch.block(0, train_id, ngram_size-1, 1); - c_h[context] += adjusted_learning_rate * minibatch_weights.col(train_id).sum(); - } - } - - // Be careful of short minibatch - prop.bProp(minibatch.topRows(ngram_size-1), - minibatch_samples.leftCols(current_minibatch_size), - minibatch_weights.leftCols(current_minibatch_size), - adjusted_learning_rate, current_momentum, myParam.L2_reg); + // Be careful of short minibatch + prop.bProp(minibatch.topRows(ngram_size-1), + minibatch_samples.leftCols(current_minibatch_size), + minibatch_weights.leftCols(current_minibatch_size), + adjusted_learning_rate, + current_momentum, + myParam.L2_reg, + myParam.parameter_update, + myParam.conditioning_constant, + myParam.decay); } else if (loss_function == LogLoss) { - ///// Standard log-likelihood - start_timer(4); - if (prop.skip_hidden) - prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores); - else 
- prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); - stop_timer(4); - - double minibatch_log_likelihood; - start_timer(5); - SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size), - minibatch.row(ngram_size-1), - probs, - minibatch_log_likelihood); - stop_timer(5); - log_likelihood += minibatch_log_likelihood; - - ///// Backward propagation - - start_timer(6); - SoftmaxLogLoss().bProp(minibatch.row(ngram_size-1).leftCols(current_minibatch_size), - probs.leftCols(current_minibatch_size), - minibatch_weights); - stop_timer(6); - - prop.bProp(minibatch.topRows(ngram_size-1).leftCols(current_minibatch_size), - minibatch_weights, - adjusted_learning_rate, current_momentum, myParam.L2_reg); - } - } + ///// Standard log-likelihood + start_timer(4); + prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); + stop_timer(4); + + double minibatch_log_likelihood; + start_timer(5); + SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size), + minibatch.row(ngram_size-1), + probs, + minibatch_log_likelihood); + stop_timer(5); + log_likelihood += minibatch_log_likelihood; + + ///// Backward propagation + + start_timer(6); + SoftmaxLogLoss().bProp(minibatch.row(ngram_size-1).leftCols(current_minibatch_size), + probs.leftCols(current_minibatch_size), + minibatch_weights); + stop_timer(6); + + prop.bProp(minibatch.topRows(ngram_size-1).leftCols(current_minibatch_size), + minibatch_weights, + adjusted_learning_rate, + current_momentum, + myParam.L2_reg, + myParam.parameter_update, + myParam.conditioning_constant, + myParam.decay); + } + } cerr << "done." 
<< endl; if (loss_function == LogLoss) @@ -573,10 +743,7 @@ int main(int argc, char** argv) // Do full forward prop through output word embedding layer start_timer(4); - if (prop_validation.skip_hidden) - prop_validation.output_layer_node.param->fProp(prop_validation.first_hidden_activation_node.fProp_matrix, scores); - else - prop_validation.output_layer_node.param->fProp(prop_validation.second_hidden_activation_node.fProp_matrix, scores); + prop_validation.output_layer_node.param->fProp(prop_validation.second_hidden_activation_node.fProp_matrix, scores); stop_timer(4); // And softmax and loss. Be careful of short minibatch @@ -594,7 +761,7 @@ int main(int argc, char** argv) cerr << " perplexity: "<< exp(-log_likelihood/validation_data_size) << endl; // If the validation perplexity decreases, halve the learning rate. - if (epoch > 0 && log_likelihood < current_validation_ll) + if (epoch > 0 && log_likelihood < current_validation_ll && myParam.parameter_update != "ADA") { current_learning_rate /= 2; } @@ -604,3 +771,4 @@ int main(int argc, char** argv) } return 0; } + diff --git a/src/types.hpp b/src/types.hpp new file mode 100644 index 0000000..08b010f --- /dev/null +++ b/src/types.hpp @@ -0,0 +1,41 @@ +#ifndef TYPES_HPP +#define TYPES_HPP + +#include <cmath> +#include <string> +#include <vector> +#include <boost/cstdint.hpp> +#include <limits> + +namespace biglm{ + +typedef double weight_type; +const weight_type IMPOSSIBLE = -HUGE_VAL; + +typedef unsigned long block_type; +const size_t bits_per_block = (std::numeric_limits<block_type>::digits); + //typedef std::size_t size_type; +typedef boost::uint64_t size_type; +typedef unsigned char byte_type; + +template<typename T> +struct bytes { + static const byte_type *data(const T& key) { return reinterpret_cast<const byte_type *>(&key); } + static size_type size(const T& key) { return sizeof(T); } +}; + +template<> +struct bytes<std::string> { + static const byte_type *data(const std::string& key) { return 
reinterpret_cast<const byte_type *>(key.data()); } + static size_type size(const std::string& key) { return key.size(); } +}; + +template<typename U> +struct bytes<std::vector<U> > { + static const byte_type *data(const std::vector<U>& key) { return reinterpret_cast<const byte_type *>(&key[0]); } + static size_type size(const std::vector<U>& key) { return key.size() * sizeof(U); } +}; + +} //namespace nplm + +#endif diff --git a/src/util.cpp b/src/util.cpp index fe022c9..f6a5779 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -2,6 +2,8 @@ #include <fstream> #include <iomanip> #include <cmath> +#include <deque> +#include <vector> #include <boost/unordered_map.hpp> #include <boost/algorithm/string.hpp> @@ -34,6 +36,21 @@ void splitBySpace(const std::string &line, std::vector<std::string> &items) boost::split(items, copy, boost::is_any_of(" \t"), boost::token_compress_on); } +void readWeightsFile(ifstream &TRAININ, vector<float> &weights) { + string line; + while (getline(TRAININ, line) && line != "") + { + vector<string> items; + splitBySpace(line, items); + if (items.size() != 1) + { + cerr << "Error: weights file should have only one weight per line" << endl; + exit(-1); + } + weights.push_back(boost::lexical_cast<float>(items[0])); + } +} + void readWordsFile(ifstream &TRAININ, vector<string> &word_list) { string line; @@ -87,28 +104,6 @@ void writeWordsFile(const vector<string> &words, const string &filename) OUT.close(); } -void readSentFile(const string &file, vector<vector<string> > &sentences) -{ - cerr << "Reading sentences from: " << file << endl; - - ifstream TRAININ; - TRAININ.open(file.c_str()); - if (! TRAININ) - { - cerr << "Error: can't read from file " << file<< endl; - exit(-1); - } - - string line; - while (getline(TRAININ, line)) - { - vector<string> words; - splitBySpace(line, words); - sentences.push_back(words); - } - - TRAININ.close(); -} // Read a data file of unknown size into a flat vector<int>. 
// If this takes too much memory, we should create a vector of minibatches. @@ -193,8 +188,7 @@ int setup_threads(int n_threads) Eigen::initParallel(); Eigen::setNbThreads(n_threads); - #ifdef __INTEL_MKL__ - /* + #ifdef MKL_SINGLE // Set the threading layer to match the compiler. // This lets MKL automatically go single-threaded in parallel regions. #ifdef __INTEL_COMPILER @@ -202,7 +196,6 @@ int setup_threads(int n_threads) #elif defined __GNUC__ mkl_set_threading_layer(MKL_THREADING_GNU); #endif - */ mkl_set_num_threads(n_threads); #endif #endif @@ -15,7 +15,7 @@ #include <boost/chrono.hpp> #endif -#include "../3rdparty/Eigen/Dense" +#include <Eigen/Dense> #include "maybe_omp.h" @@ -43,7 +43,39 @@ void writeWordsFile(const std::vector<std::string> &words, std::ofstream &file); void writeWordsFile(const std::vector<std::string> &words, const std::string &filename); void readDataFile(const std::string &filename, int &ngram_size, std::vector<int> &data, int minibatch_size=0); void readUnigramProbs(const std::string &unigram_probs_file, std::vector<double> &unigram_probs); -void readSentFile(const std::string &file, std::vector<std::vector<std::string> > &sentences); +void readWeightsFile(std::ifstream &TRAININ, std::vector<float> &weights); +//template <typename T> readSentFile(const std::string &file, T &sentences); + + +template <typename T> +void readSentFile(const std::string &file, T &sentences) +{ + std::cerr << "Reading sentences from: " << file << std::endl; + + std::ifstream TRAININ; + TRAININ.open(file.c_str()); + if (! 
TRAININ) + { + std::cerr << "Error: can't read from file " << file<< std::endl; + exit(-1); + } + + std::string line; + while (getline(TRAININ, line)) + { + std::vector<std::string> words; + splitBySpace(line, words); + sentences.push_back(words); + } + + TRAININ.close(); +} + +inline void intgerize(std::vector<std::string> &ngram,std::vector<int> &int_ngram){ + int ngram_size = ngram.size(); + for (int i=0;i<ngram_size;i++) + int_ngram.push_back(boost::lexical_cast<int>(ngram[i])); +} // Functions that take non-const matrices as arguments // are supposed to declare them const and then use this @@ -84,6 +116,34 @@ void initMatrix(boost::random::mt19937 &engine, } template <typename Derived> +void initBias(boost::random::mt19937 &engine, + const Eigen::MatrixBase<Derived> &p_const, + bool init_normal, double range) +{ + UNCONST(Derived, p_const, p); + if (init_normal == 0) + // initialize with uniform distribution in [-range, range] + { + boost::random::uniform_real_distribution<> unif_real(-range, range); + for (int i = 0; i < p.size(); i++) + { + p(i) = unif_real(engine); + } + + } + else + // initialize with gaussian distribution with mean 0 and stdev range + { + boost::random::normal_distribution<double> unif_normal(0., range); + for (int i = 0; i < p.size(); i++) + { + p(i) = unif_normal(engine); + } + } +} + + +template <typename Derived> void readMatrix(std::ifstream &TRAININ, Eigen::MatrixBase<Derived> ¶m_const) { UNCONST(Derived, param_const, param); diff --git a/src/vocabulary.h b/src/vocabulary.h index 1f844a7..11fbfcd 100644 --- a/src/vocabulary.h +++ b/src/vocabulary.h @@ -19,6 +19,7 @@ class vocabulary { std::vector<std::string> m_words; boost::unordered_map<std::string, int> m_index; int unk; + public: vocabulary() { @@ -40,7 +41,17 @@ public: if (pos != m_index.end()) return pos->second; else - return unk; + return unk; + } + + // lookup word using custom unknown-word id + int lookup_word(const std::string &word, int unk) const + { + 
boost::unordered_map<std::string, int>::const_iterator pos = m_index.find(word); + if (pos != m_index.end()) + return pos->second; + else + return unk; } int insert_word(const std::string &word) @@ -77,8 +88,6 @@ public: } const std::vector<std::string> &words() const { return m_words; } - - const boost::unordered_map<std::string, int>& get_idmap() const { return m_index; } }; } // namespace nplm |