From cd9d683829f8670506dbc640712444ef9553fe6d Mon Sep 17 00:00:00 2001 From: graehl Date: Wed, 24 Jun 2015 19:12:39 -0700 Subject: c++11 compile --- src/Makefile | 4 ++-- src/model.cpp | 2 +- src/neuralTM.h | 2 +- src/trainNeuralNetwork.cpp | 2 +- src/util.h | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/Makefile b/src/Makefile index 1611ccb..2a27405 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,12 +1,12 @@ ### Compilation options. # C++ compiler. Tested with g++ and Intel icpc. -CXX=/usr/bin/g++ +CXX=g++ #CXX=icpc # Compiler options. Note that -DEIGEN_NO_DEBUG is essential for good performance! #CFLAGS=-g -CFLAGS=-O3 -DEIGEN_NO_DEBUG -DNDEBUG +CFLAGS=-O3 -DEIGEN_NO_DEBUG -DNDEBUG $(CXXFLAGS) # Architecture. Set to x86_64 or i686 to override. ARCH:=$(shell uname -m) diff --git a/src/model.cpp b/src/model.cpp index 3767f4b..919e005 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -44,7 +44,7 @@ void model::resize(int ngram_size, premultiplied = false; } -void model::initialize(mt19937 &init_engine, +void model::initialize(boost::random::mt19937 &init_engine, bool init_normal, double init_range, double init_bias, diff --git a/src/neuralTM.h b/src/neuralTM.h index 14bc7bf..4ad6752 100644 --- a/src/neuralTM.h +++ b/src/neuralTM.h @@ -6,7 +6,7 @@ #include #include -#include <../3rdparty/Eigen/Dense> +#include #include "util.h" #include "vocabulary.h" diff --git a/src/trainNeuralNetwork.cpp b/src/trainNeuralNetwork.cpp index 97af03b..63ee27d 100644 --- a/src/trainNeuralNetwork.cpp +++ b/src/trainNeuralNetwork.cpp @@ -316,7 +316,7 @@ int main(int argc, char** argv) //unsigned seed = std::time(0); unsigned seed = 1234; //for testing only - mt19937 rng(seed); + boost::random::mt19937 rng(seed); /////////////////////////READING IN THE TRAINING AND VALIDATION DATA/////////////////// ///////////////////////////////////////////////////////////////////////////////////// diff --git a/src/util.h b/src/util.h index a8453aa..3b5e6aa 100644 --- a/src/util.h +++ b/src/util.h @@ -271,8 +271,8 @@ extern Timer timer; #define start_timer(x) timer.start(x) #define stop_timer(x) timer.stop(x) #else -#define start_timer(x) 0 -#define stop_timer(x) 0 +#define start_timer(x) (void)0 +#define stop_timer(x) (void)0 #endif int setup_threads(int n_threads); -- cgit v1.2.3 From 50308d573b90ff2814bd346210fc6929bd9b40af Mon Sep 17 00:00:00 2001 From: graehl Date: Wed, 24 Jun 2015 19:37:11 -0700 Subject: compile warnings --- src/Activation_function.h | 2 ++ src/prepareNeuralLM.cpp | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Activation_function.h b/src/Activation_function.h index 66342bb..138f9da 100644 --- a/src/Activation_function.h +++ b/src/Activation_function.h @@ -95,6 +95,7 @@ class Activation_function case Rectifier: my_output = input.unaryExpr(rectifier_functor()); break; case Tanh: my_output = input.unaryExpr(tanh_functor()); break; case HardTanh: my_output = input.unaryExpr(hardtanh_functor()); break; + case InvalidFunction: std::abort(); } } @@ -112,6 +113,7 @@ class Activation_function case Rectifier: my_output = finput.array().unaryExpr(drectifier_functor()) * input.array(); break; case Tanh: my_output = foutput.array().unaryExpr(tanh_functor()) * input.array(); break; case HardTanh: my_output = finput.array().unaryExpr(hardtanh_functor()) * input.array(); break; + case InvalidFunction: std::abort(); } } }; diff --git a/src/prepareNeuralLM.cpp b/src/prepareNeuralLM.cpp index adedc72..a2cac7a 100644 --- a/src/prepareNeuralLM.cpp +++ 
b/src/prepareNeuralLM.cpp @@ -219,7 +219,7 @@ void writeMmapNgrams(const string &input_filename, //now to randomize the items if the randomize flag was set if (randomize == true) { unsigned seed = 1234; //for testing only - mt19937 rng(seed); + boost::random::mt19937 rng(seed); cerr<<"Randomly shuffling data..."; data_size_t counter =0; while (counter < num_tokens) { -- cgit v1.2.3 From 37e397f526fc207dea498356e890ad085a733ae8 Mon Sep 17 00:00:00 2001 From: graehl Date: Wed, 24 Jun 2015 23:22:21 -0700 Subject: fix mixed tab2/tab4/spaces indents --- src/Activation_function.h | 131 +++--- src/SoftmaxLoss.h | 159 ++++--- src/USCMatrix.h | 227 +++++----- src/graphClasses.h | 89 ++-- src/neuralClasses.h | 444 +++++++++---------- src/neuralLM.h | 189 ++++---- src/neuralNetwork.h | 319 +++++++------ src/prepareNeuralLM.cpp | 1057 ++++++++++++++++++++++---------------------- src/propagator.h | 641 ++++++++++++++------------- src/testNeuralLM.cpp | 279 ++++++------ src/trainNeuralNetwork.cpp | 227 +++++----- src/util.h | 277 ++++++------ 12 files changed, 2012 insertions(+), 2027 deletions(-) diff --git a/src/Activation_function.h b/src/Activation_function.h index 138f9da..742c2fc 100644 --- a/src/Activation_function.h +++ b/src/Activation_function.h @@ -3,7 +3,6 @@ #include #include -//#include <../3rdparty/Eigen/Dense> #include #include "util.h" @@ -19,28 +18,28 @@ enum activation_function_type { Tanh, HardTanh, Rectifier, Identity, InvalidFunc inline activation_function_type string_to_activation_function (const std::string &s) { - if (s == "identity") - return Identity; - else if (s == "rectifier") - return Rectifier; - else if (s == "tanh") - return Tanh; - else if (s == "hardtanh") - return HardTanh; - else - return InvalidFunction; + if (s == "identity") + return Identity; + else if (s == "rectifier") + return Rectifier; + else if (s == "tanh") + return Tanh; + else if (s == "hardtanh") + return HardTanh; + else + return InvalidFunction; } inline std::string activation_function_to_string (activation_function_type f) { - if (f == Identity) - return "identity"; - else if (f == Rectifier) - return "rectifier"; - else if (f == Tanh) - return "tanh"; - else if (f == HardTanh) - return "hardtanh"; + if (f == Identity) + return "identity"; + else if (f == Rectifier) + return "rectifier"; + else if (f == Tanh) + return "tanh"; + else if (f == HardTanh) + return "hardtanh"; } struct hardtanh_functor { @@ -69,53 +68,53 @@ struct drectifier_functor { class Activation_function { - int size; - activation_function_type f; - - public: - Activation_function() : size(0), f(Rectifier) { } - - void resize(int size) { this->size = size; } - void set_activation_function(activation_function_type f) { this->f = f; } - - template - void initialize(Engine &engine, bool init_normal, double init_range) { } - - int n_inputs () const { return size; } - int n_outputs () const { return size; } - - template - void fProp(const MatrixBase &input, const MatrixBase &output) const - { - UNCONST(DerivedOut, output, my_output); - - switch (f) - { - case Identity: my_output = input; break; - case Rectifier: my_output = input.unaryExpr(rectifier_functor()); break; - case Tanh: my_output = input.unaryExpr(tanh_functor()); break; - case HardTanh: my_output = input.unaryExpr(hardtanh_functor()); break; - case InvalidFunction: std::abort(); - } - } - - template - void bProp(const MatrixBase &input, - MatrixBase &output, - const MatrixBase &finput, - const MatrixBase &foutput) const - { - UNCONST(DerivedGIn, output, my_output); - - 
switch (f) - { - case Identity: my_output = input; break; - case Rectifier: my_output = finput.array().unaryExpr(drectifier_functor()) * input.array(); break; - case Tanh: my_output = foutput.array().unaryExpr(tanh_functor()) * input.array(); break; - case HardTanh: my_output = finput.array().unaryExpr(hardtanh_functor()) * input.array(); break; - case InvalidFunction: std::abort(); - } - } + int size; + activation_function_type f; + + public: + Activation_function() : size(0), f(Rectifier) { } + + void resize(int size) { this->size = size; } + void set_activation_function(activation_function_type f) { this->f = f; } + + template + void initialize(Engine &engine, bool init_normal, double init_range) { } + + int n_inputs () const { return size; } + int n_outputs () const { return size; } + + template + void fProp(const MatrixBase &input, const MatrixBase &output) const + { + UNCONST(DerivedOut, output, my_output); + + switch (f) + { + case Identity: my_output = input; break; + case Rectifier: my_output = input.unaryExpr(rectifier_functor()); break; + case Tanh: my_output = input.unaryExpr(tanh_functor()); break; + case HardTanh: my_output = input.unaryExpr(hardtanh_functor()); break; + case InvalidFunction: std::abort(); + } + } + + template + void bProp(const MatrixBase &input, + MatrixBase &output, + const MatrixBase &finput, + const MatrixBase &foutput) const + { + UNCONST(DerivedGIn, output, my_output); + + switch (f) + { + case Identity: my_output = input; break; + case Rectifier: my_output = finput.array().unaryExpr(drectifier_functor()) * input.array(); break; + case Tanh: my_output = foutput.array().unaryExpr(tanh_functor()) * input.array(); break; + case HardTanh: my_output = finput.array().unaryExpr(hardtanh_functor()) * input.array(); break; + case InvalidFunction: std::abort(); + } + } }; } // namespace nplm diff --git a/src/SoftmaxLoss.h b/src/SoftmaxLoss.h index bc55762..d89cde6 100644 --- a/src/SoftmaxLoss.h +++ b/src/SoftmaxLoss.h @@ -1,7 +1,6 @@ - #ifndef SOFTMAXLOSS_H +#ifndef SOFTMAXLOSS_H #define SOFTMAXLOSS_H -//#include <../3rdparty/Eigen/Dense> #include #include "multinomial.h" #include "util.h" @@ -20,55 +19,55 @@ enum loss_function_type { LogLoss, NCELoss, InvalidLoss }; inline loss_function_type string_to_loss_function (const std::string &s) { - if (s == "log") - return LogLoss; - else if (s == "nce") - return NCELoss; - else - return InvalidLoss; + if (s == "log") + return LogLoss; + else if (s == "nce") + return NCELoss; + else + return InvalidLoss; } inline std::string loss_function_to_string (loss_function_type f) { - if (f == LogLoss) - return "log"; - else if (f == NCELoss) - return "nce"; + if (f == LogLoss) + return "log"; + else if (f == NCELoss) + return "nce"; } /// Note: Outputs log-probabilities. 
struct SoftmaxLogLoss { - template - void fProp(const MatrixBase &input, const MatrixBase &output_words, const MatrixBase &output_const, double &loss) + template + void fProp(const MatrixBase &input, const MatrixBase &output_words, const MatrixBase &output_const, double &loss) + { + UNCONST(DerivedO, output_const, output); + + double log_likelihood = 0.0; + +#pragma omp parallel for reduction(+:log_likelihood) + for (int train_id = 0; train_id < input.cols(); train_id++) { - UNCONST(DerivedO, output_const, output); - - double log_likelihood = 0.0; - - #pragma omp parallel for reduction(+:log_likelihood) - for (int train_id = 0; train_id < input.cols(); train_id++) - { - double normalization = logsum(input.col(train_id)); - output.col(train_id).array() = input.col(train_id).array() - normalization; - log_likelihood += output(output_words(train_id), train_id); - } - loss = log_likelihood; + double normalization = logsum(input.col(train_id)); + output.col(train_id).array() = input.col(train_id).array() - normalization; + log_likelihood += output(output_words(train_id), train_id); } - - template - void bProp(const MatrixBase &output_words, const MatrixBase &output, const MatrixBase &grad_input_const) + loss = log_likelihood; + } + + template + void bProp(const MatrixBase &output_words, const MatrixBase &output, const MatrixBase &grad_input_const) + { + UNCONST(DerivedI, grad_input_const, grad_input); + grad_input.setZero(); +#pragma omp parallel for + for (int train_id = 0; train_id < output.cols(); train_id++) { - UNCONST(DerivedI, grad_input_const, grad_input); - grad_input.setZero(); - #pragma omp parallel for - for (int train_id = 0; train_id < output.cols(); train_id++) - { - grad_input(output_words(train_id), train_id) += 1.; - grad_input.col(train_id) -= output.col(train_id).array().exp().matrix(); - } + grad_input(output_words(train_id), train_id) += 1.; + grad_input.col(train_id) -= output.col(train_id).array().exp().matrix(); } + } }; ///// Softmax layer plus NCE loss function. @@ -81,55 +80,55 @@ struct SoftmaxLogLoss template class SoftmaxNCELoss { - const Multinomial &unigram; + const Multinomial &unigram; -public: - SoftmaxNCELoss(const Multinomial &unigram) + public: + SoftmaxNCELoss(const Multinomial &unigram) : unigram(unigram) + { + } + + template + void fProp(const MatrixBase &scores, + const MatrixBase &minibatch_samples, + const MatrixBase &output_const, double &loss) + { + UNCONST(DerivedO, output_const, output); + double log_likelihood = 0.0; + int num_noise_samples = minibatch_samples.rows()-1; + double log_num_noise_samples = std::log(num_noise_samples); +#pragma omp parallel for reduction(+:log_likelihood) schedule(static) + for (int train_id = 0; train_id < scores.cols(); train_id++) { + for (int sample_id = 0;sample_id < minibatch_samples.rows(); sample_id++) + { + int sample = minibatch_samples(sample_id, train_id); + // To avoid zero or infinite probabilities, + // never take exp of score without normalizing first, + // even if it's a little slower... + double score = scores(sample_id, train_id); + double score_noise = log_num_noise_samples + unigram.logprob(sample); + double z = logadd(score, score_noise); + double logprob = score - z; + double logprob_noise = score_noise - z; + output(sample_id, train_id) = std::exp(logprob); + log_likelihood += sample_id == 0 ? 
logprob : logprob_noise; + } } - - template - void fProp(const MatrixBase &scores, - const MatrixBase &minibatch_samples, - const MatrixBase &output_const, double &loss) - { - UNCONST(DerivedO, output_const, output); - double log_likelihood = 0.0; - int num_noise_samples = minibatch_samples.rows()-1; - double log_num_noise_samples = std::log(num_noise_samples); - #pragma omp parallel for reduction(+:log_likelihood) schedule(static) - for (int train_id = 0; train_id < scores.cols(); train_id++) - { - for (int sample_id = 0;sample_id < minibatch_samples.rows(); sample_id++) - { - int sample = minibatch_samples(sample_id, train_id); - // To avoid zero or infinite probabilities, - // never take exp of score without normalizing first, - // even if it's a little slower... - double score = scores(sample_id, train_id); - double score_noise = log_num_noise_samples + unigram.logprob(sample); - double z = logadd(score, score_noise); - double logprob = score - z; - double logprob_noise = score_noise - z; - output(sample_id, train_id) = std::exp(logprob); - log_likelihood += sample_id == 0 ? logprob : logprob_noise; - } - } - loss = log_likelihood; - } - - template - void bProp(const MatrixBase &probs, const MatrixBase &output_const) + loss = log_likelihood; + } + + template + void bProp(const MatrixBase &probs, const MatrixBase &output_const) + { + UNCONST(DerivedI, output_const, output); +#pragma omp parallel for schedule(static) + for (int train_id = 0; train_id < probs.cols(); train_id++) { - UNCONST(DerivedI, output_const, output); - #pragma omp parallel for schedule(static) - for (int train_id = 0; train_id < probs.cols(); train_id++) - { - output.col(train_id) = -probs.col(train_id); - output(0, train_id) += 1.0; - } + output.col(train_id) = -probs.col(train_id); + output(0, train_id) += 1.0; } + } }; } // namespace nplm diff --git a/src/USCMatrix.h b/src/USCMatrix.h index 02aeb33..784fa1b 100644 --- a/src/USCMatrix.h +++ b/src/USCMatrix.h @@ -1,7 +1,6 @@ #ifndef USCMATRIX_H #define USCMATRIX_H -//#include <../3rdparty/Eigen/Dense> #include #include "maybe_omp.h" #include "util.h" @@ -34,108 +33,108 @@ template // should be EIGEN_DEFAULT_DENSE_ class USCMatrix { -public: - Matrix indexes; - Matrix values; - int m_rows; + public: + Matrix indexes; + Matrix values; + int m_rows; - USCMatrix() : m_rows(0) { } + USCMatrix() : m_rows(0) { } - template - USCMatrix(Index rows, const MatrixBase &indexes, const MatrixBase &values) - : - indexes(indexes), - values(values), - m_rows(rows) - { } + template + USCMatrix(Index rows, const MatrixBase &indexes, const MatrixBase &values) + : + indexes(indexes), + values(values), + m_rows(rows) + { } - USCMatrix(Index rows, Index nnz, Index cols) - : - indexes(Matrix(nnz, cols)), + USCMatrix(Index rows, Index nnz, Index cols) + : + indexes(Matrix(nnz, cols)), values(Matrix(nnz, cols)), m_rows(rows) - { - this->indexes.fill(-1); - } - - Index rows() const { return m_rows; } - Index cols() const { return indexes.cols(); } - - void resize(Index rows, Index nnz, Index cols) { - indexes.resize(nnz, cols); - values.resize(nnz, cols); - m_rows = rows; - } + { + this->indexes.fill(-1); + } + + Index rows() const { return m_rows; } + Index cols() const { return indexes.cols(); } + + void resize(Index rows, Index nnz, Index cols) { + indexes.resize(nnz, cols); + values.resize(nnz, cols); + m_rows = rows; + } }; // Dense matrix - sparse matrix product // a is presumably very wide template -void uscgemm(double alpha, const MatrixBase &a, - const USCMatrix &b, - const 
MatrixBase &c_const) +void uscgemm(double alpha, const MatrixBase &a, + const USCMatrix &b, + const MatrixBase &c_const) { - UNCONST(DerivedC, c_const, c); - eigen_assert(a.rows() == c.rows()); - eigen_assert(a.cols() == b.rows()); - eigen_assert(b.cols() == c.cols()); - - #pragma omp parallel for - for (Index k=0; k= 0); - eigen_assert(j < a.cols()); - c.col(k) += alpha * a.col(j) * b.values(r,k); - } + UNCONST(DerivedC, c_const, c); + eigen_assert(a.rows() == c.rows()); + eigen_assert(a.cols() == b.rows()); + eigen_assert(b.cols() == c.cols()); + +#pragma omp parallel for + for (Index k=0; k= 0); + eigen_assert(j < a.cols()); + c.col(k) += alpha * a.col(j) * b.values(r,k); + } } // sparse matrix - dense matrix product template -void uscgemm(double alpha, - const USCMatrix &a, - const MatrixBase &b, - const MatrixBase &c_const) +void uscgemm(double alpha, + const USCMatrix &a, + const MatrixBase &b, + const MatrixBase &c_const) { - UNCONST(DerivedC, c_const, c); - eigen_assert(a.rows() == c.rows()); - eigen_assert(a.cols() == b.rows()); - eigen_assert(b.cols() == c.cols()); - - // This needs to be tuned for each system, unfortunately, - // and seems to vary a lot. A lot. - int i_blocks = omp_get_num_threads()*16; - - // Assume only one block in k direction. - // We don't need to explicitly block in the j direction. - #pragma omp parallel for - for (Index ib=0; ib= 0); - eigen_assert(i < c.rows()); - if (i % i_blocks == ib) - c.row(i) += alpha * a.values(r,j) * b.row(j); - } - - /* + UNCONST(DerivedC, c_const, c); + eigen_assert(a.rows() == c.rows()); + eigen_assert(a.cols() == b.rows()); + eigen_assert(b.cols() == c.cols()); + + // This needs to be tuned for each system, unfortunately, + // and seems to vary a lot. A lot. + int i_blocks = omp_get_num_threads()*16; + + // Assume only one block in k direction. + // We don't need to explicitly block in the j direction. +#pragma omp parallel for + for (Index ib=0; ib= 0); + eigen_assert(i < c.rows()); + if (i % i_blocks == ib) + c.row(i) += alpha * a.values(r,j) * b.row(j); + } + + /* If c.cols() is really large, then theoretically it seems like we should do: parallel for blocks in i direction - for blocks in j direction - pack block of a into smaller sparse matrix - for blocks in k direction - for k - for i (sparse) - for j - c(i,k) += a(i,j) * b(j,k) + for blocks in j direction + pack block of a into smaller sparse matrix + for blocks in k direction + for k + for i (sparse) + for j + c(i,k) += a(i,j) * b(j,k) However, the copying of blocks of a doesn't seem practical for any realistic sizes of c.cols(). 
- */ + */ } // Dense matrix - dense matrix product, but masked by a sparse matrix, @@ -147,45 +146,45 @@ void uscgemm(double alpha, template void uscgemm_masked(double alpha, - const MatrixBase &a, - const MatrixBase &b, - USCMatrix &c) + const MatrixBase &a, + const MatrixBase &b, + USCMatrix &c) { - eigen_assert(a.rows() == c.rows()); - eigen_assert(a.cols() == b.rows()); - eigen_assert(b.cols() == c.cols()); - - #pragma omp parallel for - for (Index k=0; k= 0); - eigen_assert(i < a.rows()); - c.values(r, k) += alpha * a.row(i) * b.col(k); - } + eigen_assert(a.rows() == c.rows()); + eigen_assert(a.cols() == b.rows()); + eigen_assert(b.cols() == c.cols()); + +#pragma omp parallel for + for (Index k=0; k= 0); + eigen_assert(i < a.rows()); + c.values(r, k) += alpha * a.row(i) * b.col(k); + } } // sparse matrix - dense vector product template -void uscgemv(double alpha, - const USCMatrix &a, - const MatrixBase &b, - const MatrixBase &c_const) +void uscgemv(double alpha, + const USCMatrix &a, + const MatrixBase &b, + const MatrixBase &c_const) { - UNCONST(DerivedC, c_const, c); - eigen_assert(a.rows() == c.rows()); - eigen_assert(a.cols() == b.rows()); - eigen_assert(b.cols() == 1 && c.cols() == 1); - - for (Index j=0; j= 0); - eigen_assert(i < c.rows()); - c(i) += alpha * a.values(r,j) * b(j); - } + UNCONST(DerivedC, c_const, c); + eigen_assert(a.rows() == c.rows()); + eigen_assert(a.cols() == b.rows()); + eigen_assert(b.cols() == 1 && c.cols() == 1); + + for (Index j=0; j= 0); + eigen_assert(i < c.rows()); + c(i) += alpha * a.values(r,j) * b(j); + } } } diff --git a/src/graphClasses.h b/src/graphClasses.h index d3c0c4a..cd80a4c 100644 --- a/src/graphClasses.h +++ b/src/graphClasses.h @@ -3,7 +3,6 @@ #include #include "neuralClasses.h" -//#include <../3rdparty/Eigen/Dense> #include namespace nplm @@ -11,50 +10,50 @@ namespace nplm template class Node { - public: - X * param; //what parameter is this - //vector children; - //vector parents; - Eigen::Matrix fProp_matrix; - Eigen::Matrix bProp_matrix; - int minibatch_size; - - public: - Node() : param(NULL), minibatch_size(0) { } - - Node(X *input_param, int minibatch_size) - : param(input_param), - minibatch_size(minibatch_size) - { - resize(minibatch_size); - } - - void resize(int minibatch_size) - { - this->minibatch_size = minibatch_size; - if (param->n_outputs() != -1) - { - fProp_matrix.setZero(param->n_outputs(), minibatch_size); - } - if (param->n_inputs() != -1) - { - bProp_matrix.setZero(param->n_inputs(), minibatch_size); - } - } - - void resize() { resize(minibatch_size); } - - /* - void Fprop(Matrix & input,int n_cols) - { - param->fProp(input,fProp_matrix,0,0,n_cols); - } - void Fprop(Matrix & input,int n_cols) - { - param->fProp(input,fProp_matrix,0,0,n_cols); - } - */ - //for f prop, just call the fProp node of the particular parameter. 
+ public: + X * param; //what parameter is this + //vector children; + //vector parents; + Eigen::Matrix fProp_matrix; + Eigen::Matrix bProp_matrix; + int minibatch_size; + + public: + Node() : param(NULL), minibatch_size(0) { } + + Node(X *input_param, int minibatch_size) + : param(input_param), + minibatch_size(minibatch_size) + { + resize(minibatch_size); + } + + void resize(int minibatch_size) + { + this->minibatch_size = minibatch_size; + if (param->n_outputs() != -1) + { + fProp_matrix.setZero(param->n_outputs(), minibatch_size); + } + if (param->n_inputs() != -1) + { + bProp_matrix.setZero(param->n_inputs(), minibatch_size); + } + } + + void resize() { resize(minibatch_size); } + + /* + void Fprop(Matrix & input,int n_cols) + { + param->fProp(input,fProp_matrix,0,0,n_cols); + } + void Fprop(Matrix & input,int n_cols) + { + param->fProp(input,fProp_matrix,0,0,n_cols); + } + */ + //for f prop, just call the fProp node of the particular parameter. }; diff --git a/src/neuralClasses.h b/src/neuralClasses.h index 26dae06..ee7c3f0 100644 --- a/src/neuralClasses.h +++ b/src/neuralClasses.h @@ -6,8 +6,7 @@ #include #include -#include -//#include <../3rdparty/Eigen/Dense> +#include #include #include "maybe_omp.h" @@ -35,7 +34,7 @@ using Eigen::Dynamic; typedef boost::unordered_map int_map; struct Clipper{ - double operator() (double x) const { + double operator() (double x) const { return std::min(0.5, std::max(x,-0.5)); //return(x); } @@ -44,7 +43,7 @@ struct Clipper{ class Linear_layer { - private: + private: Matrix U; Matrix U_gradient; Matrix U_velocity; @@ -60,12 +59,12 @@ class Linear_layer friend class model; public: - Linear_layer() { } + Linear_layer() { } Linear_layer(int rows, int cols) { resize(rows, cols); } - void resize(int rows, int cols) - { - U.setZero(rows, cols); + void resize(int rows, int cols) + { + U.setZero(rows, cols); U_gradient.setZero(rows, cols); //U_running_gradient.setZero(rows, cols); //U_running_parameter_updates.setZero(rows, cols); @@ -74,21 +73,21 @@ class Linear_layer b_gradient.setZero(rows); //b_running_gradient.resize(rows); //b_velocity.resize(rows); - } + } - void read_weights(std::ifstream &U_file) { readMatrix(U_file, U); } - void write_weights(std::ofstream &U_file) { writeMatrix(U, U_file); } + void read_weights(std::ifstream &U_file) { readMatrix(U_file, U); } + void write_weights(std::ofstream &U_file) { writeMatrix(U, U_file); } void read_biases(std::ifstream &b_file) { readMatrix(b_file, b); } void write_biases(std::ofstream &b_file) { writeMatrix(b, b_file); } - template - void initialize(Engine &engine, + template + void initialize(Engine &engine, bool init_normal, double init_range, string ¶meter_update, double adagrad_epsilon) - { + { if (parameter_update == "ADA") { U_running_gradient = Matrix::Ones(U.rows(),U.cols())*adagrad_epsilon; b_running_gradient = Matrix::Ones(b.size())*adagrad_epsilon; @@ -100,58 +99,58 @@ class Linear_layer b_running_parameter_update.setZero(b.size()); } - initMatrix(engine, U, init_normal, init_range); + initMatrix(engine, U, init_normal, init_range); initBias(engine, b, init_normal, init_range); - } + } - int n_inputs () const { return U.cols(); } - int n_outputs () const { return U.rows(); } + int n_inputs () const { return U.cols(); } + int n_outputs () const { return U.rows(); } template - void fProp(const MatrixBase &input, + void fProp(const MatrixBase &input, const MatrixBase &output) const { UNCONST(DerivedOut, output, my_output); my_output.leftCols(input.cols()).noalias() = U*input; int num_examples = 
input.cols(); - for (int example = 0;example < num_examples;example++) + for (int example = 0;example < num_examples;example++) { my_output.leftCols(input.cols()).col(example) += b; } } - // Sparse input + // Sparse input template - void fProp(const USCMatrix &input, + void fProp(const USCMatrix &input, const MatrixBase &output_const) const { - UNCONST(DerivedOut, output_const, output); - output.setZero(); - uscgemm(1.0, U, input, output.leftCols(input.cols())); - // Each column corresponds to a training example. We + UNCONST(DerivedOut, output_const, output); + output.setZero(); + uscgemm(1.0, U, input, output.leftCols(input.cols())); + // Each column corresponds to a training example. We // parallelize the adding of biases per dimension. int num_examples = input.cols(); - for (int example = 0;example < num_examples;example++) + for (int example = 0;example < num_examples;example++) { output.leftCols(input.cols()).col(example) += b; } } template - void bProp(const MatrixBase &input, + void bProp(const MatrixBase &input, MatrixBase &output) const { - UNCONST(DerivedGIn, output, my_output); - my_output.noalias() = U.transpose()*input; - } + UNCONST(DerivedGIn, output, my_output); + my_output.noalias() = U.transpose()*input; + } template - void computeGradient( const MatrixBase &bProp_input, - const MatrixBase &fProp_input, + void computeGradient( const MatrixBase &bProp_input, + const MatrixBase &fProp_input, double learning_rate, double momentum, double L2_reg) { U_gradient.noalias() = bProp_input*fProp_input.transpose(); - + // get the bias gradient for all dimensions in parallel int size = b.size(); b_gradient = bProp_input.rowwise().sum(); @@ -172,7 +171,7 @@ class Linear_layer { U += learning_rate * U_gradient; b += learning_rate * b_gradient; - /* + /* //UPDATE CLIPPING U += (learning_rate*U_gradient).array().unaryExpr(Clipper()).matrix(); b += (learning_rate*b_gradient).array().unaryExpr(Clipper()).matrix(); @@ -181,17 +180,17 @@ class Linear_layer //b += learning_rate*(b_gradient.array().unaryExpr(Clipper())).matrix(); */ } - } + } template - void computeGradientAdagrad(const MatrixBase &bProp_input, - const MatrixBase &fProp_input, + void computeGradientAdagrad(const MatrixBase &bProp_input, + const MatrixBase &fProp_input, double learning_rate, double L2_reg) { U_gradient.noalias() = bProp_input*fProp_input.transpose(); - + // get the bias gradient for all dimensions in parallel int size = b.size(); b_gradient.noalias() = bProp_input.rowwise().sum(); @@ -206,7 +205,7 @@ class Linear_layer #pragma omp parallel for for (int col=0; col - void computeGradientAdadelta(const MatrixBase &bProp_input, - const MatrixBase &fProp_input, + void computeGradientAdadelta(const MatrixBase &bProp_input, + const MatrixBase &fProp_input, double learning_rate, double L2_reg, double conditioning_constant, @@ -234,7 +233,7 @@ class Linear_layer U_gradient.noalias() = bProp_input*fProp_input.transpose(); Array b_current_parameter_update; - + // get the bias gradient for all dimensions in parallel int size = b.size(); b_gradient.noalias() = bProp_input.rowwise().sum(); @@ -250,7 +249,7 @@ class Linear_layer //cerr<<"U gradient is "< U_current_parameter_update; - U_running_gradient.col(col) = decay*U_running_gradient.col(col) + + U_running_gradient.col(col) = decay*U_running_gradient.col(col) + (1-decay)*U_gradient.col(col).array().square().matrix(); //cerr<<"U running gradient is "< - void computeGradientCheck(const MatrixBase &bProp_input, - const MatrixBase &fProp_input, + void 
computeGradientCheck(const MatrixBase &bProp_input, + const MatrixBase &fProp_input, const MatrixBase &gradient) const { UNCONST(DerivedGW, gradient, my_gradient); @@ -355,17 +354,17 @@ class Output_word_embeddings template void fProp(const MatrixBase &input, const MatrixBase &output) const - { + { UNCONST(DerivedOut, output, my_output); my_output = ((*W) * input).colwise() + b; - } + } - // Sparse output version + // Sparse output version template void fProp(const MatrixBase &input, const MatrixBase &samples, const MatrixBase &output) const - { + { UNCONST(DerivedOutV, output, my_output); #pragma omp parallel for for (int instance_id = 0; instance_id < samples.cols(); instance_id++) @@ -378,13 +377,13 @@ class Output_word_embeddings USCMatrix sparse_output(W->rows(), samples, my_output); uscgemm_masked(1.0, *W, input, sparse_output); my_output = sparse_output.values; // too bad, so much copying - } + } // Return single element of output matrix template - double fProp(const MatrixBase &input, + double fProp(const MatrixBase &input, int word, - int instance) const + int instance) const { return W->row(word).dot(input.col(instance)) + b(word); } @@ -395,19 +394,19 @@ class Output_word_embeddings void bProp(const MatrixBase &input_bProp_matrix, const MatrixBase &bProp_matrix) const { - // W is vocab_size x output_embedding_dimension - // input_bProp_matrix is vocab_size x minibatch_size - // bProp_matrix is output_embedding_dimension x minibatch_size - UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix); - my_bProp_matrix.leftCols(input_bProp_matrix.cols()).noalias() = + // W is vocab_size x output_embedding_dimension + // input_bProp_matrix is vocab_size x minibatch_size + // bProp_matrix is output_embedding_dimension x minibatch_size + UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix); + my_bProp_matrix.leftCols(input_bProp_matrix.cols()).noalias() = W->transpose() * input_bProp_matrix; - } + } template void computeGradient(const MatrixBase &predicted_embeddings, const MatrixBase &bProp_input, double learning_rate, - double momentum) //not sure if we want to use momentum here + double momentum) //not sure if we want to use momentum here { // W is vocab_size x output_embedding_dimension // b is vocab_size x 1 @@ -418,15 +417,15 @@ class Output_word_embeddings /* //GRADIENT CLIPPING - W->noalias() += learning_rate * + W->noalias() += learning_rate * ((bProp_input * predicted_embeddings.transpose()).array().unaryExpr(Clipper())).matrix(); b += learning_rate * (bProp_input.rowwise().sum().array().unaryExpr(Clipper())).matrix(); //UPDATE CLIPPING - W->noalias() += (learning_rate * + W->noalias() += (learning_rate * (bProp_input * predicted_embeddings.transpose())).array().unaryExpr(Clipper()).matrix(); b += (learning_rate * (bProp_input.rowwise().sum())).array().unaryExpr(Clipper()).matrix(); */ - } + } template void computeGradientAdagrad( @@ -451,7 +450,7 @@ class Output_word_embeddings *W += (learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); */ - } + } template void computeGradientAdadelta(const MatrixBase &predicted_embeddings, @@ -480,14 +479,14 @@ class Output_word_embeddings b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/ (b_running_gradient.array()+conditioning_constant).sqrt())* b_gradient.array(); - W_running_parameter_update = decay*W_running_parameter_update + + 
W_running_parameter_update = decay*W_running_parameter_update + (1.-decay)*W_current_parameter_update.square().matrix(); b_running_parameter_update = decay*b_running_parameter_update + (1.-decay)*b_current_parameter_update.square().matrix(); *W += learning_rate*W_current_parameter_update.matrix(); b += learning_rate*b_current_parameter_update.matrix(); - } + } // Sparse versions @@ -499,46 +498,46 @@ class Output_word_embeddings UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix); my_bProp_matrix.setZero(); uscgemm(1.0, - W->transpose(), + W->transpose(), USCMatrix(W->rows(), samples, weights), my_bProp_matrix.leftCols(samples.cols())); // narrow bProp_matrix for possible short minibatch } - template + template void computeGradient(const MatrixBase &predicted_embeddings, - const MatrixBase &samples, - const MatrixBase &weights, - double learning_rate, double momentum) //not sure if we want to use momentum here - { + const MatrixBase &samples, + const MatrixBase &weights, + double learning_rate, double momentum) //not sure if we want to use momentum here + { //cerr<<"in gradient"< gradient_output(W->rows(), samples, weights); - uscgemm(learning_rate, + USCMatrix gradient_output(W->rows(), samples, weights); + uscgemm(learning_rate, gradient_output, predicted_embeddings.leftCols(gradient_output.cols()).transpose(), *W); // narrow predicted_embeddings for possible short minibatch - uscgemv(learning_rate, + uscgemv(learning_rate, gradient_output, - Matrix::Ones(gradient_output.cols()), + Matrix::Ones(gradient_output.cols()), b); /* //IN ORDER TO IMPLEMENT CLIPPING, WE HAVE TO COMPUTE THE GRADIENT //FIRST - USCMatrix gradient_output(W->rows(), samples, weights); - uscgemm(1.0, + USCMatrix gradient_output(W->rows(), samples, weights); + uscgemm(1.0, gradient_output, predicted_embeddings.leftCols(samples.cols()).transpose(), W_gradient); - uscgemv(1.0, + uscgemv(1.0, gradient_output, - Matrix::Ones(weights.cols()), + Matrix::Ones(weights.cols()), b_gradient); int_map update_map; //stores all the parameters that have been updated for (int sample_id=0; sample_id update_items; for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) update_items.push_back(it->first); @@ -560,33 +559,33 @@ class Output_word_embeddings } */ //cerr<<"Finished gradient"< + template void computeGradientAdagrad(const MatrixBase &predicted_embeddings, - const MatrixBase &samples, - const MatrixBase &weights, - double learning_rate) //not sure if we want to use momentum here + const MatrixBase &samples, + const MatrixBase &weights, + double learning_rate) //not sure if we want to use momentum here { - //W_gradient.setZero(W->rows(), W->cols()); - //b_gradient.setZero(b.size()); + //W_gradient.setZero(W->rows(), W->cols()); + //b_gradient.setZero(b.size()); //FOR CLIPPING, WE DO NOT MULTIPLY THE GRADIENT WITH THE LEARNING RATE - USCMatrix gradient_output(W->rows(), samples, weights); - uscgemm(1.0, + USCMatrix gradient_output(W->rows(), samples, weights); + uscgemm(1.0, gradient_output, predicted_embeddings.leftCols(samples.cols()).transpose(), W_gradient); - uscgemv(1.0, + uscgemv(1.0, gradient_output, - Matrix::Ones(weights.cols()), + Matrix::Ones(weights.cols()), b_gradient); int_map update_map; //stores all the parameters that have been updated for (int sample_id=0; sample_id update_items; for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) update_items.push_back(it->first); @@ -611,34 +610,34 @@ class Output_word_embeddings } } - template + template void 
computeGradientAdadelta(const MatrixBase &predicted_embeddings, - const MatrixBase &samples, - const MatrixBase &weights, - double learning_rate, + const MatrixBase &samples, + const MatrixBase &weights, + double learning_rate, double conditioning_constant, double decay) //not sure if we want to use momentum here { //cerr<<"decay is "<rows(), W->cols()); - //b_gradient.setZero(b.size()); + //W_gradient.setZero(W->rows(), W->cols()); + //b_gradient.setZero(b.size()); - USCMatrix gradient_output(W->rows(), samples, weights); - uscgemm(1.0, + USCMatrix gradient_output(W->rows(), samples, weights); + uscgemm(1.0, gradient_output, predicted_embeddings.leftCols(samples.cols()).transpose(), W_gradient); - uscgemv(1.0, + uscgemv(1.0, gradient_output, - Matrix::Ones(weights.cols()), + Matrix::Ones(weights.cols()), b_gradient); int_map update_map; //stores all the parameters that have been updated for (int sample_id=0; sample_id update_items; for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) update_items.push_back(it->first); @@ -685,24 +684,24 @@ class Output_word_embeddings } - template + template void computeGradientCheck(const MatrixBase &predicted_embeddings, const MatrixBase &samples, const MatrixBase &weights, const MatrixBase &gradient_W, const MatrixBase &gradient_b) const { - UNCONST(DerivedGW, gradient_W, my_gradient_W); - UNCONST(DerivedGb, gradient_b, my_gradient_b); - my_gradient_W.setZero(); - my_gradient_b.setZero(); - USCMatrix gradient_output(W->rows(), samples, weights); - uscgemm(1.0, + UNCONST(DerivedGW, gradient_W, my_gradient_W); + UNCONST(DerivedGb, gradient_b, my_gradient_b); + my_gradient_W.setZero(); + my_gradient_b.setZero(); + USCMatrix gradient_output(W->rows(), samples, weights); + uscgemm(1.0, gradient_output, predicted_embeddings.leftCols(samples.cols()).transpose(), my_gradient_W); - uscgemv(1.0, gradient_output, - Matrix::Ones(weights.cols()), my_gradient_b); + uscgemv(1.0, gradient_output, + Matrix::Ones(weights.cols()), my_gradient_b); } }; @@ -715,12 +714,12 @@ class Input_word_embeddings Matrix W_running_parameter_update; Matrix W_gradient; - friend class model; + friend class model; public: Input_word_embeddings() : context_size(0), vocab_size(0) { } Input_word_embeddings(int rows, int cols, int context) { resize(rows, cols, context); } - + void set_W(Matrix *input_W) { W = input_W; } @@ -747,7 +746,7 @@ class Input_word_embeddings if (parameter_update == "ADA") { W_running_gradient = Matrix::Ones(W->rows(),W->cols())*adagrad_epsilon; //W_gradient.setZero(W->rows(),W->cols()); - } + } if (parameter_update == "ADAD") { W_running_gradient.setZero(W->rows(),W->cols()); //W_gradient.setZero(W->rows(),W->cols()); @@ -759,59 +758,59 @@ class Input_word_embeddings init_range); } - int n_inputs() const { return -1; } - int n_outputs() const { return W->cols() * context_size; } + int n_inputs() const { return -1; } + int n_outputs() const { return W->cols() * context_size; } - // set output_id's embedding to the weighted average of all embeddings - template - void average(const Dist &dist, int output_id) - { - W->row(output_id).setZero(); - for (int i=0; i < W->rows(); i++) - if (i != output_id) - W->row(output_id) += dist.prob(i) * W->row(i); - } + // set output_id's embedding to the weighted average of all embeddings + template + void average(const Dist &dist, int output_id) + { + W->row(output_id).setZero(); + for (int i=0; i < W->rows(); i++) + if (i != output_id) + W->row(output_id) += dist.prob(i) * W->row(i); + } - template + template 
void fProp(const MatrixBase &input, - const MatrixBase &output) const + const MatrixBase &output) const { int embedding_dimension = W->cols(); - // W is vocab_size x embedding_dimension - // input is ngram_size*vocab_size x minibatch_size - // output is ngram_size*embedding_dimension x minibatch_size - - /* - // Dense version: - for (int ngram=0; ngramtranspose(), + // W is vocab_size x embedding_dimension + // input is ngram_size*vocab_size x minibatch_size + // output is ngram_size*embedding_dimension x minibatch_size + + /* + // Dense version: + for (int ngram=0; ngramtranspose(), USCMatrix(W->rows(),input.middleRows(ngram, 1),Matrix::Ones(input.cols())), my_output.block(ngram*embedding_dimension, 0, embedding_dimension, input.cols())); - } + } } - // When model is premultiplied, this layer doesn't get used, - // but this method is used to get the input into a sparse matrix. - // Hopefully this can get eliminated someday - template - void munge(const MatrixBase &input, USCMatrix &output) const - { - output.resize(vocab_size*context_size, context_size, input.cols()); - for (int i=0; i < context_size; i++) - output.indexes.row(i).array() = input.row(i).array() + i*vocab_size; - output.values.fill(1.0); - } + // When model is premultiplied, this layer doesn't get used, + // but this method is used to get the input into a sparse matrix. + // Hopefully this can get eliminated someday + template + void munge(const MatrixBase &input, USCMatrix &output) const + { + output.resize(vocab_size*context_size, context_size, input.cols()); + for (int i=0; i < context_size; i++) + output.indexes.row(i).array() = input.row(i).array() + i*vocab_size; + output.values.fill(1.0); + } template void computeGradient(const MatrixBase &bProp_input, @@ -820,45 +819,45 @@ class Input_word_embeddings { int embedding_dimension = W->cols(); - // W is vocab_size x embedding_dimension - // input is ngram_size*vocab_size x minibatch_size - // bProp_input is ngram_size*embedding_dimension x minibatch_size + // W is vocab_size x embedding_dimension + // input is ngram_size*vocab_size x minibatch_size + // bProp_input is ngram_size*embedding_dimension x minibatch_size - /* - // Dense version: - for (int ngram=0; ngram(W->rows(), input_words.middleRows(ngram, 1), Matrix::Ones(input_words.cols())), - bProp_input.block(ngram*embedding_dimension,0,embedding_dimension,input_words.cols()).transpose(), - *W); - } + for (int ngram=0; ngram(W->rows(), input_words.middleRows(ngram, 1), Matrix::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension,0,embedding_dimension,input_words.cols()).transpose(), + *W); + } /* //IF WE WANT TO DO GRADIENT CLIPPING, THEN WE FIRST COMPUTE THE GRADIENT AND THEN //PERFORM CLIPPING WHILE UPDATING - for (int ngram=0; ngram(W->rows(),input_words.middleRows(ngram, 1),Matrix::Ones(input_words.cols())), bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), W_gradient); - } + } int_map update_map; //stores all the parameters that have been updated - for (int ngram=0; ngram update_items; for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) { @@ -884,33 +883,33 @@ class Input_word_embeddings template void computeGradientAdagrad(const MatrixBase &bProp_input, - const MatrixBase &input_words, - double learning_rate, + const MatrixBase &input_words, + double learning_rate, double L2_reg) { int embedding_dimension = W->cols(); - //W_gradient.setZero(W->rows(), W->cols()); + //W_gradient.setZero(W->rows(), W->cols()); /* if 
(W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols()) W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon; */ - for (int ngram=0; ngram(W->rows(),input_words.middleRows(ngram, 1),Matrix::Ones(input_words.cols())), - bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), + for (int ngram=0; ngram(W->rows(),input_words.middleRows(ngram, 1),Matrix::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), W_gradient); - } + } int_map update_map; //stores all the parameters that have been updated - for (int ngram=0; ngram update_items; for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) { @@ -923,11 +922,11 @@ class Input_word_embeddings { int update_item = update_items[item_id]; W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix(); - W->row(update_item) += learning_rate * + W->row(update_item) += learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix(); /* //UPDATE CLIPPING - W->row(update_item) += (learning_rate * + W->row(update_item) += (learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt())) .unaryExpr(Clipper()).matrix(); */ @@ -937,36 +936,36 @@ class Input_word_embeddings template void computeGradientAdadelta(const MatrixBase &bProp_input, - const MatrixBase &input_words, - double learning_rate, + const MatrixBase &input_words, + double learning_rate, double L2_reg, double conditioning_constant, double decay) { int embedding_dimension = W->cols(); - //W_gradient.setZero(W->rows(), W->cols()); + //W_gradient.setZero(W->rows(), W->cols()); /* if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols()) W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon; */ - for (int ngram=0; ngram(W->rows(),input_words.middleRows(ngram, 1),Matrix::Ones(input_words.cols())), - bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), + for (int ngram=0; ngram(W->rows(),input_words.middleRows(ngram, 1),Matrix::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), W_gradient); - } + } int_map update_map; //stores all the parameters that have been updated - for (int ngram=0; ngram update_items; for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) { @@ -1006,16 +1005,15 @@ class Input_word_embeddings int x, int minibatch_size, const MatrixBase &gradient) const //not sure if we want to use momentum here { - UNCONST(DerivedGW, gradient, my_gradient); + UNCONST(DerivedGW, gradient, my_gradient); int embedding_dimension = W->cols(); - my_gradient.setZero(); - for (int ngram=0; ngram(W->rows(),input_words.middleRows(ngram, 1),Matrix::Ones(input_words.cols())), - bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), + my_gradient.setZero(); + for (int ngram=0; ngram(W->rows(),input_words.middleRows(ngram, 1),Matrix::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), my_gradient); } }; } // namespace nplm - diff --git a/src/neuralLM.h b/src/neuralLM.h index 2004596..c18485f 100644 --- a/src/neuralLM.h +++ b/src/neuralLM.h @@ -6,7 +6,6 @@ #include #include -//#include 
<../3rdparty/Eigen/Dense> #include #include "util.h" @@ -16,109 +15,109 @@ /* To do: - move digit mapping into vocabulary.h - */ +*/ namespace nplm { class neuralLM : public neuralNetwork { - char map_digits; - boost::shared_ptr vocab; - int start, null; + char map_digits; + boost::shared_ptr vocab; + int start, null; -public: - neuralLM() + public: + neuralLM() : neuralNetwork(), vocab(new vocabulary()), - map_digits(0) - { - } + map_digits(0) + { + } - void set_map_digits(char value) { map_digits = value; } + void set_map_digits(char value) { map_digits = value; } - void set_vocabulary(const vocabulary &vocab) - { - *(this->vocab) = vocab; - start = vocab.lookup_word(""); - null = vocab.lookup_word(""); - } + void set_vocabulary(const vocabulary &vocab) + { + *(this->vocab) = vocab; + start = vocab.lookup_word(""); + null = vocab.lookup_word(""); + } - const vocabulary &get_vocabulary() const { return *(this->vocab); } + const vocabulary &get_vocabulary() const { return *(this->vocab); } - int lookup_word(const std::string &word) const - { - if (map_digits) - for (int i=0; ilookup_word(mapped_word); - } - return vocab->lookup_word(word); - } + int lookup_word(const std::string &word) const + { + if (map_digits) + for (int i=0; ilookup_word(mapped_word); + } + return vocab->lookup_word(word); + } - double lookup_ngram(const int *ngram_a, int n) + double lookup_ngram(const int *ngram_a, int n) + { + Eigen::Matrix ngram(m->ngram_size); + for (int i=0; ingram_size; i++) { - Eigen::Matrix ngram(m->ngram_size); - for (int i=0; ingram_size; i++) - { - if (i-m->ngram_size+n < 0) - { - if (ngram_a[0] == start) - ngram(i) = start; - else - ngram(i) = null; - } - else - { - ngram(i) = ngram_a[i-m->ngram_size+n]; - } - } - return neuralNetwork::lookup_ngram(ngram); + if (i-m->ngram_size+n < 0) + { + if (ngram_a[0] == start) + ngram(i) = start; + else + ngram(i) = null; + } + else + { + ngram(i) = ngram_a[i-m->ngram_size+n]; + } } + return neuralNetwork::lookup_ngram(ngram); + } - double lookup_ngram(const std::vector &ngram_v) - { - return lookup_ngram(ngram_v.data(), ngram_v.size()); - } + double lookup_ngram(const std::vector &ngram_v) + { + return lookup_ngram(ngram_v.data(), ngram_v.size()); + } - template - double lookup_ngram(const Eigen::MatrixBase &ngram) - { - return neuralNetwork::lookup_ngram(ngram); - } - - template - void lookup_ngram(const Eigen::MatrixBase &ngram, const Eigen::MatrixBase &log_probs_const) - { - return neuralNetwork::lookup_ngram(ngram, log_probs_const); - } + template + double lookup_ngram(const Eigen::MatrixBase &ngram) + { + return neuralNetwork::lookup_ngram(ngram); + } - void read(const std::string &filename) - { - std::vector words; - m->read(filename, words); - set_vocabulary(vocabulary(words)); - resize(); - // this is faster but takes more memory - //m->premultiply(); - } + template + void lookup_ngram(const Eigen::MatrixBase &ngram, const Eigen::MatrixBase &log_probs_const) + { + return neuralNetwork::lookup_ngram(ngram, log_probs_const); + } + + void read(const std::string &filename) + { + std::vector words; + m->read(filename, words); + set_vocabulary(vocabulary(words)); + resize(); + // this is faster but takes more memory + //m->premultiply(); + } }; template void addStartStop(std::vector &input, std::vector &output, int ngram_size, const T &start, const T &stop) { - output.clear(); - output.resize(input.size()+ngram_size); - for (int i=0; i @@ -127,21 +126,21 @@ void makeNgrams(const std::vector &input, std::vector > &outpu output.clear(); for (int 
j=ngram_size-1; j ngram(input.begin() + (j-ngram_size+1), input.begin() + j+1); - output.push_back(ngram); + std::vector ngram(input.begin() + (j-ngram_size+1), input.begin() + j+1); + output.push_back(ngram); } } -inline void preprocessWords(const std::vector &words, - std::vector< std::vector > &ngrams, - int ngram_size, - const vocabulary &vocab, - bool numberize, - bool add_start_stop, - bool ngramize) { +inline void preprocessWords(const std::vector &words, + std::vector< std::vector > &ngrams, + int ngram_size, + const vocabulary &vocab, + bool numberize, + bool add_start_stop, + bool ngramize) { int start = vocab.lookup_word(""); int stop = vocab.lookup_word(""); - + // convert words to ints std::vector nums; if (numberize) { @@ -152,9 +151,9 @@ inline void preprocessWords(const std::vector &words, else { for (int j=0; j(words[j])); - } + } } - + // convert sequence to n-grams ngrams.clear(); if (ngramize) { @@ -168,10 +167,10 @@ inline void preprocessWords(const std::vector &words, } else { if (nums.size() != ngram_size) - { - std::cerr << "error: wrong number of fields in line" << std::endl; - std::exit(1); - } + { + std::cerr << "error: wrong number of fields in line" << std::endl; + std::exit(1); + } ngrams.push_back(nums); } } diff --git a/src/neuralNetwork.h b/src/neuralNetwork.h index ef96488..6386a0f 100644 --- a/src/neuralNetwork.h +++ b/src/neuralNetwork.h @@ -3,7 +3,6 @@ #include #include -//#include <../3rdparty/Eigen/Dense> #include #include "util.h" @@ -16,191 +15,191 @@ namespace nplm class neuralNetwork { -protected: - boost::shared_ptr m; + protected: + boost::shared_ptr m; -private: - bool normalization; - double weight; + private: + bool normalization; + double weight; - propagator prop; + propagator prop; - std::size_t cache_size; - Eigen::Matrix cache_keys; - std::vector cache_values; - int cache_lookups, cache_hits; + std::size_t cache_size; + Eigen::Matrix cache_keys; + std::vector cache_values; + int cache_lookups, cache_hits; -public: - neuralNetwork() + public: + neuralNetwork() : m(new model()), normalization(false), - weight(1.), - prop(*m, 1), + weight(1.), + prop(*m, 1), cache_size(0) - { - } + { + } - void set_normalization(bool value) { normalization = value; } - void set_log_base(double value) { weight = 1./std::log(value); } - - // This must be called if the underlying model is resized. - void resize() { - if (cache_size) - { - cache_keys.resize(m->ngram_size, cache_size); - cache_keys.fill(-1); - } - prop.resize(); - } + void set_normalization(bool value) { normalization = value; } + void set_log_base(double value) { weight = 1./std::log(value); } - void set_width(int width) + // This must be called if the underlying model is resized. 
+ void resize() { + if (cache_size) { - prop.resize(width); + cache_keys.resize(m->ngram_size, cache_size); + cache_keys.fill(-1); } - - template - double lookup_ngram(const Eigen::MatrixBase &ngram) + prop.resize(); + } + + void set_width(int width) + { + prop.resize(width); + } + + template + double lookup_ngram(const Eigen::MatrixBase &ngram) + { + assert (ngram.rows() == m->ngram_size); + assert (ngram.cols() == 1); + + std::size_t hash; + if (cache_size) { - assert (ngram.rows() == m->ngram_size); - assert (ngram.cols() == 1); - - std::size_t hash; - if (cache_size) - { - // First look in cache - hash = Eigen::hash_value(ngram) % cache_size; // defined in util.h - cache_lookups++; - if (cache_keys.col(hash) == ngram) - { - cache_hits++; - return cache_values[hash]; - } - } - - // Make sure that we're single threaded. Multithreading doesn't help, - // and in some cases can hurt quite a lot - int save_threads = omp_get_max_threads(); - omp_set_num_threads(1); - int save_eigen_threads = Eigen::nbThreads(); - Eigen::setNbThreads(1); - #ifdef __INTEL_MKL__ - int save_mkl_threads = mkl_get_max_threads(); - mkl_set_num_threads(1); - #endif - - prop.fProp(ngram.col(0)); - - int output = ngram(m->ngram_size-1, 0); - double log_prob; - - start_timer(3); - if (normalization) - { - Eigen::Matrix scores(m->output_vocab_size); - if (prop.skip_hidden) - prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores); - else - prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); - double logz = logsum(scores.col(0)); - log_prob = weight * (scores(output, 0) - logz); - } - else - { - if (prop.skip_hidden) - log_prob = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, 0); - else - log_prob = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, 0); - } - stop_timer(3); - - if (cache_size) - { - // Update cache - cache_keys.col(hash) = ngram; - cache_values[hash] = log_prob; - } - - #ifdef __INTEL_MKL__ - mkl_set_num_threads(save_mkl_threads); - #endif - Eigen::setNbThreads(save_eigen_threads); - omp_set_num_threads(save_threads); - - return log_prob; + // First look in cache + hash = Eigen::hash_value(ngram) % cache_size; // defined in util.h + cache_lookups++; + if (cache_keys.col(hash) == ngram) + { + cache_hits++; + return cache_values[hash]; + } } - // Look up many n-grams in parallel. 
- template - void lookup_ngram(const Eigen::MatrixBase &ngram, const Eigen::MatrixBase &log_probs_const) - { - UNCONST(DerivedB, log_probs_const, log_probs); - assert (ngram.rows() == m->ngram_size); - //assert (ngram.cols() <= prop.get_minibatch_size()); - - prop.fProp(ngram); - - if (normalization) - { - Eigen::Matrix scores(m->output_vocab_size, ngram.cols()); - if (prop.skip_hidden) - prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores); - else - prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); - - // And softmax and loss - Matrix output_probs(m->output_vocab_size, ngram.cols()); - double minibatch_log_likelihood; - SoftmaxLogLoss().fProp(scores.leftCols(ngram.cols()), ngram.row(m->ngram_size-1), output_probs, minibatch_log_likelihood); - for (int j=0; jngram_size-1, j); - log_probs(0, j) = weight * output_probs(output, j); - } - } - else - { - for (int j=0; jngram_size-1, j); - if (prop.skip_hidden) - log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, j); - else - log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, j); - } - } - } + // Make sure that we're single threaded. Multithreading doesn't help, + // and in some cases can hurt quite a lot + int save_threads = omp_get_max_threads(); + omp_set_num_threads(1); + int save_eigen_threads = Eigen::nbThreads(); + Eigen::setNbThreads(1); +#ifdef __INTEL_MKL__ + int save_mkl_threads = mkl_get_max_threads(); + mkl_set_num_threads(1); +#endif + + prop.fProp(ngram.col(0)); - int get_order() const { return m->ngram_size; } + int output = ngram(m->ngram_size-1, 0); + double log_prob; - void read(const std::string &filename) + start_timer(3); + if (normalization) { - m->read(filename); - resize(); - // this is faster but takes more memory - //m->premultiply(); + Eigen::Matrix scores(m->output_vocab_size); + if (prop.skip_hidden) + prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores); + else + prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); + double logz = logsum(scores.col(0)); + log_prob = weight * (scores(output, 0) - logz); } - - void set_cache(std::size_t cache_size) + else { - this->cache_size = cache_size; - cache_keys.resize(m->ngram_size, cache_size); - cache_keys.fill(-1); // clears cache - cache_values.resize(cache_size); - cache_lookups = cache_hits = 0; + if (prop.skip_hidden) + log_prob = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, 0); + else + log_prob = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, 0); } + stop_timer(3); - double cache_hit_rate() + if (cache_size) { - return static_cast(cache_hits)/cache_lookups; + // Update cache + cache_keys.col(hash) = ngram; + cache_values[hash] = log_prob; } - void premultiply() +#ifdef __INTEL_MKL__ + mkl_set_num_threads(save_mkl_threads); +#endif + Eigen::setNbThreads(save_eigen_threads); + omp_set_num_threads(save_threads); + + return log_prob; + } + + // Look up many n-grams in parallel. 
+ template + void lookup_ngram(const Eigen::MatrixBase &ngram, const Eigen::MatrixBase &log_probs_const) + { + UNCONST(DerivedB, log_probs_const, log_probs); + assert (ngram.rows() == m->ngram_size); + //assert (ngram.cols() <= prop.get_minibatch_size()); + + prop.fProp(ngram); + + if (normalization) + { + Eigen::Matrix scores(m->output_vocab_size, ngram.cols()); + if (prop.skip_hidden) + prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores); + else + prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); + + // And softmax and loss + Matrix output_probs(m->output_vocab_size, ngram.cols()); + double minibatch_log_likelihood; + SoftmaxLogLoss().fProp(scores.leftCols(ngram.cols()), ngram.row(m->ngram_size-1), output_probs, minibatch_log_likelihood); + for (int j=0; jngram_size-1, j); + log_probs(0, j) = weight * output_probs(output, j); + } + } + else + { + for (int j=0; jngram_size-1, j); + if (prop.skip_hidden) + log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, j); + else + log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, j); + } + } + } + + int get_order() const { return m->ngram_size; } + + void read(const std::string &filename) + { + m->read(filename); + resize(); + // this is faster but takes more memory + //m->premultiply(); + } + + void set_cache(std::size_t cache_size) + { + this->cache_size = cache_size; + cache_keys.resize(m->ngram_size, cache_size); + cache_keys.fill(-1); // clears cache + cache_values.resize(cache_size); + cache_lookups = cache_hits = 0; + } + + double cache_hit_rate() + { + return static_cast(cache_hits)/cache_lookups; + } + + void premultiply() + { + if (!m->premultiplied) { - if (!m->premultiplied) - { - m->premultiply(); - } + m->premultiply(); } + } }; diff --git a/src/prepareNeuralLM.cpp b/src/prepareNeuralLM.cpp index a2cac7a..d5fc16b 100644 --- a/src/prepareNeuralLM.cpp +++ b/src/prepareNeuralLM.cpp @@ -2,19 +2,19 @@ #include #include #include -# include -# include - -# include -# include -# include -# include -# include +#include +#include + +#include +#include +#include +#include +#include #include #include #include -# include +#include #include "neuralLM.h" #include "util.h" @@ -36,314 +36,313 @@ typedef std::vector vecvec; typedef long long int data_size_t; // training data can easily exceed 2G instances template -void writeNgrams(const T &data, - int ngram_size, - const vocabulary &vocab, - bool numberize, - bool add_start_stop, - bool ngramize, - const string &filename) - { - ofstream file(filename.c_str()); - if (!file) +void writeNgrams(const T &data, + int ngram_size, + const vocabulary &vocab, + bool numberize, + bool add_start_stop, + bool ngramize, + const string &filename) +{ + ofstream file(filename.c_str()); + if (!file) + { + cerr << "error: could not open " << filename << endl; + exit(1); + } + + vector > ngrams; + + for (int i=0; i > ngrams; - - for (int i=0; i &sent_weights, - const string &sent_weights_filename) +void writeNgrams(const string &input_filename, + int ngram_size, + const vocabulary &vocab, + bool numberize, + bool add_start_stop, + bool ngramize, + const string &filename, + int train_data_size, + vector &sent_weights, + const string &sent_weights_filename) { - ofstream file(filename.c_str()); - ofstream output_sent_weights_file(sent_weights_filename.c_str()); - if (!file) - { - cerr << "error: could not 
open " << filename << endl; - exit(1); + ofstream file(filename.c_str()); + ofstream output_sent_weights_file(sent_weights_filename.c_str()); + if (!file) + { + cerr << "error: could not open " << filename << endl; + exit(1); + } + + ifstream input_file(input_filename.c_str()); + vector > ngrams; + //for (int i=0; i 0) { + counter++; + if ((counter % 100000) == 0) { + cerr< > ngrams; - //for (int i=0; i 0) { - counter++; - if ((counter % 100000) == 0) { - cerr< lstr_items; - splitBySpace(line,lstr_items); + //stringstream lstr(line); + vector lstr_items; + splitBySpace(line,lstr_items); //for (int i=0; i("data")(num_tokens,vec(ialloc),valloc); - - vec *mMapVec= mfile.construct("vector")(num_tokens*ngram_size,0,ialloc); - - cerr<<"The size of mmaped vec is "<size()< > ngrams; - while (getline(input_file,line) && train_data_size-- > 0) { - counter++; - if ((counter % 100000) ==0) { - //cerr<<"counter is "< lstr_items; - splitBySpace(line,lstr_items); + cerr<<"Num tokens is "<("data")(num_tokens,vec(ialloc),valloc); + + vec *mMapVec= mfile.construct("vector")(num_tokens*ngram_size,0,ialloc); + + cerr<<"The size of mmaped vec is "<size()< > ngrams; + while (getline(input_file,line) && train_data_size-- > 0) { + counter++; + if ((counter % 100000) ==0) { + //cerr<<"counter is "< lstr_items; + splitBySpace(line,lstr_items); //for (int i=0; iat(train_ngram_counter*ngram_size+k) = ngrams[j][k]; - } - train_ngram_counter++; - //cerr<<"Train ngram counter is "<at(train_ngram_counter*ngram_size+k) = ngrams[j][k]; + } + train_ngram_counter++; + //cerr<<"Train ngram counter is "<= num_tokens) { - upper_limit = num_tokens; - vector_size = num_tokens - counter; - } - vector temp(vector_size*ngram_size,0); - for (int i=0;iat((i+counter)*ngram_size+k); - } - } - for (data_size_t i=vector_size-1; i>0; i--) - { - if (i %500000 == 0) { - cerr<<"Shuffled "<(0, i-1)(rng); - for (int k=0;kat((i+counter)*ngram_size+k) = temp[i*ngram_size+k]; - } - } - counter = upper_limit; + } + cerr<= num_tokens) { + upper_limit = num_tokens; + vector_size = num_tokens - counter; + } + vector temp(vector_size*ngram_size,0); + for (int i=0;iat((i+counter)*ngram_size+k); } - - /* - for (data_size_t i=num_tokens-1; i>0; i--) + } + for (data_size_t i=vector_size-1; i>0; i--) { if (i %500000 == 0) { cerr<<"Shuffled "<(0, i-1)(rng); for (int k=0;kat(i*ngram_size+k); - mMapVec->at(i*ngram_size+k) = - mMapVec->at(j*ngram_size+k); - mMapVec->at(j*ngram_size+k) = temp_val; + int temp_val = temp.at(i*ngram_size+k); + temp.at(i*ngram_size+k) = + temp.at(j*ngram_size+k); + temp.at(j*ngram_size+k) = temp_val; } } - */ - cerr<at((i+counter)*ngram_size+k) = temp[i*ngram_size+k]; + } + } + counter = upper_limit; } + + /* + for (data_size_t i=num_tokens-1; i>0; i--) + { + if (i %500000 == 0) { + cerr<<"Shuffled "<(0, i-1)(rng); + for (int k=0;kat(i*ngram_size+k); + mMapVec->at(i*ngram_size+k) = + mMapVec->at(j*ngram_size+k); + mMapVec->at(j*ngram_size+k) = temp_val; + } + } + */ + cerr< arg_ngramize("", "ngramize", "If true, convert lines to ngrams. Default: true.", false, true, "bool", cmd); ValueArg arg_numberize("", "numberize", "If true, convert words to numbers. Default: true.", false, true, "bool", cmd); ValueArg arg_add_start_stop("", "add_start_stop", "If true, prepend and append . Default: true.", false, true, "bool", cmd); ValueArg arg_mmap_file("", "mmap_file", "If true, the training file will be a memory mapped file. \n This is " - "needed if the entire training data cannot fit in memory. 
Default: false.", false, false, "bool", cmd); + "needed if the entire training data cannot fit in memory. Default: false.", false, false, "bool", cmd); ValueArg arg_randomize("", "randomize", "If true, Randomly shuffle the training ngrams. It can only be used with mmap_file =1 . Default: false.", false, false, "bool", cmd); ValueArg arg_vocab_size("", "vocab_size", "Vocabulary size.", false, -1, "int", cmd); ValueArg arg_words_file("", "words_file", "File specifying words that should be included in vocabulary; all other words will be replaced by .", false, "", "string", cmd); ValueArg arg_ngram_size("", "ngram_size", "Size of n-grams.", true, -1, "int", cmd); - ValueArg arg_write_words_file("", "write_words_file", "Output vocabulary.", false, "", "string", cmd); + ValueArg arg_write_words_file("", "write_words_file", "Output vocabulary.", false, "", "string", cmd); ValueArg arg_validation_size("", "validation_size", "How many lines from training data to hold out for validation. Default: 0.", false, 0, "int", cmd); - ValueArg arg_validation_file("", "validation_file", "Output validation data (numberized n-grams).", false, "", "string", cmd); - ValueArg arg_validation_text("", "validation_text", "Input validation data (tokenized). Overrides --validation_size. Default: none.", false, "", "string", cmd); - ValueArg arg_train_file("", "train_file", "Output training data (numberized n-grams).", false, "", "string", cmd); - ValueArg arg_train_text("", "train_text", "Input training data (tokenized).", true, "", "string", cmd); - //ValueArg arg_sent_weights_text("", "sent_weights_text", "The sentence weights text", false, "", "string", cmd); - //ValueArg arg_sent_weights_file("", "sent_weights_file", "The file to write the per ngram weights", false, "", "string", cmd); - - - - cmd.parse(argc, argv); - - train_text = arg_train_text.getValue(); - train_file = arg_train_file.getValue(); - validation_text = arg_validation_text.getValue(); - validation_file = arg_validation_file.getValue(); - validation_size = arg_validation_size.getValue(); - write_words_file = arg_write_words_file.getValue(); - ngram_size = arg_ngram_size.getValue(); - vocab_size = arg_vocab_size.getValue(); - words_file = arg_words_file.getValue(); - numberize = arg_numberize.getValue(); - ngramize = arg_ngramize.getValue(); - add_start_stop = arg_add_start_stop.getValue(); - mmap_file = arg_mmap_file.getValue(); - randomize = arg_randomize.getValue(); - //sent_weights_text = arg_sent_weights_text.getValue(); - //output_sent_weights_text = arg_sent_weights_file.getValue(); - sent_weights_text = ""; - output_sent_weights_text = ""; + ValueArg arg_validation_file("", "validation_file", "Output validation data (numberized n-grams).", false, "", "string", cmd); + ValueArg arg_validation_text("", "validation_text", "Input validation data (tokenized). Overrides --validation_size. 
Default: none.", false, "", "string", cmd); + ValueArg arg_train_file("", "train_file", "Output training data (numberized n-grams).", false, "", "string", cmd); + ValueArg arg_train_text("", "train_text", "Input training data (tokenized).", true, "", "string", cmd); + //ValueArg arg_sent_weights_text("", "sent_weights_text", "The sentence weights text", false, "", "string", cmd); + //ValueArg arg_sent_weights_file("", "sent_weights_file", "The file to write the per ngram weights", false, "", "string", cmd); + + + cmd.parse(argc, argv); + + train_text = arg_train_text.getValue(); + train_file = arg_train_file.getValue(); + validation_text = arg_validation_text.getValue(); + validation_file = arg_validation_file.getValue(); + validation_size = arg_validation_size.getValue(); + write_words_file = arg_write_words_file.getValue(); + ngram_size = arg_ngram_size.getValue(); + vocab_size = arg_vocab_size.getValue(); + words_file = arg_words_file.getValue(); + numberize = arg_numberize.getValue(); + ngramize = arg_ngramize.getValue(); + add_start_stop = arg_add_start_stop.getValue(); + mmap_file = arg_mmap_file.getValue(); + randomize = arg_randomize.getValue(); + //sent_weights_text = arg_sent_weights_text.getValue(); + //output_sent_weights_text = arg_sent_weights_file.getValue(); + sent_weights_text = ""; + output_sent_weights_text = ""; // check command line arguments @@ -364,292 +363,292 @@ int main(int argc, char *argv[]) cerr << "Command line: " << endl; cerr << boost::algorithm::join(vector(argv, argv+argc), " ") << endl; - - const string sep(" Value: "); - cerr << arg_train_text.getDescription() << sep << arg_train_text.getValue() << endl; - cerr << arg_train_file.getDescription() << sep << arg_train_file.getValue() << endl; - cerr << arg_validation_text.getDescription() << sep << arg_validation_text.getValue() << endl; - cerr << arg_validation_file.getDescription() << sep << arg_validation_file.getValue() << endl; - cerr << arg_validation_size.getDescription() << sep << arg_validation_size.getValue() << endl; - cerr << arg_write_words_file.getDescription() << sep << arg_write_words_file.getValue() << endl; - cerr << arg_ngram_size.getDescription() << sep << arg_ngram_size.getValue() << endl; - cerr << arg_vocab_size.getDescription() << sep << arg_vocab_size.getValue() << endl; - cerr << arg_words_file.getDescription() << sep << arg_words_file.getValue() << endl; - cerr << arg_numberize.getDescription() << sep << arg_numberize.getValue() << endl; - cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl; - cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl; - cerr << arg_mmap_file.getDescription() << sep << arg_mmap_file.getValue() << endl; - //cerr << arg_sent_weights_text.getDescription() << sep << arg_sent_weights_text.getValue() << endl; - } - catch (TCLAP::ArgException &e) - { - cerr << "error: " << e.error() << " for arg " << e.argId() << endl; - exit(1); - } - // VLF: why is this true? - // DC: it's because the vocabulary has to be constructed from the training data only. - // If the vocabulary is preset, we can't create the validation data. - // - if --numberize 0 is set, then --validation_size cannot be used. - // if (!numberize && (validation_size > 0)) { - // cerr << "Warning: without setting --numberize to 1, --validation_size cannot be used." 
<< endl; - // } - - // Read in training data and validation data - // vector > train_data; - // readSentFile(train_text, train_data); - // @vaswani: No more reading the entire training file into memory - // Reading it per line with file io - - //for (int i=0; i"); - vocab.insert_word(""); - vocab.insert_word(""); - // warn user that if --numberize is not set, there will be no vocabulary! - if (!numberize) { - cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" << endl; - } - } - if (mmap_file == false && randomize == true) { - cerr<<"Randomize option can only be used with mmap_file = 1"< 0)) { + // cerr << "Warning: without setting --numberize to 1, --validation_size cannot be used." << endl; + // } + + // Read in training data and validation data + // vector > train_data; + // readSentFile(train_text, train_data); + // @vaswani: No more reading the entire training file into memory + // Reading it per line with file io + + //for (int i=0; i"); + vocab.insert_word(""); + vocab.insert_word(""); + // warn user that if --numberize is not set, there will be no vocabulary! + if (!numberize) { + cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" << endl; } - unordered_map count; // For keeping word counts if no supplied vocab - - deque > validation_data; - int train_data_size=0; - cerr<<"Processed ... "; - data_size_t num_tokens=0; - - ifstream training(train_text.c_str()); - - string line; - while (getline(training,line)) { - train_data_size++; - //stringstream lstr(line); - vector lstr_items; - splitBySpace(line,lstr_items); - // if data is already ngramized, set/check ngram_size - if (!ngramize) { - if (ngram_size > 0) { - if (ngram_size != lstr_items.size()) { - cerr << "Error: size of training ngrams does not match specified value of --ngram_size!" << endl; - } - } - // else if --ngram_size has not been specified, set it now - else { - ngram_size=lstr_items.size(); - } + } + if (mmap_file == false && randomize == true) { + cerr<<"Randomize option can only be used with mmap_file = 1"< count; // For keeping word counts if no supplied vocab + + deque > validation_data; + int train_data_size=0; + cerr<<"Processed ... "; + data_size_t num_tokens=0; + + ifstream training(train_text.c_str()); + + string line; + while (getline(training,line)) { + train_data_size++; + //stringstream lstr(line); + vector lstr_items; + splitBySpace(line,lstr_items); + // if data is already ngramized, set/check ngram_size + if (!ngramize) { + if (ngram_size > 0) { + if (ngram_size != lstr_items.size()) { + cerr << "Error: size of training ngrams does not match specified value of --ngram_size!" << endl; + } } - if ((train_data_size%100000)==0){ - cerr< 0) { - //cerr<<"validation size is "< 0) { + //cerr<<"validation size is "< > validation_data; - if (validation_text != "") { - readSentFile(validation_text, validation_data); - for (int i=0; i 0) { - if (ngram_size != validation_data[i].size()) { - cerr << "Error: size of validation ngrams does not match specified value of --ngram_size!" << endl; - } - } - // else if --ngram_size has not been specified, set it now - else { - ngram_size=validation_data[i].size(); - } - } + } + //vector > validation_data; + if (validation_text != "") { + readSentFile(validation_text, validation_data); + for (int i=0; i 0) { + if (ngram_size != validation_data[i].size()) { + cerr << "Error: size of validation ngrams does not match specified value of --ngram_size!" 
<< endl; + } } + // else if --ngram_size has not been specified, set it now + else { + ngram_size=validation_data[i].size(); + } + } } - //READING SENTENCE WEIGHTS IF THERE ARE ANY - vector sent_weights; - if (sent_weights_text != "") { - cerr<<"Reading sentence weights from "< sent_weights; + if (sent_weights_text != "") { + cerr<<"Reading sentence weights from "< 0) { - // Create validation data - if (validation_size > train_data.size()) - { - cerr << "error: requested validation size is greater than training data size" << endl; - exit(1); - } - validation_data.insert(validation_data.end(), train_data.end()-validation_size, train_data.end()); - train_data.resize(train_data.size() - validation_size); + // Create validation data + if (validation_size > train_data.size()) + { + cerr << "error: requested validation size is greater than training data size" << endl; + exit(1); } - */ - - // Construct vocabulary - //vocabulary vocab; - //int start, stop; - - // read vocabulary from file - if (words_file != "") { - vector words; - readWordsFile(words_file,words); - for(vector::iterator it = words.begin(); it != words.end(); ++it) { - vocab.insert_word(*it); - } - - // was vocab_size set? if so, verify that it does not conflict with size of vocabulary read from file - if (vocab_size > 0) { - if (vocab.size() != vocab_size) { - cerr << "Error: size of vocabulary file " << vocab.size() << " != --vocab_size " << vocab_size << endl; - } - } - // else, set it to the size of vocabulary read from file - else { - vocab_size = vocab.size(); - } - + validation_data.insert(validation_data.end(), train_data.end()-validation_size, train_data.end()); + train_data.resize(train_data.size() - validation_size); } - /* - // construct vocabulary to contain top most frequent words; all other words replaced by - else { - vocab.insert_word(""); - vocab.insert_word(""); - vocab.insert_word(""); - - // warn user that if --numberize is not set, there will be no vocabulary! - if (!numberize) { - cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" << endl; - } - unordered_map count; - for (int i=0; i words; + readWordsFile(words_file,words); + for(vector::iterator it = words.begin(); it != words.end(); ++it) { + vocab.insert_word(*it); } - */ - // write vocabulary to file - if (write_words_file != "") { - cerr << "Writing vocabulary to " << write_words_file << endl; - writeWordsFile(vocab.words(), write_words_file); + // was vocab_size set? 
if so, verify that it does not conflict with size of vocabulary read from file + if (vocab_size > 0) { + if (vocab.size() != vocab_size) { + cerr << "Error: size of vocabulary file " << vocab.size() << " != --vocab_size " << vocab_size << endl; + } } - - // Write out numberized n-grams - if (train_file != "") - { - cerr << "Writing training data to " << train_file << endl; - if (mmap_file == true) { - writeMmapNgrams(train_text, - ngram_size, - vocab, - numberize, - add_start_stop, - ngramize, - train_file, - train_data_size, - num_tokens, - randomize); - } else { - writeNgrams(train_text, - ngram_size, - vocab, - numberize, - add_start_stop, - ngramize, - train_file, - train_data_size, - sent_weights, - output_sent_weights_text); - } + // else, set it to the size of vocabulary read from file + else { + vocab_size = vocab.size(); } - if (validation_file != "") - { - cerr << "Writing validation data to " << validation_file << endl; - writeNgrams(validation_data, - ngram_size, - vocab, - numberize, - add_start_stop, - ngramize, - validation_file); + + } + /* + // construct vocabulary to contain top most frequent words; all other words replaced by + else { + vocab.insert_word(""); + vocab.insert_word(""); + vocab.insert_word(""); + + // warn user that if --numberize is not set, there will be no vocabulary! + if (!numberize) { + cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" << endl; + } + unordered_map count; + for (int i=0; i input_layer_node; - Node first_hidden_linear_node; - Node first_hidden_activation_node; - Node second_hidden_linear_node; - Node second_hidden_activation_node; - Node output_layer_node; - bool skip_hidden; - -public: - propagator () : minibatch_size(0), pnn(0) { } - - propagator (model &nn, int minibatch_size) + int minibatch_size; + model *pnn; + + public: + Node input_layer_node; + Node first_hidden_linear_node; + Node first_hidden_activation_node; + Node second_hidden_linear_node; + Node second_hidden_activation_node; + Node output_layer_node; + bool skip_hidden; + + public: + propagator () : minibatch_size(0), pnn(0) { } + + propagator (model &nn, int minibatch_size) : - pnn(&nn), - input_layer_node(&nn.input_layer, minibatch_size), - first_hidden_linear_node(&nn.first_hidden_linear, minibatch_size), - first_hidden_activation_node(&nn.first_hidden_activation, minibatch_size), - second_hidden_linear_node(&nn.second_hidden_linear, minibatch_size), - second_hidden_activation_node(&nn.second_hidden_activation, minibatch_size), - output_layer_node(&nn.output_layer, minibatch_size), - minibatch_size(minibatch_size) - { - skip_hidden = (nn.num_hidden == 0); - } + pnn(&nn), + input_layer_node(&nn.input_layer, minibatch_size), + first_hidden_linear_node(&nn.first_hidden_linear, minibatch_size), + first_hidden_activation_node(&nn.first_hidden_activation, minibatch_size), + second_hidden_linear_node(&nn.second_hidden_linear, minibatch_size), + second_hidden_activation_node(&nn.second_hidden_activation, minibatch_size), + output_layer_node(&nn.output_layer, minibatch_size), + minibatch_size(minibatch_size) + { + skip_hidden = (nn.num_hidden == 0); + } - // This must be called if the underlying model is resized. 
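The propagator above wires one Node per layer and walks them in order: input embeddings, then first_hidden_linear plus its activation, then (unless skip_hidden) second_hidden_linear plus its activation; the expensive output layer is applied separately by the callers. A rough sketch of that forward chain using plain Eigen matrices in place of the Node/param objects (all sizes and the Rectifier choice here are illustrative, not taken from the model files):

// Sketch of the layer chain that propagator::fProp walks below.
#include <Eigen/Dense>
#include <iostream>

int main() {
  const int embed = 4, hidden = 3, vocab = 5, minibatch = 2;
  Eigen::MatrixXd input(embed, minibatch);  // stands in for input_layer_node.fProp_matrix
  input.setRandom();

  Eigen::MatrixXd W1   = Eigen::MatrixXd::Random(hidden, embed);   // first_hidden_linear
  Eigen::MatrixXd W2   = Eigen::MatrixXd::Random(hidden, hidden);  // second_hidden_linear
  Eigen::MatrixXd Wout = Eigen::MatrixXd::Random(vocab, hidden);   // output_layer

  bool skip_hidden = false;  // set when num_hidden == 0

  Eigen::MatrixXd h1 = (W1 * input).cwiseMax(0.0);  // linear layer + rectifier activation
  Eigen::MatrixXd last = h1;
  if (!skip_hidden)
    last = (W2 * h1).cwiseMax(0.0);                 // second linear layer + activation

  // The output layer is only applied on demand, as in the real code.
  Eigen::MatrixXd scores = Wout * last;
  std::cout << "scores:\n" << scores << "\n";
}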
- void resize(int minibatch_size) { - this->minibatch_size = minibatch_size; - input_layer_node.resize(minibatch_size); - first_hidden_linear_node.resize(minibatch_size); - first_hidden_activation_node.resize(minibatch_size); - second_hidden_linear_node.resize(minibatch_size); - second_hidden_activation_node.resize(minibatch_size); - output_layer_node.resize(minibatch_size); - } + // This must be called if the underlying model is resized. + void resize(int minibatch_size) { + this->minibatch_size = minibatch_size; + input_layer_node.resize(minibatch_size); + first_hidden_linear_node.resize(minibatch_size); + first_hidden_activation_node.resize(minibatch_size); + second_hidden_linear_node.resize(minibatch_size); + second_hidden_activation_node.resize(minibatch_size); + output_layer_node.resize(minibatch_size); + } - void resize() { resize(minibatch_size); } + void resize() { resize(minibatch_size); } - template - void fProp(const MatrixBase &data) + template + void fProp(const MatrixBase &data) + { + if (!pnn->premultiplied) { - if (!pnn->premultiplied) - { - start_timer(0); - input_layer_node.param->fProp(data, input_layer_node.fProp_matrix); - stop_timer(0); - - start_timer(1); - first_hidden_linear_node.param->fProp(input_layer_node.fProp_matrix, - first_hidden_linear_node.fProp_matrix); - } - else - { - int n_inputs = first_hidden_linear_node.param->n_inputs(); - USCMatrix sparse_data; - input_layer_node.param->munge(data, sparse_data); - - start_timer(1); - first_hidden_linear_node.param->fProp(sparse_data, - first_hidden_linear_node.fProp_matrix); - } - first_hidden_activation_node.param->fProp(first_hidden_linear_node.fProp_matrix, - first_hidden_activation_node.fProp_matrix); - //std::cerr<<"in fprop first hidden activation node fprop is "<fProp(first_hidden_activation_node.fProp_matrix, - second_hidden_linear_node.fProp_matrix); - second_hidden_activation_node.param->fProp(second_hidden_linear_node.fProp_matrix, - second_hidden_activation_node.fProp_matrix); - stop_timer(2); - } - - // The propagation stops here because the last layer is very expensive. - } + start_timer(0); + input_layer_node.param->fProp(data, input_layer_node.fProp_matrix); + stop_timer(0); - // Dense version (for standard log-likelihood) - template - void bProp(const MatrixBase &data, - const MatrixBase &output, - double learning_rate, - double momentum, - double L2_reg, - std::string ¶meter_update, - double conditioning_constant, - double decay) + start_timer(1); + first_hidden_linear_node.param->fProp(input_layer_node.fProp_matrix, + first_hidden_linear_node.fProp_matrix); + } + else { - // Output embedding layer - - start_timer(7); - output_layer_node.param->bProp(output, - output_layer_node.bProp_matrix); - stop_timer(7); - - start_timer(8); - Node & final_hidden_activation_node = skip_hidden ? 
first_hidden_activation_node : second_hidden_activation_node; - if (parameter_update == "SGD") { - output_layer_node.param->computeGradient(final_hidden_activation_node.fProp_matrix, - output, - learning_rate, - momentum); - } else if (parameter_update == "ADA") { - output_layer_node.param->computeGradientAdagrad(final_hidden_activation_node.fProp_matrix, - output, - learning_rate); - } else if (parameter_update == "ADAD") { - //std::cerr<<"Adadelta gradient"<computeGradientAdadelta(final_hidden_activation_node.fProp_matrix, - output, - 1.0/current_minibatch_size, - conditioning_constant, - decay); - } else { - std::cerr<<"Parameter update :"<n_inputs(); + USCMatrix sparse_data; + input_layer_node.param->munge(data, sparse_data); + + start_timer(1); + first_hidden_linear_node.param->fProp(sparse_data, + first_hidden_linear_node.fProp_matrix); + } + first_hidden_activation_node.param->fProp(first_hidden_linear_node.fProp_matrix, + first_hidden_activation_node.fProp_matrix); + //std::cerr<<"in fprop first hidden activation node fprop is "<fProp(first_hidden_activation_node.fProp_matrix, + second_hidden_linear_node.fProp_matrix); + second_hidden_activation_node.param->fProp(second_hidden_linear_node.fProp_matrix, + second_hidden_activation_node.fProp_matrix); + stop_timer(2); } - // Sparse version (for NCE log-likelihood) - template - void bProp(const MatrixBase &data, - const MatrixBase &samples, - const MatrixBase &weights, - double learning_rate, - double momentum, - double L2_reg, - std::string ¶meter_update, - double conditioning_constant, - double decay) - { + // The propagation stops here because the last layer is very expensive. + } + + // Dense version (for standard log-likelihood) + template + void bProp(const MatrixBase &data, + const MatrixBase &output, + double learning_rate, + double momentum, + double L2_reg, + std::string ¶meter_update, + double conditioning_constant, + double decay) + { + // Output embedding layer + + start_timer(7); + output_layer_node.param->bProp(output, + output_layer_node.bProp_matrix); + stop_timer(7); + + start_timer(8); + Node & final_hidden_activation_node = skip_hidden ? first_hidden_activation_node : second_hidden_activation_node; + if (parameter_update == "SGD") { + output_layer_node.param->computeGradient(final_hidden_activation_node.fProp_matrix, + output, + learning_rate, + momentum); + } else if (parameter_update == "ADA") { + output_layer_node.param->computeGradientAdagrad(final_hidden_activation_node.fProp_matrix, + output, + learning_rate); + } else if (parameter_update == "ADAD") { + //std::cerr<<"Adadelta gradient"<computeGradientAdadelta(final_hidden_activation_node.fProp_matrix, + output, + 1.0/current_minibatch_size, + conditioning_constant, + decay); + } else { + std::cerr<<"Parameter update :"<bProp(samples, - weights, - output_layer_node.bProp_matrix); - stop_timer(7); - - - start_timer(8); - Node & final_hidden_activation_node = skip_hidden ? 
first_hidden_activation_node : second_hidden_activation_node; - if (parameter_update == "SGD") { - output_layer_node.param->computeGradient(final_hidden_activation_node.fProp_matrix, - samples, - weights, - learning_rate, - momentum); - } else if (parameter_update == "ADA") { - output_layer_node.param->computeGradientAdagrad(final_hidden_activation_node.fProp_matrix, - samples, - weights, - learning_rate); - } else if (parameter_update == "ADAD") { - int current_minibatch_size = final_hidden_activation_node.fProp_matrix.cols(); - //std::cerr<<"Adadelta gradient"<computeGradientAdadelta(final_hidden_activation_node.fProp_matrix, - samples, - weights, - 1.0/current_minibatch_size, - conditioning_constant, - decay); - } else { - std::cerr<<"Parameter update :"< + void bProp(const MatrixBase &data, + const MatrixBase &samples, + const MatrixBase &weights, + double learning_rate, + double momentum, + double L2_reg, + std::string ¶meter_update, + double conditioning_constant, + double decay) + { - bPropRest(data, - learning_rate, - momentum, - L2_reg, - parameter_update, - conditioning_constant, - decay); + // Output embedding layer + + start_timer(7); + output_layer_node.param->bProp(samples, + weights, + output_layer_node.bProp_matrix); + stop_timer(7); + + + start_timer(8); + Node & final_hidden_activation_node = skip_hidden ? first_hidden_activation_node : second_hidden_activation_node; + if (parameter_update == "SGD") { + output_layer_node.param->computeGradient(final_hidden_activation_node.fProp_matrix, + samples, + weights, + learning_rate, + momentum); + } else if (parameter_update == "ADA") { + output_layer_node.param->computeGradientAdagrad(final_hidden_activation_node.fProp_matrix, + samples, + weights, + learning_rate); + } else if (parameter_update == "ADAD") { + int current_minibatch_size = final_hidden_activation_node.fProp_matrix.cols(); + //std::cerr<<"Adadelta gradient"<computeGradientAdadelta(final_hidden_activation_node.fProp_matrix, + samples, + weights, + 1.0/current_minibatch_size, + conditioning_constant, + decay); + } else { + std::cerr<<"Parameter update :"< - void bPropRest(const MatrixBase &data, - double learning_rate, double momentum, double L2_reg, - std::string ¶meter_update, - double conditioning_constant, - double decay) - { - // Second hidden layer + stop_timer(8); + bPropRest(data, + learning_rate, + momentum, + L2_reg, + parameter_update, + conditioning_constant, + decay); + } - - // All the compute gradient functions are together and the backprop - // functions are together - ////////BACKPROP//////////// - start_timer(9); - if (skip_hidden) + private: + template + void bPropRest(const MatrixBase &data, + double learning_rate, double momentum, double L2_reg, + std::string ¶meter_update, + double conditioning_constant, + double decay) { - start_timer(9); - first_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix, + // Second hidden layer + + + + // All the compute gradient functions are together and the backprop + // functions are together + ////////BACKPROP//////////// + start_timer(9); + if (skip_hidden) + { + start_timer(9); + first_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix, first_hidden_activation_node.bProp_matrix, first_hidden_linear_node.fProp_matrix, first_hidden_activation_node.fProp_matrix); - first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix, - first_hidden_linear_node.bProp_matrix); - stop_timer(9); + 
first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix, + first_hidden_linear_node.bProp_matrix); + stop_timer(9); - } - else - { - second_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix, - second_hidden_activation_node.bProp_matrix, - second_hidden_linear_node.fProp_matrix, - second_hidden_activation_node.fProp_matrix); + } + else + { + second_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix, + second_hidden_activation_node.bProp_matrix, + second_hidden_linear_node.fProp_matrix, + second_hidden_activation_node.fProp_matrix); - second_hidden_linear_node.param->bProp(second_hidden_activation_node.bProp_matrix, - second_hidden_linear_node.bProp_matrix); - stop_timer(9); + second_hidden_linear_node.param->bProp(second_hidden_activation_node.bProp_matrix, + second_hidden_linear_node.bProp_matrix); + stop_timer(9); - start_timer(11); - first_hidden_activation_node.param->bProp(second_hidden_linear_node.bProp_matrix, - first_hidden_activation_node.bProp_matrix, - first_hidden_linear_node.fProp_matrix, - first_hidden_activation_node.fProp_matrix); + start_timer(11); + first_hidden_activation_node.param->bProp(second_hidden_linear_node.bProp_matrix, + first_hidden_activation_node.bProp_matrix, + first_hidden_linear_node.fProp_matrix, + first_hidden_activation_node.fProp_matrix); - first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix, - first_hidden_linear_node.bProp_matrix); - stop_timer(11); - } - //std::cerr<<"First hidden layer node backprop matrix is"<computeGradient(second_hidden_activation_node.bProp_matrix, - first_hidden_activation_node.fProp_matrix, - learning_rate, - momentum, - L2_reg); - stop_timer(10); + first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix, + first_hidden_linear_node.bProp_matrix); + stop_timer(11); } - - // First hidden layer - - - start_timer(12); - first_hidden_linear_node.param->computeGradient(first_hidden_activation_node.bProp_matrix, - input_layer_node.fProp_matrix, - learning_rate, momentum, L2_reg); - stop_timer(12); - - // Input word embeddings - - start_timer(13); - input_layer_node.param->computeGradient(first_hidden_linear_node.bProp_matrix, - data, - learning_rate, momentum, L2_reg); - stop_timer(13); - } else if (parameter_update == "ADA") { - if (!skip_hidden) - { - start_timer(10); - second_hidden_linear_node.param->computeGradientAdagrad(second_hidden_activation_node.bProp_matrix, - first_hidden_activation_node.fProp_matrix, - learning_rate, - L2_reg); - stop_timer(10); + //std::cerr<<"First hidden layer node backprop matrix is"<computeGradient(second_hidden_activation_node.bProp_matrix, + first_hidden_activation_node.fProp_matrix, + learning_rate, + momentum, + L2_reg); + stop_timer(10); + } + + // First hidden layer + + + start_timer(12); + first_hidden_linear_node.param->computeGradient(first_hidden_activation_node.bProp_matrix, + input_layer_node.fProp_matrix, + learning_rate, momentum, L2_reg); + stop_timer(12); + + // Input word embeddings + + start_timer(13); + input_layer_node.param->computeGradient(first_hidden_linear_node.bProp_matrix, + data, + learning_rate, momentum, L2_reg); + stop_timer(13); + } else if (parameter_update == "ADA") { + if (!skip_hidden) + { + start_timer(10); + second_hidden_linear_node.param->computeGradientAdagrad(second_hidden_activation_node.bProp_matrix, + first_hidden_activation_node.fProp_matrix, + learning_rate, + L2_reg); + stop_timer(10); + } + + // First hidden layer + + + start_timer(12); + 
first_hidden_linear_node.param->computeGradientAdagrad(first_hidden_activation_node.bProp_matrix, + input_layer_node.fProp_matrix, + learning_rate, + L2_reg); + stop_timer(12); + + // Input word embeddings + + start_timer(13); + input_layer_node.param->computeGradientAdagrad(first_hidden_linear_node.bProp_matrix, + data, + learning_rate, + L2_reg); + stop_timer(13); + } else if (parameter_update == "ADAD") { + int current_minibatch_size = first_hidden_activation_node.fProp_matrix.cols(); + //std::cerr<<"Adadelta gradient"<computeGradientAdadelta(second_hidden_activation_node.bProp_matrix, + first_hidden_activation_node.fProp_matrix, + 1.0/current_minibatch_size, + L2_reg, + conditioning_constant, + decay); + stop_timer(10); + } + //std::cerr<<"Finished gradient for second hidden linear layer"<
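The parameter_update string dispatched throughout bProp()/bPropRest() selects between plain SGD, Adagrad ("ADA") and Adadelta ("ADAD") updates for each layer. A small sketch contrasting the first two on a single weight matrix; the signs, learning rate and epsilon below are generic textbook choices, and the Adadelta variant with its conditioning constant and decay is omitted, so this is not the exact convention implemented by the computeGradient*() methods above:

// Sketch: one SGD step versus one Adagrad step on a toy weight matrix.
#include <Eigen/Dense>
#include <iostream>

int main() {
  const double lr = 0.1, eps = 1e-6;
  Eigen::MatrixXd W = Eigen::MatrixXd::Zero(2, 2);  // parameters
  Eigen::MatrixXd G = Eigen::MatrixXd::Zero(2, 2);  // Adagrad accumulator
  Eigen::MatrixXd grad(2, 2);
  grad << 0.5, -0.2,
          0.1,  0.4;

  // "SGD": a single step in the gradient direction.
  Eigen::MatrixXd W_sgd = W + lr * grad;

  // "ADA" (Adagrad): accumulate squared gradients and scale each coordinate
  // by 1/sqrt(accumulated + eps), so frequently-updated weights slow down.
  G.array() += grad.array().square();
  Eigen::MatrixXd W_ada =
      (W.array() + lr * grad.array() / (G.array() + eps).sqrt()).matrix();

  std::cout << "SGD step:\n" << W_sgd << "\nAdagrad step:\n" << W_ada << "\n";
}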