diff options
author | graehl <graehl@gmail.com> | 2015-06-25 09:22:21 +0300 |
---|---|---|
committer | graehl <graehl@gmail.com> | 2015-06-25 09:25:32 +0300 |
commit | 37e397f526fc207dea498356e890ad085a733ae8 (patch) | |
tree | cfea74b92cc4d38aaff06a26c76fdba7594abd69 | |
parent | 50308d573b90ff2814bd346210fc6929bd9b40af (diff) |
fix mixed tab2/tab4/spaces indents
-rw-r--r-- | src/Activation_function.h | 131 | ||||
-rw-r--r-- | src/SoftmaxLoss.h | 159 | ||||
-rw-r--r-- | src/USCMatrix.h | 227 | ||||
-rw-r--r-- | src/graphClasses.h | 89 | ||||
-rw-r--r-- | src/neuralClasses.h | 444 | ||||
-rw-r--r-- | src/neuralLM.h | 189 | ||||
-rw-r--r-- | src/neuralNetwork.h | 319 | ||||
-rw-r--r-- | src/prepareNeuralLM.cpp | 1057 | ||||
-rw-r--r-- | src/propagator.h | 641 | ||||
-rw-r--r-- | src/testNeuralLM.cpp | 279 | ||||
-rw-r--r-- | src/trainNeuralNetwork.cpp | 227 | ||||
-rw-r--r-- | src/util.h | 277 |
12 files changed, 2012 insertions, 2027 deletions
diff --git a/src/Activation_function.h b/src/Activation_function.h index 138f9da..742c2fc 100644 --- a/src/Activation_function.h +++ b/src/Activation_function.h @@ -3,7 +3,6 @@ #include <cmath> #include <string> -//#include <../3rdparty/Eigen/Dense> #include <Eigen/Dense> #include "util.h" @@ -19,28 +18,28 @@ enum activation_function_type { Tanh, HardTanh, Rectifier, Identity, InvalidFunc inline activation_function_type string_to_activation_function (const std::string &s) { - if (s == "identity") - return Identity; - else if (s == "rectifier") - return Rectifier; - else if (s == "tanh") - return Tanh; - else if (s == "hardtanh") - return HardTanh; - else - return InvalidFunction; + if (s == "identity") + return Identity; + else if (s == "rectifier") + return Rectifier; + else if (s == "tanh") + return Tanh; + else if (s == "hardtanh") + return HardTanh; + else + return InvalidFunction; } inline std::string activation_function_to_string (activation_function_type f) { - if (f == Identity) - return "identity"; - else if (f == Rectifier) - return "rectifier"; - else if (f == Tanh) - return "tanh"; - else if (f == HardTanh) - return "hardtanh"; + if (f == Identity) + return "identity"; + else if (f == Rectifier) + return "rectifier"; + else if (f == Tanh) + return "tanh"; + else if (f == HardTanh) + return "hardtanh"; } struct hardtanh_functor { @@ -69,53 +68,53 @@ struct drectifier_functor { class Activation_function { - int size; - activation_function_type f; - - public: - Activation_function() : size(0), f(Rectifier) { } - - void resize(int size) { this->size = size; } - void set_activation_function(activation_function_type f) { this->f = f; } - - template <typename Engine> - void initialize(Engine &engine, bool init_normal, double init_range) { } - - int n_inputs () const { return size; } - int n_outputs () const { return size; } - - template <typename DerivedIn, typename DerivedOut> - void fProp(const MatrixBase<DerivedIn> &input, const MatrixBase<DerivedOut> 
&output) const - { - UNCONST(DerivedOut, output, my_output); - - switch (f) - { - case Identity: my_output = input; break; - case Rectifier: my_output = input.unaryExpr(rectifier_functor()); break; - case Tanh: my_output = input.unaryExpr(tanh_functor()); break; - case HardTanh: my_output = input.unaryExpr(hardtanh_functor()); break; - case InvalidFunction: std::abort(); - } - } - - template <typename DerivedGOut, typename DerivedGIn, typename DerivedIn, typename DerivedOut> - void bProp(const MatrixBase<DerivedGOut> &input, - MatrixBase<DerivedGIn> &output, - const MatrixBase<DerivedIn> &finput, - const MatrixBase<DerivedOut> &foutput) const - { - UNCONST(DerivedGIn, output, my_output); - - switch (f) - { - case Identity: my_output = input; break; - case Rectifier: my_output = finput.array().unaryExpr(drectifier_functor()) * input.array(); break; - case Tanh: my_output = foutput.array().unaryExpr(tanh_functor()) * input.array(); break; - case HardTanh: my_output = finput.array().unaryExpr(hardtanh_functor()) * input.array(); break; - case InvalidFunction: std::abort(); - } - } + int size; + activation_function_type f; + + public: + Activation_function() : size(0), f(Rectifier) { } + + void resize(int size) { this->size = size; } + void set_activation_function(activation_function_type f) { this->f = f; } + + template <typename Engine> + void initialize(Engine &engine, bool init_normal, double init_range) { } + + int n_inputs () const { return size; } + int n_outputs () const { return size; } + + template <typename DerivedIn, typename DerivedOut> + void fProp(const MatrixBase<DerivedIn> &input, const MatrixBase<DerivedOut> &output) const + { + UNCONST(DerivedOut, output, my_output); + + switch (f) + { + case Identity: my_output = input; break; + case Rectifier: my_output = input.unaryExpr(rectifier_functor()); break; + case Tanh: my_output = input.unaryExpr(tanh_functor()); break; + case HardTanh: my_output = input.unaryExpr(hardtanh_functor()); break; + case 
InvalidFunction: std::abort(); + } + } + + template <typename DerivedGOut, typename DerivedGIn, typename DerivedIn, typename DerivedOut> + void bProp(const MatrixBase<DerivedGOut> &input, + MatrixBase<DerivedGIn> &output, + const MatrixBase<DerivedIn> &finput, + const MatrixBase<DerivedOut> &foutput) const + { + UNCONST(DerivedGIn, output, my_output); + + switch (f) + { + case Identity: my_output = input; break; + case Rectifier: my_output = finput.array().unaryExpr(drectifier_functor()) * input.array(); break; + case Tanh: my_output = foutput.array().unaryExpr(tanh_functor()) * input.array(); break; + case HardTanh: my_output = finput.array().unaryExpr(hardtanh_functor()) * input.array(); break; + case InvalidFunction: std::abort(); + } + } }; } // namespace nplm diff --git a/src/SoftmaxLoss.h b/src/SoftmaxLoss.h index bc55762..d89cde6 100644 --- a/src/SoftmaxLoss.h +++ b/src/SoftmaxLoss.h @@ -1,7 +1,6 @@ - #ifndef SOFTMAXLOSS_H +#ifndef SOFTMAXLOSS_H #define SOFTMAXLOSS_H -//#include <../3rdparty/Eigen/Dense> #include <Eigen/Dense> #include "multinomial.h" #include "util.h" @@ -20,55 +19,55 @@ enum loss_function_type { LogLoss, NCELoss, InvalidLoss }; inline loss_function_type string_to_loss_function (const std::string &s) { - if (s == "log") - return LogLoss; - else if (s == "nce") - return NCELoss; - else - return InvalidLoss; + if (s == "log") + return LogLoss; + else if (s == "nce") + return NCELoss; + else + return InvalidLoss; } inline std::string loss_function_to_string (loss_function_type f) { - if (f == LogLoss) - return "log"; - else if (f == NCELoss) - return "nce"; + if (f == LogLoss) + return "log"; + else if (f == NCELoss) + return "nce"; } /// Note: Outputs log-probabilities. 
struct SoftmaxLogLoss { - template <typename DerivedI, typename DerivedW, typename DerivedO> - void fProp(const MatrixBase<DerivedI> &input, const MatrixBase<DerivedW> &output_words, const MatrixBase<DerivedO> &output_const, double &loss) + template <typename DerivedI, typename DerivedW, typename DerivedO> + void fProp(const MatrixBase<DerivedI> &input, const MatrixBase<DerivedW> &output_words, const MatrixBase<DerivedO> &output_const, double &loss) + { + UNCONST(DerivedO, output_const, output); + + double log_likelihood = 0.0; + +#pragma omp parallel for reduction(+:log_likelihood) + for (int train_id = 0; train_id < input.cols(); train_id++) { - UNCONST(DerivedO, output_const, output); - - double log_likelihood = 0.0; - - #pragma omp parallel for reduction(+:log_likelihood) - for (int train_id = 0; train_id < input.cols(); train_id++) - { - double normalization = logsum(input.col(train_id)); - output.col(train_id).array() = input.col(train_id).array() - normalization; - log_likelihood += output(output_words(train_id), train_id); - } - loss = log_likelihood; + double normalization = logsum(input.col(train_id)); + output.col(train_id).array() = input.col(train_id).array() - normalization; + log_likelihood += output(output_words(train_id), train_id); } - - template <typename DerivedW, typename DerivedO, typename DerivedI> - void bProp(const MatrixBase<DerivedW> &output_words, const MatrixBase<DerivedO> &output, const MatrixBase<DerivedI> &grad_input_const) + loss = log_likelihood; + } + + template <typename DerivedW, typename DerivedO, typename DerivedI> + void bProp(const MatrixBase<DerivedW> &output_words, const MatrixBase<DerivedO> &output, const MatrixBase<DerivedI> &grad_input_const) + { + UNCONST(DerivedI, grad_input_const, grad_input); + grad_input.setZero(); +#pragma omp parallel for + for (int train_id = 0; train_id < output.cols(); train_id++) { - UNCONST(DerivedI, grad_input_const, grad_input); - grad_input.setZero(); - #pragma omp parallel for - for (int 
train_id = 0; train_id < output.cols(); train_id++) - { - grad_input(output_words(train_id), train_id) += 1.; - grad_input.col(train_id) -= output.col(train_id).array().exp().matrix(); - } + grad_input(output_words(train_id), train_id) += 1.; + grad_input.col(train_id) -= output.col(train_id).array().exp().matrix(); } + } }; ///// Softmax layer plus NCE loss function. @@ -81,55 +80,55 @@ struct SoftmaxLogLoss template <typename Multinomial> class SoftmaxNCELoss { - const Multinomial &unigram; + const Multinomial &unigram; -public: - SoftmaxNCELoss(const Multinomial &unigram) + public: + SoftmaxNCELoss(const Multinomial &unigram) : unigram(unigram) + { + } + + template <typename DerivedI, typename DerivedW, typename DerivedO> + void fProp(const MatrixBase<DerivedI> &scores, + const MatrixBase<DerivedW> &minibatch_samples, + const MatrixBase<DerivedO> &output_const, double &loss) + { + UNCONST(DerivedO, output_const, output); + double log_likelihood = 0.0; + int num_noise_samples = minibatch_samples.rows()-1; + double log_num_noise_samples = std::log(num_noise_samples); +#pragma omp parallel for reduction(+:log_likelihood) schedule(static) + for (int train_id = 0; train_id < scores.cols(); train_id++) { + for (int sample_id = 0;sample_id < minibatch_samples.rows(); sample_id++) + { + int sample = minibatch_samples(sample_id, train_id); + // To avoid zero or infinite probabilities, + // never take exp of score without normalizing first, + // even if it's a little slower... + double score = scores(sample_id, train_id); + double score_noise = log_num_noise_samples + unigram.logprob(sample); + double z = logadd(score, score_noise); + double logprob = score - z; + double logprob_noise = score_noise - z; + output(sample_id, train_id) = std::exp(logprob); + log_likelihood += sample_id == 0 ? 
logprob : logprob_noise; + } } - - template <typename DerivedI, typename DerivedW, typename DerivedO> - void fProp(const MatrixBase<DerivedI> &scores, - const MatrixBase<DerivedW> &minibatch_samples, - const MatrixBase<DerivedO> &output_const, double &loss) - { - UNCONST(DerivedO, output_const, output); - double log_likelihood = 0.0; - int num_noise_samples = minibatch_samples.rows()-1; - double log_num_noise_samples = std::log(num_noise_samples); - #pragma omp parallel for reduction(+:log_likelihood) schedule(static) - for (int train_id = 0; train_id < scores.cols(); train_id++) - { - for (int sample_id = 0;sample_id < minibatch_samples.rows(); sample_id++) - { - int sample = minibatch_samples(sample_id, train_id); - // To avoid zero or infinite probabilities, - // never take exp of score without normalizing first, - // even if it's a little slower... - double score = scores(sample_id, train_id); - double score_noise = log_num_noise_samples + unigram.logprob(sample); - double z = logadd(score, score_noise); - double logprob = score - z; - double logprob_noise = score_noise - z; - output(sample_id, train_id) = std::exp(logprob); - log_likelihood += sample_id == 0 ? 
logprob : logprob_noise; - } - } - loss = log_likelihood; - } - - template <typename DerivedO, typename DerivedI> - void bProp(const MatrixBase<DerivedO> &probs, const MatrixBase<DerivedI> &output_const) + loss = log_likelihood; + } + + template <typename DerivedO, typename DerivedI> + void bProp(const MatrixBase<DerivedO> &probs, const MatrixBase<DerivedI> &output_const) + { + UNCONST(DerivedI, output_const, output); +#pragma omp parallel for schedule(static) + for (int train_id = 0; train_id < probs.cols(); train_id++) { - UNCONST(DerivedI, output_const, output); - #pragma omp parallel for schedule(static) - for (int train_id = 0; train_id < probs.cols(); train_id++) - { - output.col(train_id) = -probs.col(train_id); - output(0, train_id) += 1.0; - } + output.col(train_id) = -probs.col(train_id); + output(0, train_id) += 1.0; } + } }; } // namespace nplm diff --git a/src/USCMatrix.h b/src/USCMatrix.h index 02aeb33..784fa1b 100644 --- a/src/USCMatrix.h +++ b/src/USCMatrix.h @@ -1,7 +1,6 @@ #ifndef USCMATRIX_H #define USCMATRIX_H -//#include <../3rdparty/Eigen/Dense> #include <Eigen/Dense> #include "maybe_omp.h" #include "util.h" @@ -34,108 +33,108 @@ template <typename Scalar, typename Index=int> // should be EIGEN_DEFAULT_DENSE_ class USCMatrix { -public: - Matrix<Index,Dynamic,Dynamic> indexes; - Matrix<Scalar,Dynamic,Dynamic> values; - int m_rows; + public: + Matrix<Index,Dynamic,Dynamic> indexes; + Matrix<Scalar,Dynamic,Dynamic> values; + int m_rows; - USCMatrix() : m_rows(0) { } + USCMatrix() : m_rows(0) { } - template <typename Indexes, typename Values> - USCMatrix(Index rows, const MatrixBase<Indexes> &indexes, const MatrixBase<Values> &values) - : - indexes(indexes), - values(values), - m_rows(rows) - { } + template <typename Indexes, typename Values> + USCMatrix(Index rows, const MatrixBase<Indexes> &indexes, const MatrixBase<Values> &values) + : + indexes(indexes), + values(values), + m_rows(rows) + { } - USCMatrix(Index rows, Index nnz, Index cols) - : 
- indexes(Matrix<Index,Dynamic,Dynamic>(nnz, cols)), + USCMatrix(Index rows, Index nnz, Index cols) + : + indexes(Matrix<Index,Dynamic,Dynamic>(nnz, cols)), values(Matrix<Scalar,Dynamic,Dynamic>(nnz, cols)), m_rows(rows) - { - this->indexes.fill(-1); - } - - Index rows() const { return m_rows; } - Index cols() const { return indexes.cols(); } - - void resize(Index rows, Index nnz, Index cols) { - indexes.resize(nnz, cols); - values.resize(nnz, cols); - m_rows = rows; - } + { + this->indexes.fill(-1); + } + + Index rows() const { return m_rows; } + Index cols() const { return indexes.cols(); } + + void resize(Index rows, Index nnz, Index cols) { + indexes.resize(nnz, cols); + values.resize(nnz, cols); + m_rows = rows; + } }; // Dense matrix - sparse matrix product // a is presumably very wide template <typename DerivedA, typename ScalarB, typename Index, typename DerivedC> -void uscgemm(double alpha, const MatrixBase<DerivedA> &a, - const USCMatrix<ScalarB,Index> &b, - const MatrixBase<DerivedC> &c_const) +void uscgemm(double alpha, const MatrixBase<DerivedA> &a, + const USCMatrix<ScalarB,Index> &b, + const MatrixBase<DerivedC> &c_const) { - UNCONST(DerivedC, c_const, c); - eigen_assert(a.rows() == c.rows()); - eigen_assert(a.cols() == b.rows()); - eigen_assert(b.cols() == c.cols()); - - #pragma omp parallel for - for (Index k=0; k<b.cols(); k++) - for (Index r=0; r<b.indexes.rows(); r++) - { - Index j = b.indexes(r,k); - eigen_assert(j >= 0); - eigen_assert(j < a.cols()); - c.col(k) += alpha * a.col(j) * b.values(r,k); - } + UNCONST(DerivedC, c_const, c); + eigen_assert(a.rows() == c.rows()); + eigen_assert(a.cols() == b.rows()); + eigen_assert(b.cols() == c.cols()); + +#pragma omp parallel for + for (Index k=0; k<b.cols(); k++) + for (Index r=0; r<b.indexes.rows(); r++) + { + Index j = b.indexes(r,k); + eigen_assert(j >= 0); + eigen_assert(j < a.cols()); + c.col(k) += alpha * a.col(j) * b.values(r,k); + } } // sparse matrix - dense matrix product template 
<typename ScalarA, typename Index, typename DerivedB, typename DerivedC> -void uscgemm(double alpha, - const USCMatrix<ScalarA,Index> &a, - const MatrixBase<DerivedB> &b, - const MatrixBase<DerivedC> &c_const) +void uscgemm(double alpha, + const USCMatrix<ScalarA,Index> &a, + const MatrixBase<DerivedB> &b, + const MatrixBase<DerivedC> &c_const) { - UNCONST(DerivedC, c_const, c); - eigen_assert(a.rows() == c.rows()); - eigen_assert(a.cols() == b.rows()); - eigen_assert(b.cols() == c.cols()); - - // This needs to be tuned for each system, unfortunately, - // and seems to vary a lot. A lot. - int i_blocks = omp_get_num_threads()*16; - - // Assume only one block in k direction. - // We don't need to explicitly block in the j direction. - #pragma omp parallel for - for (Index ib=0; ib<i_blocks; ib++) - for (Index j=0; j<a.cols(); j++) - for (Index r=0; r<a.indexes.rows(); r++) - { - Index i = a.indexes(r,j); - eigen_assert(i >= 0); - eigen_assert(i < c.rows()); - if (i % i_blocks == ib) - c.row(i) += alpha * a.values(r,j) * b.row(j); - } - - /* + UNCONST(DerivedC, c_const, c); + eigen_assert(a.rows() == c.rows()); + eigen_assert(a.cols() == b.rows()); + eigen_assert(b.cols() == c.cols()); + + // This needs to be tuned for each system, unfortunately, + // and seems to vary a lot. A lot. + int i_blocks = omp_get_num_threads()*16; + + // Assume only one block in k direction. + // We don't need to explicitly block in the j direction. 
+#pragma omp parallel for + for (Index ib=0; ib<i_blocks; ib++) + for (Index j=0; j<a.cols(); j++) + for (Index r=0; r<a.indexes.rows(); r++) + { + Index i = a.indexes(r,j); + eigen_assert(i >= 0); + eigen_assert(i < c.rows()); + if (i % i_blocks == ib) + c.row(i) += alpha * a.values(r,j) * b.row(j); + } + + /* If c.cols() is really large, then theoretically it seems like we should do: parallel for blocks in i direction - for blocks in j direction - pack block of a into smaller sparse matrix - for blocks in k direction - for k - for i (sparse) - for j - c(i,k) += a(i,j) * b(j,k) + for blocks in j direction + pack block of a into smaller sparse matrix + for blocks in k direction + for k + for i (sparse) + for j + c(i,k) += a(i,j) * b(j,k) However, the copying of blocks of a doesn't seem practical for any realistic sizes of c.cols(). - */ + */ } // Dense matrix - dense matrix product, but masked by a sparse matrix, @@ -147,45 +146,45 @@ void uscgemm(double alpha, template <typename DerivedA, typename DerivedB, typename ScalarC, typename Index> void uscgemm_masked(double alpha, - const MatrixBase<DerivedA> &a, - const MatrixBase<DerivedB> &b, - USCMatrix<ScalarC,Index> &c) + const MatrixBase<DerivedA> &a, + const MatrixBase<DerivedB> &b, + USCMatrix<ScalarC,Index> &c) { - eigen_assert(a.rows() == c.rows()); - eigen_assert(a.cols() == b.rows()); - eigen_assert(b.cols() == c.cols()); - - #pragma omp parallel for - for (Index k=0; k<b.cols(); k++) - for (Index r=0; r<c.indexes.rows(); r++) - { - Index i = c.indexes(r, k); - eigen_assert(i >= 0); - eigen_assert(i < a.rows()); - c.values(r, k) += alpha * a.row(i) * b.col(k); - } + eigen_assert(a.rows() == c.rows()); + eigen_assert(a.cols() == b.rows()); + eigen_assert(b.cols() == c.cols()); + +#pragma omp parallel for + for (Index k=0; k<b.cols(); k++) + for (Index r=0; r<c.indexes.rows(); r++) + { + Index i = c.indexes(r, k); + eigen_assert(i >= 0); + eigen_assert(i < a.rows()); + c.values(r, k) += alpha * a.row(i) * 
b.col(k); + } } // sparse matrix - dense vector product template <typename ScalarA, typename Index, typename DerivedB, typename DerivedC> -void uscgemv(double alpha, - const USCMatrix<ScalarA,Index> &a, - const MatrixBase<DerivedB> &b, - const MatrixBase<DerivedC> &c_const) +void uscgemv(double alpha, + const USCMatrix<ScalarA,Index> &a, + const MatrixBase<DerivedB> &b, + const MatrixBase<DerivedC> &c_const) { - UNCONST(DerivedC, c_const, c); - eigen_assert(a.rows() == c.rows()); - eigen_assert(a.cols() == b.rows()); - eigen_assert(b.cols() == 1 && c.cols() == 1); - - for (Index j=0; j<a.cols(); j++) - for (Index r=0; r<a.indexes.rows(); r++) - { - Index i = a.indexes(r,j); - eigen_assert(i >= 0); - eigen_assert(i < c.rows()); - c(i) += alpha * a.values(r,j) * b(j); - } + UNCONST(DerivedC, c_const, c); + eigen_assert(a.rows() == c.rows()); + eigen_assert(a.cols() == b.rows()); + eigen_assert(b.cols() == 1 && c.cols() == 1); + + for (Index j=0; j<a.cols(); j++) + for (Index r=0; r<a.indexes.rows(); r++) + { + Index i = a.indexes(r,j); + eigen_assert(i >= 0); + eigen_assert(i < c.rows()); + c(i) += alpha * a.values(r,j) * b(j); + } } } diff --git a/src/graphClasses.h b/src/graphClasses.h index d3c0c4a..cd80a4c 100644 --- a/src/graphClasses.h +++ b/src/graphClasses.h @@ -3,7 +3,6 @@ #include <cstdlib> #include "neuralClasses.h" -//#include <../3rdparty/Eigen/Dense> #include <Eigen/Dense> namespace nplm @@ -11,50 +10,50 @@ namespace nplm template <class X> class Node { - public: - X * param; //what parameter is this - //vector <void *> children; - //vector <void *> parents; - Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> fProp_matrix; - Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> bProp_matrix; - int minibatch_size; - - public: - Node() : param(NULL), minibatch_size(0) { } - - Node(X *input_param, int minibatch_size) - : param(input_param), - minibatch_size(minibatch_size) - { - resize(minibatch_size); - } - - void resize(int minibatch_size) - { - 
this->minibatch_size = minibatch_size; - if (param->n_outputs() != -1) - { - fProp_matrix.setZero(param->n_outputs(), minibatch_size); - } - if (param->n_inputs() != -1) - { - bProp_matrix.setZero(param->n_inputs(), minibatch_size); - } - } - - void resize() { resize(minibatch_size); } - - /* - void Fprop(Matrix<double,Dynamic,Dynamic> & input,int n_cols) - { - param->fProp(input,fProp_matrix,0,0,n_cols); - } - void Fprop(Matrix<double,1,Dynamic> & input,int n_cols) - { - param->fProp(input,fProp_matrix,0,0,n_cols); - } - */ - //for f prop, just call the fProp node of the particular parameter. + public: + X * param; //what parameter is this + //vector <void *> children; + //vector <void *> parents; + Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> fProp_matrix; + Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> bProp_matrix; + int minibatch_size; + + public: + Node() : param(NULL), minibatch_size(0) { } + + Node(X *input_param, int minibatch_size) + : param(input_param), + minibatch_size(minibatch_size) + { + resize(minibatch_size); + } + + void resize(int minibatch_size) + { + this->minibatch_size = minibatch_size; + if (param->n_outputs() != -1) + { + fProp_matrix.setZero(param->n_outputs(), minibatch_size); + } + if (param->n_inputs() != -1) + { + bProp_matrix.setZero(param->n_inputs(), minibatch_size); + } + } + + void resize() { resize(minibatch_size); } + + /* + void Fprop(Matrix<double,Dynamic,Dynamic> & input,int n_cols) + { + param->fProp(input,fProp_matrix,0,0,n_cols); + } + void Fprop(Matrix<double,1,Dynamic> & input,int n_cols) + { + param->fProp(input,fProp_matrix,0,0,n_cols); + } + */ + //for f prop, just call the fProp node of the particular parameter. 
}; diff --git a/src/neuralClasses.h b/src/neuralClasses.h index 26dae06..ee7c3f0 100644 --- a/src/neuralClasses.h +++ b/src/neuralClasses.h @@ -6,8 +6,7 @@ #include <cmath> #include <vector> -#include <boost/unordered_map.hpp> -//#include <../3rdparty/Eigen/Dense> +#include <boost/unordered_map.hpp> #include <Eigen/Dense> #include "maybe_omp.h" @@ -35,7 +34,7 @@ using Eigen::Dynamic; typedef boost::unordered_map<int,bool> int_map; struct Clipper{ - double operator() (double x) const { + double operator() (double x) const { return std::min(0.5, std::max(x,-0.5)); //return(x); } @@ -44,7 +43,7 @@ struct Clipper{ class Linear_layer { - private: + private: Matrix<double,Dynamic,Dynamic> U; Matrix<double,Dynamic,Dynamic> U_gradient; Matrix<double,Dynamic,Dynamic> U_velocity; @@ -60,12 +59,12 @@ class Linear_layer friend class model; public: - Linear_layer() { } + Linear_layer() { } Linear_layer(int rows, int cols) { resize(rows, cols); } - void resize(int rows, int cols) - { - U.setZero(rows, cols); + void resize(int rows, int cols) + { + U.setZero(rows, cols); U_gradient.setZero(rows, cols); //U_running_gradient.setZero(rows, cols); //U_running_parameter_updates.setZero(rows, cols); @@ -74,21 +73,21 @@ class Linear_layer b_gradient.setZero(rows); //b_running_gradient.resize(rows); //b_velocity.resize(rows); - } + } - void read_weights(std::ifstream &U_file) { readMatrix(U_file, U); } - void write_weights(std::ofstream &U_file) { writeMatrix(U, U_file); } + void read_weights(std::ifstream &U_file) { readMatrix(U_file, U); } + void write_weights(std::ofstream &U_file) { writeMatrix(U, U_file); } void read_biases(std::ifstream &b_file) { readMatrix(b_file, b); } void write_biases(std::ofstream &b_file) { writeMatrix(b, b_file); } - template <typename Engine> - void initialize(Engine &engine, + template <typename Engine> + void initialize(Engine &engine, bool init_normal, double init_range, string &parameter_update, double adagrad_epsilon) - { + { if (parameter_update == 
"ADA") { U_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(U.rows(),U.cols())*adagrad_epsilon; b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon; @@ -100,58 +99,58 @@ class Linear_layer b_running_parameter_update.setZero(b.size()); } - initMatrix(engine, U, init_normal, init_range); + initMatrix(engine, U, init_normal, init_range); initBias(engine, b, init_normal, init_range); - } + } - int n_inputs () const { return U.cols(); } - int n_outputs () const { return U.rows(); } + int n_inputs () const { return U.cols(); } + int n_outputs () const { return U.rows(); } template <typename DerivedIn, typename DerivedOut> - void fProp(const MatrixBase<DerivedIn> &input, + void fProp(const MatrixBase<DerivedIn> &input, const MatrixBase<DerivedOut> &output) const { UNCONST(DerivedOut, output, my_output); my_output.leftCols(input.cols()).noalias() = U*input; int num_examples = input.cols(); - for (int example = 0;example < num_examples;example++) + for (int example = 0;example < num_examples;example++) { my_output.leftCols(input.cols()).col(example) += b; } } - // Sparse input + // Sparse input template <typename ScalarIn, typename DerivedOut> - void fProp(const USCMatrix<ScalarIn> &input, + void fProp(const USCMatrix<ScalarIn> &input, const MatrixBase<DerivedOut> &output_const) const { - UNCONST(DerivedOut, output_const, output); - output.setZero(); - uscgemm(1.0, U, input, output.leftCols(input.cols())); - // Each column corresponds to a training example. We + UNCONST(DerivedOut, output_const, output); + output.setZero(); + uscgemm(1.0, U, input, output.leftCols(input.cols())); + // Each column corresponds to a training example. We // parallelize the adding of biases per dimension. 
int num_examples = input.cols(); - for (int example = 0;example < num_examples;example++) + for (int example = 0;example < num_examples;example++) { output.leftCols(input.cols()).col(example) += b; } } template <typename DerivedGOut, typename DerivedGIn> - void bProp(const MatrixBase<DerivedGOut> &input, + void bProp(const MatrixBase<DerivedGOut> &input, MatrixBase<DerivedGIn> &output) const { - UNCONST(DerivedGIn, output, my_output); - my_output.noalias() = U.transpose()*input; - } + UNCONST(DerivedGIn, output, my_output); + my_output.noalias() = U.transpose()*input; + } template <typename DerivedGOut, typename DerivedIn> - void computeGradient( const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &fProp_input, + void computeGradient( const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &fProp_input, double learning_rate, double momentum, double L2_reg) { U_gradient.noalias() = bProp_input*fProp_input.transpose(); - + // get the bias gradient for all dimensions in parallel int size = b.size(); b_gradient = bProp_input.rowwise().sum(); @@ -172,7 +171,7 @@ class Linear_layer { U += learning_rate * U_gradient; b += learning_rate * b_gradient; - /* + /* //UPDATE CLIPPING U += (learning_rate*U_gradient).array().unaryExpr(Clipper()).matrix(); b += (learning_rate*b_gradient).array().unaryExpr(Clipper()).matrix(); @@ -181,17 +180,17 @@ class Linear_layer //b += learning_rate*(b_gradient.array().unaryExpr(Clipper())).matrix(); */ } - } + } template <typename DerivedGOut, typename DerivedIn> - void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &fProp_input, + void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &fProp_input, double learning_rate, double L2_reg) { U_gradient.noalias() = bProp_input*fProp_input.transpose(); - + // get the bias gradient for all dimensions in parallel int size = b.size(); b_gradient.noalias() = 
bProp_input.rowwise().sum(); @@ -206,7 +205,7 @@ class Linear_layer #pragma omp parallel for for (int col=0; col<U.cols(); col++) { U_running_gradient.col(col) += U_gradient.col(col).array().square().matrix(); - U.col(col) += learning_rate * (U_gradient.col(col).array() / + U.col(col) += learning_rate * (U_gradient.col(col).array() / U_running_gradient.col(col).array().sqrt()).matrix(); /* //UPDATE CLIPPING @@ -223,8 +222,8 @@ class Linear_layer } template <typename DerivedGOut, typename DerivedIn> - void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &fProp_input, + void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &fProp_input, double learning_rate, double L2_reg, double conditioning_constant, @@ -234,7 +233,7 @@ class Linear_layer U_gradient.noalias() = bProp_input*fProp_input.transpose(); Array<double,Dynamic,1> b_current_parameter_update; - + // get the bias gradient for all dimensions in parallel int size = b.size(); b_gradient.noalias() = bProp_input.rowwise().sum(); @@ -250,7 +249,7 @@ class Linear_layer //cerr<<"U gradient is "<<U_gradient<<endl; for (int col=0; col<U.cols(); col++) { Array<double,Dynamic,1> U_current_parameter_update; - U_running_gradient.col(col) = decay*U_running_gradient.col(col) + + U_running_gradient.col(col) = decay*U_running_gradient.col(col) + (1-decay)*U_gradient.col(col).array().square().matrix(); //cerr<<"U running gradient is "<<U_running_gradient.col(col)<<endl; //getchar(); @@ -262,22 +261,22 @@ class Linear_layer //update the running parameter update U_running_parameter_update.col(col) = decay*U_running_parameter_update.col(col) + (1.-decay)*U_current_parameter_update.square().matrix(); - U.col(col) += learning_rate*U_current_parameter_update.matrix(); + U.col(col) += learning_rate*U_current_parameter_update.matrix(); } - b_running_gradient = decay*b_running_gradient + + b_running_gradient = decay*b_running_gradient + 
(1.-decay)*b_gradient.array().square().matrix(); b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/ (b_running_gradient.array()+conditioning_constant).sqrt()) * b_gradient.array(); - b_running_parameter_update = decay*(b_running_parameter_update) + + b_running_parameter_update = decay*(b_running_parameter_update) + (1.-decay)*b_current_parameter_update.square().matrix(); b += learning_rate*b_current_parameter_update.matrix(); } template <typename DerivedGOut, typename DerivedIn, typename DerivedGW> - void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &fProp_input, + void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &fProp_input, const MatrixBase<DerivedGW> &gradient) const { UNCONST(DerivedGW, gradient, my_gradient); @@ -355,17 +354,17 @@ class Output_word_embeddings template <typename DerivedIn, typename DerivedOut> void fProp(const MatrixBase<DerivedIn> &input, const MatrixBase<DerivedOut> &output) const - { + { UNCONST(DerivedOut, output, my_output); my_output = ((*W) * input).colwise() + b; - } + } - // Sparse output version + // Sparse output version template <typename DerivedIn, typename DerivedOutI, typename DerivedOutV> void fProp(const MatrixBase<DerivedIn> &input, const MatrixBase<DerivedOutI> &samples, const MatrixBase<DerivedOutV> &output) const - { + { UNCONST(DerivedOutV, output, my_output); #pragma omp parallel for for (int instance_id = 0; instance_id < samples.cols(); instance_id++) @@ -378,13 +377,13 @@ class Output_word_embeddings USCMatrix<double> sparse_output(W->rows(), samples, my_output); uscgemm_masked(1.0, *W, input, sparse_output); my_output = sparse_output.values; // too bad, so much copying - } + } // Return single element of output matrix template <typename DerivedIn> - double fProp(const MatrixBase<DerivedIn> &input, + double fProp(const MatrixBase<DerivedIn> &input, int word, - int instance) 
const + int instance) const { return W->row(word).dot(input.col(instance)) + b(word); } @@ -395,19 +394,19 @@ class Output_word_embeddings void bProp(const MatrixBase<DerivedGOut> &input_bProp_matrix, const MatrixBase<DerivedGIn> &bProp_matrix) const { - // W is vocab_size x output_embedding_dimension - // input_bProp_matrix is vocab_size x minibatch_size - // bProp_matrix is output_embedding_dimension x minibatch_size - UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix); - my_bProp_matrix.leftCols(input_bProp_matrix.cols()).noalias() = + // W is vocab_size x output_embedding_dimension + // input_bProp_matrix is vocab_size x minibatch_size + // bProp_matrix is output_embedding_dimension x minibatch_size + UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix); + my_bProp_matrix.leftCols(input_bProp_matrix.cols()).noalias() = W->transpose() * input_bProp_matrix; - } + } template <typename DerivedIn, typename DerivedGOut> void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings, const MatrixBase<DerivedGOut> &bProp_input, double learning_rate, - double momentum) //not sure if we want to use momentum here + double momentum) //not sure if we want to use momentum here { // W is vocab_size x output_embedding_dimension // b is vocab_size x 1 @@ -418,15 +417,15 @@ class Output_word_embeddings /* //GRADIENT CLIPPING - W->noalias() += learning_rate * + W->noalias() += learning_rate * ((bProp_input * predicted_embeddings.transpose()).array().unaryExpr(Clipper())).matrix(); b += learning_rate * (bProp_input.rowwise().sum().array().unaryExpr(Clipper())).matrix(); //UPDATE CLIPPING - W->noalias() += (learning_rate * + W->noalias() += (learning_rate * (bProp_input * predicted_embeddings.transpose())).array().unaryExpr(Clipper()).matrix(); b += (learning_rate * (bProp_input.rowwise().sum())).array().unaryExpr(Clipper()).matrix(); */ - } + } template <typename DerivedIn, typename DerivedGOut> void computeGradientAdagrad( @@ -451,7 +450,7 @@ class Output_word_embeddings *W 
+= (learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); */ - } + } template <typename DerivedIn, typename DerivedGOut> void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings, @@ -480,14 +479,14 @@ class Output_word_embeddings b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/ (b_running_gradient.array()+conditioning_constant).sqrt())* b_gradient.array(); - W_running_parameter_update = decay*W_running_parameter_update + + W_running_parameter_update = decay*W_running_parameter_update + (1.-decay)*W_current_parameter_update.square().matrix(); b_running_parameter_update = decay*b_running_parameter_update + (1.-decay)*b_current_parameter_update.square().matrix(); *W += learning_rate*W_current_parameter_update.matrix(); b += learning_rate*b_current_parameter_update.matrix(); - } + } // Sparse versions @@ -499,46 +498,46 @@ class Output_word_embeddings UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix); my_bProp_matrix.setZero(); uscgemm(1.0, - W->transpose(), + W->transpose(), USCMatrix<double>(W->rows(), samples, weights), my_bProp_matrix.leftCols(samples.cols())); // narrow bProp_matrix for possible short minibatch } - template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV> + template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV> void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOutI> &samples, - const MatrixBase<DerivedGOutV> &weights, - double learning_rate, double momentum) //not sure if we want to use momentum here - { + const MatrixBase<DerivedGOutI> &samples, + const MatrixBase<DerivedGOutV> &weights, + double learning_rate, double momentum) //not sure if we want to use momentum here + { //cerr<<"in gradient"<<endl; - USCMatrix<double> 
gradient_output(W->rows(), samples, weights); - uscgemm(learning_rate, + USCMatrix<double> gradient_output(W->rows(), samples, weights); + uscgemm(learning_rate, gradient_output, predicted_embeddings.leftCols(gradient_output.cols()).transpose(), *W); // narrow predicted_embeddings for possible short minibatch - uscgemv(learning_rate, + uscgemv(learning_rate, gradient_output, - Matrix<double,Dynamic,1>::Ones(gradient_output.cols()), + Matrix<double,Dynamic,1>::Ones(gradient_output.cols()), b); /* //IN ORDER TO IMPLEMENT CLIPPING, WE HAVE TO COMPUTE THE GRADIENT //FIRST - USCMatrix<double> gradient_output(W->rows(), samples, weights); - uscgemm(1.0, + USCMatrix<double> gradient_output(W->rows(), samples, weights); + uscgemm(1.0, gradient_output, predicted_embeddings.leftCols(samples.cols()).transpose(), W_gradient); - uscgemv(1.0, + uscgemv(1.0, gradient_output, - Matrix<double,Dynamic,1>::Ones(weights.cols()), + Matrix<double,Dynamic,1>::Ones(weights.cols()), b_gradient); int_map update_map; //stores all the parameters that have been updated for (int sample_id=0; sample_id<samples.rows(); sample_id++) - for (int train_id=0; train_id<samples.cols(); train_id++) - update_map[samples(sample_id, train_id)] = 1; + for (int train_id=0; train_id<samples.cols(); train_id++) + update_map[samples(sample_id, train_id)] = 1; - // Convert to std::vector for parallelization + // Convert to std::vector for parallelization std::vector<int> update_items; for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) update_items.push_back(it->first); @@ -560,33 +559,33 @@ class Output_word_embeddings } */ //cerr<<"Finished gradient"<<endl; - } + } - template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV> + template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV> void computeGradientAdagrad(const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOutI> &samples, - const MatrixBase<DerivedGOutV> &weights, - 
double learning_rate) //not sure if we want to use momentum here + const MatrixBase<DerivedGOutI> &samples, + const MatrixBase<DerivedGOutV> &weights, + double learning_rate) //not sure if we want to use momentum here { - //W_gradient.setZero(W->rows(), W->cols()); - //b_gradient.setZero(b.size()); + //W_gradient.setZero(W->rows(), W->cols()); + //b_gradient.setZero(b.size()); //FOR CLIPPING, WE DO NOT MULTIPLY THE GRADIENT WITH THE LEARNING RATE - USCMatrix<double> gradient_output(W->rows(), samples, weights); - uscgemm(1.0, + USCMatrix<double> gradient_output(W->rows(), samples, weights); + uscgemm(1.0, gradient_output, predicted_embeddings.leftCols(samples.cols()).transpose(), W_gradient); - uscgemv(1.0, + uscgemv(1.0, gradient_output, - Matrix<double,Dynamic,1>::Ones(weights.cols()), + Matrix<double,Dynamic,1>::Ones(weights.cols()), b_gradient); int_map update_map; //stores all the parameters that have been updated for (int sample_id=0; sample_id<samples.rows(); sample_id++) - for (int train_id=0; train_id<samples.cols(); train_id++) - update_map[samples(sample_id, train_id)] = 1; + for (int train_id=0; train_id<samples.cols(); train_id++) + update_map[samples(sample_id, train_id)] = 1; - // Convert to std::vector for parallelization + // Convert to std::vector for parallelization std::vector<int> update_items; for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) update_items.push_back(it->first); @@ -611,34 +610,34 @@ class Output_word_embeddings } } - template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV> + template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV> void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOutI> &samples, - const MatrixBase<DerivedGOutV> &weights, - double learning_rate, + const MatrixBase<DerivedGOutI> &samples, + const MatrixBase<DerivedGOutV> &weights, + double learning_rate, double conditioning_constant, double 
decay) //not sure if we want to use momentum here { //cerr<<"decay is "<<decay<<" and constant is "<<conditioning_constant<<endl; - //W_gradient.setZero(W->rows(), W->cols()); - //b_gradient.setZero(b.size()); + //W_gradient.setZero(W->rows(), W->cols()); + //b_gradient.setZero(b.size()); - USCMatrix<double> gradient_output(W->rows(), samples, weights); - uscgemm(1.0, + USCMatrix<double> gradient_output(W->rows(), samples, weights); + uscgemm(1.0, gradient_output, predicted_embeddings.leftCols(samples.cols()).transpose(), W_gradient); - uscgemv(1.0, + uscgemv(1.0, gradient_output, - Matrix<double,Dynamic,1>::Ones(weights.cols()), + Matrix<double,Dynamic,1>::Ones(weights.cols()), b_gradient); int_map update_map; //stores all the parameters that have been updated for (int sample_id=0; sample_id<samples.rows(); sample_id++) - for (int train_id=0; train_id<samples.cols(); train_id++) - update_map[samples(sample_id, train_id)] = 1; + for (int train_id=0; train_id<samples.cols(); train_id++) + update_map[samples(sample_id, train_id)] = 1; - // Convert to std::vector for parallelization + // Convert to std::vector for parallelization std::vector<int> update_items; for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) update_items.push_back(it->first); @@ -685,24 +684,24 @@ class Output_word_embeddings } - template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV, typename DerivedGW, typename DerivedGb> + template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV, typename DerivedGW, typename DerivedGb> void computeGradientCheck(const MatrixBase<DerivedIn> &predicted_embeddings, const MatrixBase<DerivedGOutI> &samples, const MatrixBase<DerivedGOutV> &weights, const MatrixBase<DerivedGW> &gradient_W, const MatrixBase<DerivedGb> &gradient_b) const { - UNCONST(DerivedGW, gradient_W, my_gradient_W); - UNCONST(DerivedGb, gradient_b, my_gradient_b); - my_gradient_W.setZero(); - my_gradient_b.setZero(); - USCMatrix<double> 
gradient_output(W->rows(), samples, weights); - uscgemm(1.0, + UNCONST(DerivedGW, gradient_W, my_gradient_W); + UNCONST(DerivedGb, gradient_b, my_gradient_b); + my_gradient_W.setZero(); + my_gradient_b.setZero(); + USCMatrix<double> gradient_output(W->rows(), samples, weights); + uscgemm(1.0, gradient_output, predicted_embeddings.leftCols(samples.cols()).transpose(), my_gradient_W); - uscgemv(1.0, gradient_output, - Matrix<double,Dynamic,1>::Ones(weights.cols()), my_gradient_b); + uscgemv(1.0, gradient_output, + Matrix<double,Dynamic,1>::Ones(weights.cols()), my_gradient_b); } }; @@ -715,12 +714,12 @@ class Input_word_embeddings Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update; Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient; - friend class model; + friend class model; public: Input_word_embeddings() : context_size(0), vocab_size(0) { } Input_word_embeddings(int rows, int cols, int context) { resize(rows, cols, context); } - + void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) { W = input_W; } @@ -747,7 +746,7 @@ class Input_word_embeddings if (parameter_update == "ADA") { W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon; //W_gradient.setZero(W->rows(),W->cols()); - } + } if (parameter_update == "ADAD") { W_running_gradient.setZero(W->rows(),W->cols()); //W_gradient.setZero(W->rows(),W->cols()); @@ -759,59 +758,59 @@ class Input_word_embeddings init_range); } - int n_inputs() const { return -1; } - int n_outputs() const { return W->cols() * context_size; } + int n_inputs() const { return -1; } + int n_outputs() const { return W->cols() * context_size; } - // set output_id's embedding to the weighted average of all embeddings - template <typename Dist> - void average(const Dist &dist, int output_id) - { - W->row(output_id).setZero(); - for (int i=0; i < W->rows(); i++) - if (i != output_id) - W->row(output_id) += dist.prob(i) * W->row(i); - } + // set output_id's 
embedding to the weighted average of all embeddings + template <typename Dist> + void average(const Dist &dist, int output_id) + { + W->row(output_id).setZero(); + for (int i=0; i < W->rows(); i++) + if (i != output_id) + W->row(output_id) += dist.prob(i) * W->row(i); + } - template <typename DerivedIn, typename DerivedOut> + template <typename DerivedIn, typename DerivedOut> void fProp(const MatrixBase<DerivedIn> &input, - const MatrixBase<DerivedOut> &output) const + const MatrixBase<DerivedOut> &output) const { int embedding_dimension = W->cols(); - // W is vocab_size x embedding_dimension - // input is ngram_size*vocab_size x minibatch_size - // output is ngram_size*embedding_dimension x minibatch_size - - /* - // Dense version: - for (int ngram=0; ngram<context_size; ngram++) - output.middleRows(ngram*embedding_dimension, embedding_dimension) = W.transpose() * input.middleRows(ngram*vocab_size, vocab_size); - */ - - UNCONST(DerivedOut, output, my_output); - my_output.setZero(); - for (int ngram=0; ngram<context_size; ngram++) - { - // input might be narrower than expected due to a short minibatch, - // so narrow output to match - uscgemm(1.0, - W->transpose(), + // W is vocab_size x embedding_dimension + // input is ngram_size*vocab_size x minibatch_size + // output is ngram_size*embedding_dimension x minibatch_size + + /* + // Dense version: + for (int ngram=0; ngram<context_size; ngram++) + output.middleRows(ngram*embedding_dimension, embedding_dimension) = W.transpose() * input.middleRows(ngram*vocab_size, vocab_size); + */ + + UNCONST(DerivedOut, output, my_output); + my_output.setZero(); + for (int ngram=0; ngram<context_size; ngram++) + { + // input might be narrower than expected due to a short minibatch, + // so narrow output to match + uscgemm(1.0, + W->transpose(), USCMatrix<double>(W->rows(),input.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input.cols())), my_output.block(ngram*embedding_dimension, 0, embedding_dimension, input.cols())); - } 
+ } } - // When model is premultiplied, this layer doesn't get used, - // but this method is used to get the input into a sparse matrix. - // Hopefully this can get eliminated someday - template <typename DerivedIn, typename ScalarOut> - void munge(const MatrixBase<DerivedIn> &input, USCMatrix<ScalarOut> &output) const - { - output.resize(vocab_size*context_size, context_size, input.cols()); - for (int i=0; i < context_size; i++) - output.indexes.row(i).array() = input.row(i).array() + i*vocab_size; - output.values.fill(1.0); - } + // When model is premultiplied, this layer doesn't get used, + // but this method is used to get the input into a sparse matrix. + // Hopefully this can get eliminated someday + template <typename DerivedIn, typename ScalarOut> + void munge(const MatrixBase<DerivedIn> &input, USCMatrix<ScalarOut> &output) const + { + output.resize(vocab_size*context_size, context_size, input.cols()); + for (int i=0; i < context_size; i++) + output.indexes.row(i).array() = input.row(i).array() + i*vocab_size; + output.values.fill(1.0); + } template <typename DerivedGOut, typename DerivedIn> void computeGradient(const MatrixBase<DerivedGOut> &bProp_input, @@ -820,45 +819,45 @@ class Input_word_embeddings { int embedding_dimension = W->cols(); - // W is vocab_size x embedding_dimension - // input is ngram_size*vocab_size x minibatch_size - // bProp_input is ngram_size*embedding_dimension x minibatch_size + // W is vocab_size x embedding_dimension + // input is ngram_size*vocab_size x minibatch_size + // bProp_input is ngram_size*embedding_dimension x minibatch_size - /* - // Dense version: - for (int ngram=0; ngram<context_size; ngram++) - W += learning_rate * input_words.middleRows(ngram*vocab_size, vocab_size) * bProp_input.middleRows(ngram*embedding_dimension, embedding_dimension).transpose() - */ + /* + // Dense version: + for (int ngram=0; ngram<context_size; ngram++) + W += learning_rate * input_words.middleRows(ngram*vocab_size, vocab_size) * 
bProp_input.middleRows(ngram*embedding_dimension, embedding_dimension).transpose() + */ - for (int ngram=0; ngram<context_size; ngram++) - { - uscgemm(learning_rate, - USCMatrix<double>(W->rows(), input_words.middleRows(ngram, 1), Matrix<double,1,Dynamic>::Ones(input_words.cols())), - bProp_input.block(ngram*embedding_dimension,0,embedding_dimension,input_words.cols()).transpose(), - *W); - } + for (int ngram=0; ngram<context_size; ngram++) + { + uscgemm(learning_rate, + USCMatrix<double>(W->rows(), input_words.middleRows(ngram, 1), Matrix<double,1,Dynamic>::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension,0,embedding_dimension,input_words.cols()).transpose(), + *W); + } /* //IF WE WANT TO DO GRADIENT CLIPPING, THEN WE FIRST COMPUTE THE GRADIENT AND THEN //PERFORM CLIPPING WHILE UPDATING - for (int ngram=0; ngram<context_size; ngram++) - { - uscgemm(1.0, + for (int ngram=0; ngram<context_size; ngram++) + { + uscgemm(1.0, USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), W_gradient); - } + } int_map update_map; //stores all the parameters that have been updated - for (int ngram=0; ngram<context_size; ngram++) - { + for (int ngram=0; ngram<context_size; ngram++) + { for (int train_id=0; train_id<input_words.cols(); train_id++) { update_map[input_words(ngram,train_id)] = 1; } } - // Convert to std::vector for parallelization + // Convert to std::vector for parallelization std::vector<int> update_items; for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) { @@ -884,33 +883,33 @@ class Input_word_embeddings template <typename DerivedGOut, typename DerivedIn> void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &input_words, - double learning_rate, + const MatrixBase<DerivedIn> &input_words, + double 
learning_rate, double L2_reg) { int embedding_dimension = W->cols(); - //W_gradient.setZero(W->rows(), W->cols()); + //W_gradient.setZero(W->rows(), W->cols()); /* if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols()) W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon; */ - for (int ngram=0; ngram<context_size; ngram++) - { - uscgemm(1.0, - USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), - bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), + for (int ngram=0; ngram<context_size; ngram++) + { + uscgemm(1.0, + USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), W_gradient); - } + } int_map update_map; //stores all the parameters that have been updated - for (int ngram=0; ngram<context_size; ngram++) - { + for (int ngram=0; ngram<context_size; ngram++) + { for (int train_id=0; train_id<input_words.cols(); train_id++) { update_map[input_words(ngram,train_id)] = 1; } } - // Convert to std::vector for parallelization + // Convert to std::vector for parallelization std::vector<int> update_items; for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) { @@ -923,11 +922,11 @@ class Input_word_embeddings { int update_item = update_items[item_id]; W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix(); - W->row(update_item) += learning_rate * + W->row(update_item) += learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix(); /* //UPDATE CLIPPING - W->row(update_item) += (learning_rate * + W->row(update_item) += (learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt())) 
.unaryExpr(Clipper()).matrix(); */ @@ -937,36 +936,36 @@ class Input_word_embeddings template <typename DerivedGOut, typename DerivedIn> void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &input_words, - double learning_rate, + const MatrixBase<DerivedIn> &input_words, + double learning_rate, double L2_reg, double conditioning_constant, double decay) { int embedding_dimension = W->cols(); - //W_gradient.setZero(W->rows(), W->cols()); + //W_gradient.setZero(W->rows(), W->cols()); /* if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols()) W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon; */ - for (int ngram=0; ngram<context_size; ngram++) - { - uscgemm(1.0, - USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), - bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), + for (int ngram=0; ngram<context_size; ngram++) + { + uscgemm(1.0, + USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), W_gradient); - } + } int_map update_map; //stores all the parameters that have been updated - for (int ngram=0; ngram<context_size; ngram++) - { + for (int ngram=0; ngram<context_size; ngram++) + { for (int train_id=0; train_id<input_words.cols(); train_id++) { update_map[input_words(ngram,train_id)] = 1; } } - // Convert to std::vector for parallelization + // Convert to std::vector for parallelization std::vector<int> update_items; for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) { @@ -1006,16 +1005,15 @@ class Input_word_embeddings int x, int minibatch_size, const MatrixBase<DerivedGW> &gradient) const //not sure if we want to use momentum here { - UNCONST(DerivedGW, gradient, 
my_gradient); + UNCONST(DerivedGW, gradient, my_gradient); int embedding_dimension = W->cols(); - my_gradient.setZero(); - for (int ngram=0; ngram<context_size; ngram++) - uscgemm(1.0, - USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), - bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), + my_gradient.setZero(); + for (int ngram=0; ngram<context_size; ngram++) + uscgemm(1.0, + USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), my_gradient); } }; } // namespace nplm - diff --git a/src/neuralLM.h b/src/neuralLM.h index 2004596..c18485f 100644 --- a/src/neuralLM.h +++ b/src/neuralLM.h @@ -6,7 +6,6 @@ #include <cstdlib> #include <boost/shared_ptr.hpp> -//#include <../3rdparty/Eigen/Dense> #include <Eigen/Dense> #include "util.h" @@ -16,109 +15,109 @@ /* To do: - move digit mapping into vocabulary.h - */ +*/ namespace nplm { class neuralLM : public neuralNetwork { - char map_digits; - boost::shared_ptr<vocabulary> vocab; - int start, null; + char map_digits; + boost::shared_ptr<vocabulary> vocab; + int start, null; -public: - neuralLM() + public: + neuralLM() : neuralNetwork(), vocab(new vocabulary()), - map_digits(0) - { - } + map_digits(0) + { + } - void set_map_digits(char value) { map_digits = value; } + void set_map_digits(char value) { map_digits = value; } - void set_vocabulary(const vocabulary &vocab) - { - *(this->vocab) = vocab; - start = vocab.lookup_word("<s>"); - null = vocab.lookup_word("<null>"); - } + void set_vocabulary(const vocabulary &vocab) + { + *(this->vocab) = vocab; + start = vocab.lookup_word("<s>"); + null = vocab.lookup_word("<null>"); + } - const vocabulary &get_vocabulary() const { return *(this->vocab); } + const vocabulary &get_vocabulary() const { return 
*(this->vocab); } - int lookup_word(const std::string &word) const - { - if (map_digits) - for (int i=0; i<word.length(); i++) - if (isdigit(word[i])) - { - std::string mapped_word(word); - for (; i<word.length(); i++) - if (isdigit(word[i])) - mapped_word[i] = map_digits; - return vocab->lookup_word(mapped_word); - } - return vocab->lookup_word(word); - } + int lookup_word(const std::string &word) const + { + if (map_digits) + for (int i=0; i<word.length(); i++) + if (isdigit(word[i])) + { + std::string mapped_word(word); + for (; i<word.length(); i++) + if (isdigit(word[i])) + mapped_word[i] = map_digits; + return vocab->lookup_word(mapped_word); + } + return vocab->lookup_word(word); + } - double lookup_ngram(const int *ngram_a, int n) + double lookup_ngram(const int *ngram_a, int n) + { + Eigen::Matrix<int,Eigen::Dynamic,1> ngram(m->ngram_size); + for (int i=0; i<m->ngram_size; i++) { - Eigen::Matrix<int,Eigen::Dynamic,1> ngram(m->ngram_size); - for (int i=0; i<m->ngram_size; i++) - { - if (i-m->ngram_size+n < 0) - { - if (ngram_a[0] == start) - ngram(i) = start; - else - ngram(i) = null; - } - else - { - ngram(i) = ngram_a[i-m->ngram_size+n]; - } - } - return neuralNetwork::lookup_ngram(ngram); + if (i-m->ngram_size+n < 0) + { + if (ngram_a[0] == start) + ngram(i) = start; + else + ngram(i) = null; + } + else + { + ngram(i) = ngram_a[i-m->ngram_size+n]; + } } + return neuralNetwork::lookup_ngram(ngram); + } - double lookup_ngram(const std::vector<int> &ngram_v) - { - return lookup_ngram(ngram_v.data(), ngram_v.size()); - } + double lookup_ngram(const std::vector<int> &ngram_v) + { + return lookup_ngram(ngram_v.data(), ngram_v.size()); + } - template <typename Derived> - double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram) - { - return neuralNetwork::lookup_ngram(ngram); - } - - template <typename DerivedA, typename DerivedB> - void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const) - { - return 
neuralNetwork::lookup_ngram(ngram, log_probs_const); - } + template <typename Derived> + double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram) + { + return neuralNetwork::lookup_ngram(ngram); + } - void read(const std::string &filename) - { - std::vector<std::string> words; - m->read(filename, words); - set_vocabulary(vocabulary(words)); - resize(); - // this is faster but takes more memory - //m->premultiply(); - } + template <typename DerivedA, typename DerivedB> + void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const) + { + return neuralNetwork::lookup_ngram(ngram, log_probs_const); + } + + void read(const std::string &filename) + { + std::vector<std::string> words; + m->read(filename, words); + set_vocabulary(vocabulary(words)); + resize(); + // this is faster but takes more memory + //m->premultiply(); + } }; template <typename T> void addStartStop(std::vector<T> &input, std::vector<T> &output, int ngram_size, const T &start, const T &stop) { - output.clear(); - output.resize(input.size()+ngram_size); - for (int i=0; i<ngram_size-1; i++) - output[i] = start; - std::copy(input.begin(), input.end(), output.begin()+ngram_size-1); - output[output.size()-1] = stop; + output.clear(); + output.resize(input.size()+ngram_size); + for (int i=0; i<ngram_size-1; i++) + output[i] = start; + std::copy(input.begin(), input.end(), output.begin()+ngram_size-1); + output[output.size()-1] = stop; } template <typename T> @@ -127,21 +126,21 @@ void makeNgrams(const std::vector<T> &input, std::vector<std::vector<T> > &outpu output.clear(); for (int j=ngram_size-1; j<input.size(); j++) { - std::vector<T> ngram(input.begin() + (j-ngram_size+1), input.begin() + j+1); - output.push_back(ngram); + std::vector<T> ngram(input.begin() + (j-ngram_size+1), input.begin() + j+1); + output.push_back(ngram); } } -inline void preprocessWords(const std::vector<std::string> &words, - std::vector< std::vector<int> > &ngrams, - int 
ngram_size, - const vocabulary &vocab, - bool numberize, - bool add_start_stop, - bool ngramize) { +inline void preprocessWords(const std::vector<std::string> &words, + std::vector< std::vector<int> > &ngrams, + int ngram_size, + const vocabulary &vocab, + bool numberize, + bool add_start_stop, + bool ngramize) { int start = vocab.lookup_word("<s>"); int stop = vocab.lookup_word("</s>"); - + // convert words to ints std::vector<int> nums; if (numberize) { @@ -152,9 +151,9 @@ inline void preprocessWords(const std::vector<std::string> &words, else { for (int j=0; j<words.size(); j++) { nums.push_back(boost::lexical_cast<int>(words[j])); - } + } } - + // convert sequence to n-grams ngrams.clear(); if (ngramize) { @@ -168,10 +167,10 @@ inline void preprocessWords(const std::vector<std::string> &words, } else { if (nums.size() != ngram_size) - { - std::cerr << "error: wrong number of fields in line" << std::endl; - std::exit(1); - } + { + std::cerr << "error: wrong number of fields in line" << std::endl; + std::exit(1); + } ngrams.push_back(nums); } } diff --git a/src/neuralNetwork.h b/src/neuralNetwork.h index ef96488..6386a0f 100644 --- a/src/neuralNetwork.h +++ b/src/neuralNetwork.h @@ -3,7 +3,6 @@ #include <vector> #include <boost/shared_ptr.hpp> -//#include <../3rdparty/Eigen/Dense> #include <Eigen/Dense> #include "util.h" @@ -16,191 +15,191 @@ namespace nplm class neuralNetwork { -protected: - boost::shared_ptr<model> m; + protected: + boost::shared_ptr<model> m; -private: - bool normalization; - double weight; + private: + bool normalization; + double weight; - propagator prop; + propagator prop; - std::size_t cache_size; - Eigen::Matrix<int,Dynamic,Dynamic> cache_keys; - std::vector<double> cache_values; - int cache_lookups, cache_hits; + std::size_t cache_size; + Eigen::Matrix<int,Dynamic,Dynamic> cache_keys; + std::vector<double> cache_values; + int cache_lookups, cache_hits; -public: - neuralNetwork() + public: + neuralNetwork() : m(new model()), 
normalization(false), - weight(1.), - prop(*m, 1), + weight(1.), + prop(*m, 1), cache_size(0) - { - } + { + } - void set_normalization(bool value) { normalization = value; } - void set_log_base(double value) { weight = 1./std::log(value); } - - // This must be called if the underlying model is resized. - void resize() { - if (cache_size) - { - cache_keys.resize(m->ngram_size, cache_size); - cache_keys.fill(-1); - } - prop.resize(); - } + void set_normalization(bool value) { normalization = value; } + void set_log_base(double value) { weight = 1./std::log(value); } - void set_width(int width) + // This must be called if the underlying model is resized. + void resize() { + if (cache_size) { - prop.resize(width); + cache_keys.resize(m->ngram_size, cache_size); + cache_keys.fill(-1); } - - template <typename Derived> - double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram) + prop.resize(); + } + + void set_width(int width) + { + prop.resize(width); + } + + template <typename Derived> + double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram) + { + assert (ngram.rows() == m->ngram_size); + assert (ngram.cols() == 1); + + std::size_t hash; + if (cache_size) { - assert (ngram.rows() == m->ngram_size); - assert (ngram.cols() == 1); - - std::size_t hash; - if (cache_size) - { - // First look in cache - hash = Eigen::hash_value(ngram) % cache_size; // defined in util.h - cache_lookups++; - if (cache_keys.col(hash) == ngram) - { - cache_hits++; - return cache_values[hash]; - } - } - - // Make sure that we're single threaded. 
Multithreading doesn't help, - // and in some cases can hurt quite a lot - int save_threads = omp_get_max_threads(); - omp_set_num_threads(1); - int save_eigen_threads = Eigen::nbThreads(); - Eigen::setNbThreads(1); - #ifdef __INTEL_MKL__ - int save_mkl_threads = mkl_get_max_threads(); - mkl_set_num_threads(1); - #endif - - prop.fProp(ngram.col(0)); - - int output = ngram(m->ngram_size-1, 0); - double log_prob; - - start_timer(3); - if (normalization) - { - Eigen::Matrix<double,Eigen::Dynamic,1> scores(m->output_vocab_size); - if (prop.skip_hidden) - prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores); - else - prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); - double logz = logsum(scores.col(0)); - log_prob = weight * (scores(output, 0) - logz); - } - else - { - if (prop.skip_hidden) - log_prob = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, 0); - else - log_prob = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, 0); - } - stop_timer(3); - - if (cache_size) - { - // Update cache - cache_keys.col(hash) = ngram; - cache_values[hash] = log_prob; - } - - #ifdef __INTEL_MKL__ - mkl_set_num_threads(save_mkl_threads); - #endif - Eigen::setNbThreads(save_eigen_threads); - omp_set_num_threads(save_threads); - - return log_prob; + // First look in cache + hash = Eigen::hash_value(ngram) % cache_size; // defined in util.h + cache_lookups++; + if (cache_keys.col(hash) == ngram) + { + cache_hits++; + return cache_values[hash]; + } } - // Look up many n-grams in parallel. 
- template <typename DerivedA, typename DerivedB> - void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const) - { - UNCONST(DerivedB, log_probs_const, log_probs); - assert (ngram.rows() == m->ngram_size); - //assert (ngram.cols() <= prop.get_minibatch_size()); - - prop.fProp(ngram); - - if (normalization) - { - Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> scores(m->output_vocab_size, ngram.cols()); - if (prop.skip_hidden) - prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores); - else - prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); - - // And softmax and loss - Matrix<double,Dynamic,Dynamic> output_probs(m->output_vocab_size, ngram.cols()); - double minibatch_log_likelihood; - SoftmaxLogLoss().fProp(scores.leftCols(ngram.cols()), ngram.row(m->ngram_size-1), output_probs, minibatch_log_likelihood); - for (int j=0; j<ngram.cols(); j++) - { - int output = ngram(m->ngram_size-1, j); - log_probs(0, j) = weight * output_probs(output, j); - } - } - else - { - for (int j=0; j<ngram.cols(); j++) - { - int output = ngram(m->ngram_size-1, j); - if (prop.skip_hidden) - log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, j); - else - log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, j); - } - } - } + // Make sure that we're single threaded. 
Multithreading doesn't help, + // and in some cases can hurt quite a lot + int save_threads = omp_get_max_threads(); + omp_set_num_threads(1); + int save_eigen_threads = Eigen::nbThreads(); + Eigen::setNbThreads(1); +#ifdef __INTEL_MKL__ + int save_mkl_threads = mkl_get_max_threads(); + mkl_set_num_threads(1); +#endif + + prop.fProp(ngram.col(0)); - int get_order() const { return m->ngram_size; } + int output = ngram(m->ngram_size-1, 0); + double log_prob; - void read(const std::string &filename) + start_timer(3); + if (normalization) { - m->read(filename); - resize(); - // this is faster but takes more memory - //m->premultiply(); + Eigen::Matrix<double,Eigen::Dynamic,1> scores(m->output_vocab_size); + if (prop.skip_hidden) + prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores); + else + prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); + double logz = logsum(scores.col(0)); + log_prob = weight * (scores(output, 0) - logz); } - - void set_cache(std::size_t cache_size) + else { - this->cache_size = cache_size; - cache_keys.resize(m->ngram_size, cache_size); - cache_keys.fill(-1); // clears cache - cache_values.resize(cache_size); - cache_lookups = cache_hits = 0; + if (prop.skip_hidden) + log_prob = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, 0); + else + log_prob = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, 0); } + stop_timer(3); - double cache_hit_rate() + if (cache_size) { - return static_cast<double>(cache_hits)/cache_lookups; + // Update cache + cache_keys.col(hash) = ngram; + cache_values[hash] = log_prob; } - void premultiply() +#ifdef __INTEL_MKL__ + mkl_set_num_threads(save_mkl_threads); +#endif + Eigen::setNbThreads(save_eigen_threads); + omp_set_num_threads(save_threads); + + return log_prob; + } + + // Look up many n-grams in parallel. 
+ template <typename DerivedA, typename DerivedB> + void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const) + { + UNCONST(DerivedB, log_probs_const, log_probs); + assert (ngram.rows() == m->ngram_size); + //assert (ngram.cols() <= prop.get_minibatch_size()); + + prop.fProp(ngram); + + if (normalization) + { + Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> scores(m->output_vocab_size, ngram.cols()); + if (prop.skip_hidden) + prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores); + else + prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); + + // And softmax and loss + Matrix<double,Dynamic,Dynamic> output_probs(m->output_vocab_size, ngram.cols()); + double minibatch_log_likelihood; + SoftmaxLogLoss().fProp(scores.leftCols(ngram.cols()), ngram.row(m->ngram_size-1), output_probs, minibatch_log_likelihood); + for (int j=0; j<ngram.cols(); j++) + { + int output = ngram(m->ngram_size-1, j); + log_probs(0, j) = weight * output_probs(output, j); + } + } + else + { + for (int j=0; j<ngram.cols(); j++) + { + int output = ngram(m->ngram_size-1, j); + if (prop.skip_hidden) + log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, j); + else + log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, j); + } + } + } + + int get_order() const { return m->ngram_size; } + + void read(const std::string &filename) + { + m->read(filename); + resize(); + // this is faster but takes more memory + //m->premultiply(); + } + + void set_cache(std::size_t cache_size) + { + this->cache_size = cache_size; + cache_keys.resize(m->ngram_size, cache_size); + cache_keys.fill(-1); // clears cache + cache_values.resize(cache_size); + cache_lookups = cache_hits = 0; + } + + double cache_hit_rate() + { + return 
static_cast<double>(cache_hits)/cache_lookups; + } + + void premultiply() + { + if (!m->premultiplied) { - if (!m->premultiplied) - { - m->premultiply(); - } + m->premultiply(); } + } }; diff --git a/src/prepareNeuralLM.cpp b/src/prepareNeuralLM.cpp index a2cac7a..d5fc16b 100644 --- a/src/prepareNeuralLM.cpp +++ b/src/prepareNeuralLM.cpp @@ -2,19 +2,19 @@ #include <vector> #include <queue> #include <deque> -# include <fstream> -# include <iterator> - -# include <boost/unordered_map.hpp> -# include <boost/algorithm/string/join.hpp> -# include <boost/interprocess/managed_shared_memory.hpp> -# include <boost/interprocess/allocators/allocator.hpp> -# include <boost/interprocess/managed_mapped_file.hpp> +#include <fstream> +#include <iterator> + +#include <boost/unordered_map.hpp> +#include <boost/algorithm/string/join.hpp> +#include <boost/interprocess/managed_shared_memory.hpp> +#include <boost/interprocess/allocators/allocator.hpp> +#include <boost/interprocess/managed_mapped_file.hpp> #include <boost/interprocess/containers/vector.hpp> #include <boost/random/mersenne_twister.hpp> #include <boost/random/uniform_int_distribution.hpp> -# include <tclap/CmdLine.h> +#include <tclap/CmdLine.h> #include "neuralLM.h" #include "util.h" @@ -36,314 +36,313 @@ typedef std::vector<vec,vecAllocator> vecvec; typedef long long int data_size_t; // training data can easily exceed 2G instances template<typename T> -void writeNgrams(const T &data, - int ngram_size, - const vocabulary &vocab, - bool numberize, - bool add_start_stop, - bool ngramize, - const string &filename) - { - ofstream file(filename.c_str()); - if (!file) +void writeNgrams(const T &data, + int ngram_size, + const vocabulary &vocab, + bool numberize, + bool add_start_stop, + bool ngramize, + const string &filename) +{ + ofstream file(filename.c_str()); + if (!file) + { + cerr << "error: could not open " << filename << endl; + exit(1); + } + + vector<vector<int> > ngrams; + + for (int i=0; i<data.size(); i++) { + 
preprocessWords(data[i], ngrams, ngram_size, vocab, numberize, add_start_stop, ngramize); + // write out n-grams + for (int j=0; j<ngrams.size(); j++) { - cerr << "error: could not open " << filename << endl; - exit(1); - } - - vector<vector<int> > ngrams; - - for (int i=0; i<data.size(); i++) { - preprocessWords(data[i], ngrams, ngram_size, vocab, numberize, add_start_stop, ngramize); - // write out n-grams - for (int j=0; j<ngrams.size(); j++) - { - for (int k=0; k<ngram_size; k++) - { - file << ngrams[j][k] << " "; - } - file << endl; - } + for (int k=0; k<ngram_size; k++) + { + file << ngrams[j][k] << " "; + } + file << endl; } - file.close(); + } + file.close(); } // Space efficient version for writing the n-grams. // They are not read into memory. -void writeNgrams(const string &input_filename, - int ngram_size, - const vocabulary &vocab, - bool numberize, - bool add_start_stop, - bool ngramize, - const string &filename, - int train_data_size, - vector<float> &sent_weights, - const string &sent_weights_filename) +void writeNgrams(const string &input_filename, + int ngram_size, + const vocabulary &vocab, + bool numberize, + bool add_start_stop, + bool ngramize, + const string &filename, + int train_data_size, + vector<float> &sent_weights, + const string &sent_weights_filename) { - ofstream file(filename.c_str()); - ofstream output_sent_weights_file(sent_weights_filename.c_str()); - if (!file) - { - cerr << "error: could not open " << filename << endl; - exit(1); + ofstream file(filename.c_str()); + ofstream output_sent_weights_file(sent_weights_filename.c_str()); + if (!file) + { + cerr << "error: could not open " << filename << endl; + exit(1); + } + + ifstream input_file(input_filename.c_str()); + vector<vector<int> > ngrams; + //for (int i=0; i<train_data.size(); i++) { + string line; + int counter = 0; + cerr<<"Processed ... 
"; + while (getline(input_file,line) && train_data_size-- > 0) { + counter++; + if ((counter % 100000) == 0) { + cerr<<counter<<" training lines ... "; } - - ifstream input_file(input_filename.c_str()); - vector<vector<int> > ngrams; - //for (int i=0; i<train_data.size(); i++) { - string line; - int counter = 0; - cerr<<"Processed ... "; - while (getline(input_file,line) && train_data_size-- > 0) { - counter++; - if ((counter % 100000) == 0) { - cerr<<counter<<" training lines ... "; - } - //stringstream lstr(line); - vector<string> lstr_items; - splitBySpace(line,lstr_items); + //stringstream lstr(line); + vector<string> lstr_items; + splitBySpace(line,lstr_items); //for (int i=0; i<data.size(); i++) { - preprocessWords(lstr_items, - ngrams, - ngram_size, - vocab, - numberize, - add_start_stop, - ngramize); - - // write out n-grams - for (int j=0; j<ngrams.size(); j++) - { - if (sent_weights.size() != 0) { - output_sent_weights_file <<sent_weights[counter-1]<<endl; - } - for (int k=0; k<ngram_size; k++) - { - file << ngrams[j][k] << " "; - } - file << endl; - } + preprocessWords(lstr_items, + ngrams, + ngram_size, + vocab, + numberize, + add_start_stop, + ngramize); + + // write out n-grams + for (int j=0; j<ngrams.size(); j++) + { + if (sent_weights.size() != 0) { + output_sent_weights_file <<sent_weights[counter-1]<<endl; + } + for (int k=0; k<ngram_size; k++) + { + file << ngrams[j][k] << " "; + } + file << endl; } - cerr<<endl; - input_file.close(); - file.close(); - output_sent_weights_file.close(); + } + cerr<<endl; + input_file.close(); + file.close(); + output_sent_weights_file.close(); } // Space efficient version for writing the n-grams. // They are not read into memory. 
-void writeMmapNgrams(const string &input_filename, - int ngram_size, - const vocabulary &vocab, - bool numberize, - bool add_start_stop, - bool ngramize, - const string &filename, - unsigned long train_data_size, - data_size_t num_tokens, - bool randomize) +void writeMmapNgrams(const string &input_filename, + int ngram_size, + const vocabulary &vocab, + bool numberize, + bool add_start_stop, + bool ngramize, + const string &filename, + unsigned long train_data_size, + data_size_t num_tokens, + bool randomize) { - cerr<<"Num tokens is "<<num_tokens<<endl; - cerr<<"Training data size is "<<train_data_size<<endl; - // Open the memory mapped file and create the allocators - ip::managed_mapped_file mfile(ip::create_only, - filename.c_str(), - num_tokens*ngram_size*sizeof(int)+1024UL*1024UL); - intAllocator ialloc(mfile.get_segment_manager()); - vecAllocator valloc (mfile.get_segment_manager()); - //vecvec *mMapVecVec= mfile.construct<vecvec>("data")(num_tokens,vec(ialloc),valloc); - - vec *mMapVec= mfile.construct<vec>("vector")(num_tokens*ngram_size,0,ialloc); - - cerr<<"The size of mmaped vec is "<<mMapVec->size()<<endl; - // Going over every line in the input file and - // printing the memory mapped ngrams into the - // output file - ifstream input_file(input_filename.c_str()); - //for (int i=0; i<train_data.size(); i++) { - string line; - int counter = 0; - cerr<<"Processed ... "; - long int train_ngram_counter = 0; - vector<vector<int> > ngrams; - while (getline(input_file,line) && train_data_size-- > 0) { - counter++; - if ((counter % 100000) ==0) { - //cerr<<"counter is "<<counter<<endl; - cerr<<counter<<" training lines ... 
"; - } - //stringstream lstr(line); - vector<string> lstr_items; - splitBySpace(line,lstr_items); + cerr<<"Num tokens is "<<num_tokens<<endl; + cerr<<"Training data size is "<<train_data_size<<endl; + // Open the memory mapped file and create the allocators + ip::managed_mapped_file mfile(ip::create_only, + filename.c_str(), + num_tokens*ngram_size*sizeof(int)+1024UL*1024UL); + intAllocator ialloc(mfile.get_segment_manager()); + vecAllocator valloc (mfile.get_segment_manager()); + //vecvec *mMapVecVec= mfile.construct<vecvec>("data")(num_tokens,vec(ialloc),valloc); + + vec *mMapVec= mfile.construct<vec>("vector")(num_tokens*ngram_size,0,ialloc); + + cerr<<"The size of mmaped vec is "<<mMapVec->size()<<endl; + // Going over every line in the input file and + // printing the memory mapped ngrams into the + // output file + ifstream input_file(input_filename.c_str()); + //for (int i=0; i<train_data.size(); i++) { + string line; + int counter = 0; + cerr<<"Processed ... "; + long int train_ngram_counter = 0; + vector<vector<int> > ngrams; + while (getline(input_file,line) && train_data_size-- > 0) { + counter++; + if ((counter % 100000) ==0) { + //cerr<<"counter is "<<counter<<endl; + cerr<<counter<<" training lines ... 
"; + } + //stringstream lstr(line); + vector<string> lstr_items; + splitBySpace(line,lstr_items); //for (int i=0; i<data.size(); i++) { - preprocessWords(lstr_items, ngrams, - ngram_size, - vocab, - numberize, - add_start_stop, - ngramize); - /* + preprocessWords(lstr_items, ngrams, + ngram_size, + vocab, + numberize, + add_start_stop, + ngramize); + /* cerr<<"line is "<<endl; cerr<<line<<endl; cerr<<"Number of ngrams is "<<ngrams.size()<<endl; - if (ngrams.size() ==1 ){ - cerr<<"The line number was "<<counter<<endl; - cerr<<line<<endl; + if (ngrams.size() ==1 ){ + cerr<<"The line number was "<<counter<<endl; + cerr<<line<<endl; + } + */ + // write out n-grams in mmapped file + for (int j=0; j<ngrams.size(); j++) + { + /* + for (int k=0; k<ngram_size; k++) + { + cerr << ngrams[j][k] << " "; } + cerr<< endl; */ - // write out n-grams in mmapped file - for (int j=0; j<ngrams.size(); j++) - { - /* - for (int k=0; k<ngram_size; k++) - { - cerr << ngrams[j][k] << " "; - } - cerr<< endl; - */ - for (int k=0; k<ngram_size; k++) { - mMapVec->at(train_ngram_counter*ngram_size+k) = ngrams[j][k]; - } - train_ngram_counter++; - //cerr<<"Train ngram counter is "<<train_ngram_counter<<endl; - } + for (int k=0; k<ngram_size; k++) { + mMapVec->at(train_ngram_counter*ngram_size+k) = ngrams[j][k]; + } + train_ngram_counter++; + //cerr<<"Train ngram counter is "<<train_ngram_counter<<endl; } - cerr<<endl; - input_file.close(); - - // Shrink the file if it was overused - ip::managed_mapped_file::shrink_to_fit(filename.c_str()); - //now to randomize the items if the randomize flag was set - if (randomize == true) { - unsigned seed = 1234; //for testing only - boost::random::mt19937 rng(seed); - cerr<<"Randomly shuffling data..."; - data_size_t counter =0; - while (counter < num_tokens) { - data_size_t upper_limit = counter+5000000; - long int vector_size = 5000000; - if (counter + 10000000 >= num_tokens) { - upper_limit = num_tokens; - vector_size = num_tokens - counter; - } - 
vector<int> temp(vector_size*ngram_size,0); - for (int i=0;i<vector_size;i++){ - for (int k=0;k<ngram_size;k++) { - temp[i*ngram_size+k] = mMapVec->at((i+counter)*ngram_size+k); - } - } - for (data_size_t i=vector_size-1; i>0; i--) - { - if (i %500000 == 0) { - cerr<<"Shuffled "<<num_tokens-1<<" instances..."; - } - data_size_t j = boost::random::uniform_int_distribution<data_size_t>(0, i-1)(rng); - for (int k=0;k<ngram_size;k++) { - int temp_val = temp.at(i*ngram_size+k); - temp.at(i*ngram_size+k) = - temp.at(j*ngram_size+k); - temp.at(j*ngram_size+k) = temp_val; - } - } - //Putting it back - for (int i=0;i<vector_size;i++){ - for (int k=0;k<ngram_size;k++) { - mMapVec->at((i+counter)*ngram_size+k) = temp[i*ngram_size+k]; - } - } - counter = upper_limit; + } + cerr<<endl; + input_file.close(); + + // Shrink the file if it was overused + ip::managed_mapped_file::shrink_to_fit(filename.c_str()); + //now to randomize the items if the randomize flag was set + if (randomize == true) { + unsigned seed = 1234; //for testing only + boost::random::mt19937 rng(seed); + cerr<<"Randomly shuffling data..."; + data_size_t counter =0; + while (counter < num_tokens) { + data_size_t upper_limit = counter+5000000; + long int vector_size = 5000000; + if (counter + 10000000 >= num_tokens) { + upper_limit = num_tokens; + vector_size = num_tokens - counter; + } + vector<int> temp(vector_size*ngram_size,0); + for (int i=0;i<vector_size;i++){ + for (int k=0;k<ngram_size;k++) { + temp[i*ngram_size+k] = mMapVec->at((i+counter)*ngram_size+k); } - - /* - for (data_size_t i=num_tokens-1; i>0; i--) + } + for (data_size_t i=vector_size-1; i>0; i--) { if (i %500000 == 0) { cerr<<"Shuffled "<<num_tokens-1<<" instances..."; } data_size_t j = boost::random::uniform_int_distribution<data_size_t>(0, i-1)(rng); for (int k=0;k<ngram_size;k++) { - int temp_val = mMapVec->at(i*ngram_size+k); - mMapVec->at(i*ngram_size+k) = - mMapVec->at(j*ngram_size+k); - mMapVec->at(j*ngram_size+k) = temp_val; + int 
temp_val = temp.at(i*ngram_size+k); + temp.at(i*ngram_size+k) = + temp.at(j*ngram_size+k); + temp.at(j*ngram_size+k) = temp_val; } } - */ - cerr<<endl; + //Putting it back + for (int i=0;i<vector_size;i++){ + for (int k=0;k<ngram_size;k++) { + mMapVec->at((i+counter)*ngram_size+k) = temp[i*ngram_size+k]; + } + } + counter = upper_limit; } + + /* + for (data_size_t i=num_tokens-1; i>0; i--) + { + if (i %500000 == 0) { + cerr<<"Shuffled "<<num_tokens-1<<" instances..."; + } + data_size_t j = boost::random::uniform_int_distribution<data_size_t>(0, i-1)(rng); + for (int k=0;k<ngram_size;k++) { + int temp_val = mMapVec->at(i*ngram_size+k); + mMapVec->at(i*ngram_size+k) = + mMapVec->at(j*ngram_size+k); + mMapVec->at(j*ngram_size+k) = temp_val; + } + } + */ + cerr<<endl; + } } int main(int argc, char *argv[]) { - ios::sync_with_stdio(false); - int ngram_size, vocab_size, validation_size; - bool numberize, - ngramize, - add_start_stop, - mmap_file, - randomize; - - string train_text, - train_file, - validation_text, - validation_file, - words_file, - write_words_file, - sent_weights_text, - output_sent_weights_text; - - try - { - CmdLine cmd("Prepares training data for training a language model.", ' ', "0.1"); - - // The options are printed in reverse order + ios::sync_with_stdio(false); + int ngram_size, vocab_size, validation_size; + bool numberize, + ngramize, + add_start_stop, + mmap_file, + randomize; + + string train_text, + train_file, + validation_text, + validation_file, + words_file, + write_words_file, + sent_weights_text, + output_sent_weights_text; + + try + { + CmdLine cmd("Prepares training data for training a language model.", ' ', "0.1"); + + // The options are printed in reverse order ValueArg<bool> arg_ngramize("", "ngramize", "If true, convert lines to ngrams. Default: true.", false, true, "bool", cmd); ValueArg<bool> arg_numberize("", "numberize", "If true, convert words to numbers. 
Default: true.", false, true, "bool", cmd); ValueArg<bool> arg_add_start_stop("", "add_start_stop", "If true, prepend <s> and append </s>. Default: true.", false, true, "bool", cmd); ValueArg<bool> arg_mmap_file("", "mmap_file", "If true, the training file will be a memory mapped file. \n This is " - "needed if the entire training data cannot fit in memory. Default: false.", false, false, "bool", cmd); + "needed if the entire training data cannot fit in memory. Default: false.", false, false, "bool", cmd); ValueArg<bool> arg_randomize("", "randomize", "If true, Randomly shuffle the training ngrams. It can only be used with mmap_file =1 . Default: false.", false, false, "bool", cmd); ValueArg<int> arg_vocab_size("", "vocab_size", "Vocabulary size.", false, -1, "int", cmd); ValueArg<string> arg_words_file("", "words_file", "File specifying words that should be included in vocabulary; all other words will be replaced by <unk>.", false, "", "string", cmd); ValueArg<int> arg_ngram_size("", "ngram_size", "Size of n-grams.", true, -1, "int", cmd); - ValueArg<string> arg_write_words_file("", "write_words_file", "Output vocabulary.", false, "", "string", cmd); + ValueArg<string> arg_write_words_file("", "write_words_file", "Output vocabulary.", false, "", "string", cmd); ValueArg<int> arg_validation_size("", "validation_size", "How many lines from training data to hold out for validation. Default: 0.", false, 0, "int", cmd); - ValueArg<string> arg_validation_file("", "validation_file", "Output validation data (numberized n-grams).", false, "", "string", cmd); - ValueArg<string> arg_validation_text("", "validation_text", "Input validation data (tokenized). Overrides --validation_size. 
Default: none.", false, "", "string", cmd); - ValueArg<string> arg_train_file("", "train_file", "Output training data (numberized n-grams).", false, "", "string", cmd); - ValueArg<string> arg_train_text("", "train_text", "Input training data (tokenized).", true, "", "string", cmd); - //ValueArg<string> arg_sent_weights_text("", "sent_weights_text", "The sentence weights text", false, "", "string", cmd); - //ValueArg<string> arg_sent_weights_file("", "sent_weights_file", "The file to write the per ngram weights", false, "", "string", cmd); - - - - cmd.parse(argc, argv); - - train_text = arg_train_text.getValue(); - train_file = arg_train_file.getValue(); - validation_text = arg_validation_text.getValue(); - validation_file = arg_validation_file.getValue(); - validation_size = arg_validation_size.getValue(); - write_words_file = arg_write_words_file.getValue(); - ngram_size = arg_ngram_size.getValue(); - vocab_size = arg_vocab_size.getValue(); - words_file = arg_words_file.getValue(); - numberize = arg_numberize.getValue(); - ngramize = arg_ngramize.getValue(); - add_start_stop = arg_add_start_stop.getValue(); - mmap_file = arg_mmap_file.getValue(); - randomize = arg_randomize.getValue(); - //sent_weights_text = arg_sent_weights_text.getValue(); - //output_sent_weights_text = arg_sent_weights_file.getValue(); - sent_weights_text = ""; - output_sent_weights_text = ""; + ValueArg<string> arg_validation_file("", "validation_file", "Output validation data (numberized n-grams).", false, "", "string", cmd); + ValueArg<string> arg_validation_text("", "validation_text", "Input validation data (tokenized). Overrides --validation_size. 
Default: none.", false, "", "string", cmd); + ValueArg<string> arg_train_file("", "train_file", "Output training data (numberized n-grams).", false, "", "string", cmd); + ValueArg<string> arg_train_text("", "train_text", "Input training data (tokenized).", true, "", "string", cmd); + //ValueArg<string> arg_sent_weights_text("", "sent_weights_text", "The sentence weights text", false, "", "string", cmd); + //ValueArg<string> arg_sent_weights_file("", "sent_weights_file", "The file to write the per ngram weights", false, "", "string", cmd); + + + cmd.parse(argc, argv); + + train_text = arg_train_text.getValue(); + train_file = arg_train_file.getValue(); + validation_text = arg_validation_text.getValue(); + validation_file = arg_validation_file.getValue(); + validation_size = arg_validation_size.getValue(); + write_words_file = arg_write_words_file.getValue(); + ngram_size = arg_ngram_size.getValue(); + vocab_size = arg_vocab_size.getValue(); + words_file = arg_words_file.getValue(); + numberize = arg_numberize.getValue(); + ngramize = arg_ngramize.getValue(); + add_start_stop = arg_add_start_stop.getValue(); + mmap_file = arg_mmap_file.getValue(); + randomize = arg_randomize.getValue(); + //sent_weights_text = arg_sent_weights_text.getValue(); + //output_sent_weights_text = arg_sent_weights_file.getValue(); + sent_weights_text = ""; + output_sent_weights_text = ""; // check command line arguments @@ -364,292 +363,292 @@ int main(int argc, char *argv[]) cerr << "Command line: " << endl; cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl; - - const string sep(" Value: "); - cerr << arg_train_text.getDescription() << sep << arg_train_text.getValue() << endl; - cerr << arg_train_file.getDescription() << sep << arg_train_file.getValue() << endl; - cerr << arg_validation_text.getDescription() << sep << arg_validation_text.getValue() << endl; - cerr << arg_validation_file.getDescription() << sep << arg_validation_file.getValue() << endl; - cerr << 
arg_validation_size.getDescription() << sep << arg_validation_size.getValue() << endl; - cerr << arg_write_words_file.getDescription() << sep << arg_write_words_file.getValue() << endl; - cerr << arg_ngram_size.getDescription() << sep << arg_ngram_size.getValue() << endl; - cerr << arg_vocab_size.getDescription() << sep << arg_vocab_size.getValue() << endl; - cerr << arg_words_file.getDescription() << sep << arg_words_file.getValue() << endl; - cerr << arg_numberize.getDescription() << sep << arg_numberize.getValue() << endl; - cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl; - cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl; - cerr << arg_mmap_file.getDescription() << sep << arg_mmap_file.getValue() << endl; - //cerr << arg_sent_weights_text.getDescription() << sep << arg_sent_weights_text.getValue() << endl; - } - catch (TCLAP::ArgException &e) - { - cerr << "error: " << e.error() << " for arg " << e.argId() << endl; - exit(1); - } - // VLF: why is this true? - // DC: it's because the vocabulary has to be constructed from the training data only. - // If the vocabulary is preset, we can't create the validation data. - // - if --numberize 0 is set, then --validation_size cannot be used. - // if (!numberize && (validation_size > 0)) { - // cerr << "Warning: without setting --numberize to 1, --validation_size cannot be used." << endl; - // } - - // Read in training data and validation data - // vector<vector<string> > train_data; - // readSentFile(train_text, train_data); - // @vaswani: No more reading the entire training file into memory - // Reading it per line with file io - - //for (int i=0; i<train_data.size(); i++) { - // Go over every line in the file and - // 1. if the !ngramize then you should check if - // we have the correct number of items per line - // 2. build the vocabulary if the words file has not - // been specified. 
- // Construct vocabulary - vocabulary vocab; - int start, stop; - // Add start stop if the vocabulary has not been supplied - if (words_file == "") { - vocab.insert_word("<s>"); - vocab.insert_word("</s>"); - vocab.insert_word("<null>"); - // warn user that if --numberize is not set, there will be no vocabulary! - if (!numberize) { - cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" << endl; - } - } - if (mmap_file == false && randomize == true) { - cerr<<"Randomize option can only be used with mmap_file = 1"<<endl; - exit(1); + const string sep(" Value: "); + cerr << arg_train_text.getDescription() << sep << arg_train_text.getValue() << endl; + cerr << arg_train_file.getDescription() << sep << arg_train_file.getValue() << endl; + cerr << arg_validation_text.getDescription() << sep << arg_validation_text.getValue() << endl; + cerr << arg_validation_file.getDescription() << sep << arg_validation_file.getValue() << endl; + cerr << arg_validation_size.getDescription() << sep << arg_validation_size.getValue() << endl; + cerr << arg_write_words_file.getDescription() << sep << arg_write_words_file.getValue() << endl; + cerr << arg_ngram_size.getDescription() << sep << arg_ngram_size.getValue() << endl; + cerr << arg_vocab_size.getDescription() << sep << arg_vocab_size.getValue() << endl; + cerr << arg_words_file.getDescription() << sep << arg_words_file.getValue() << endl; + cerr << arg_numberize.getDescription() << sep << arg_numberize.getValue() << endl; + cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl; + cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl; + cerr << arg_mmap_file.getDescription() << sep << arg_mmap_file.getValue() << endl; + //cerr << arg_sent_weights_text.getDescription() << sep << arg_sent_weights_text.getValue() << endl; + } + catch (TCLAP::ArgException &e) + { + cerr << "error: " << e.error() << " for arg " << e.argId() << endl; + 
exit(1); + } + + // VLF: why is this true? + // DC: it's because the vocabulary has to be constructed from the training data only. + // If the vocabulary is preset, we can't create the validation data. + // - if --numberize 0 is set, then --validation_size cannot be used. + // if (!numberize && (validation_size > 0)) { + // cerr << "Warning: without setting --numberize to 1, --validation_size cannot be used." << endl; + // } + + // Read in training data and validation data + // vector<vector<string> > train_data; + // readSentFile(train_text, train_data); + // @vaswani: No more reading the entire training file into memory + // Reading it per line with file io + + //for (int i=0; i<train_data.size(); i++) { + // Go over every line in the file and + // 1. if the !ngramize then you should check if + // we have the correct number of items per line + // 2. build the vocabulary if the words file has not + // been specified. + // Construct vocabulary + vocabulary vocab; + int start, stop; + // Add start stop if the vocabulary has not been supplied + if (words_file == "") { + vocab.insert_word("<s>"); + vocab.insert_word("</s>"); + vocab.insert_word("<null>"); + // warn user that if --numberize is not set, there will be no vocabulary! + if (!numberize) { + cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" << endl; } - unordered_map<string,int> count; // For keeping word counts if no supplied vocab - - deque<vector<string> > validation_data; - int train_data_size=0; - cerr<<"Processed ... 
"; - data_size_t num_tokens=0; - - ifstream training(train_text.c_str()); - - string line; - while (getline(training,line)) { - train_data_size++; - //stringstream lstr(line); - vector<string> lstr_items; - splitBySpace(line,lstr_items); - // if data is already ngramized, set/check ngram_size - if (!ngramize) { - if (ngram_size > 0) { - if (ngram_size != lstr_items.size()) { - cerr << "Error: size of training ngrams does not match specified value of --ngram_size!" << endl; - } - } - // else if --ngram_size has not been specified, set it now - else { - ngram_size=lstr_items.size(); - } + } + if (mmap_file == false && randomize == true) { + cerr<<"Randomize option can only be used with mmap_file = 1"<<endl; + exit(1); + } + unordered_map<string,int> count; // For keeping word counts if no supplied vocab + + deque<vector<string> > validation_data; + int train_data_size=0; + cerr<<"Processed ... "; + data_size_t num_tokens=0; + + ifstream training(train_text.c_str()); + + string line; + while (getline(training,line)) { + train_data_size++; + //stringstream lstr(line); + vector<string> lstr_items; + splitBySpace(line,lstr_items); + // if data is already ngramized, set/check ngram_size + if (!ngramize) { + if (ngram_size > 0) { + if (ngram_size != lstr_items.size()) { + cerr << "Error: size of training ngrams does not match specified value of --ngram_size!" << endl; + } } - if ((train_data_size%100000)==0){ - cerr<<train_data_size<<" lines ... "; + // else if --ngram_size has not been specified, set it now + else { + ngram_size=lstr_items.size(); } - //break; - /* + } + if ((train_data_size%100000)==0){ + cerr<<train_data_size<<" lines ... 
"; + } + //break; + /* if (lstr_items.size() ==1) { - cerr<<"line :"<<endl; - cerr<<line<<endl; - cerr<<"The number of items was 1"<<endl; - getchar(); - } - */ - num_tokens += lstr_items.size()+1; - if (words_file == "") { - for (int j=0; j<lstr_items.size(); j++) { - count[lstr_items[j]] += 1; - } + cerr<<"line :"<<endl; + cerr<<line<<endl; + cerr<<"The number of items was 1"<<endl; + getchar(); } - // Add to validation set if the validation size - // has not been specified - if (validation_text == "" && validation_size > 0) { - //cerr<<"validation size is "<<validation_data.size()<<endl; - if (validation_data.size() == validation_size) { - //validation_data.erase(validation_data.begin()); - validation_data.pop_front(); - } - validation_data.push_back(lstr_items); + */ + num_tokens += lstr_items.size()+1; + if (words_file == "") { + for (int j=0; j<lstr_items.size(); j++) { + count[lstr_items[j]] += 1; } } - cerr<<endl; - training.close(); - //cerr<<"validation size is "<<validation_data.size()<<endl; - //getchar(); - if (validation_data.size() < validation_size) { - cerr<<"validation size is "<<validation_data.size()<<endl; - cerr << "error: requested validation size is greater than training data size" << endl; - exit(1); + // Add to validation set if the validation size + // has not been specified + if (validation_text == "" && validation_size > 0) { + //cerr<<"validation size is "<<validation_data.size()<<endl; + if (validation_data.size() == validation_size) { + //validation_data.erase(validation_data.begin()); + validation_data.pop_front(); + } + validation_data.push_back(lstr_items); } - - train_data_size -= validation_size; - cerr<<"Training data size is "<<train_data_size<<endl; - - // The items in the validation data have already been counted - // Decrementing the counts of those words before building the vocabulary - for(int i=0; i<validation_data.size(); i++){ - num_tokens -= (validation_data[i].size() +1); - for (int j=0; 
j<validation_data[i].size();j++){ - count[validation_data[i][j]] -= 1; - if (count[validation_data[i][j]] == 0) { - count.erase(validation_data[i][j]); - } + } + cerr<<endl; + training.close(); + //cerr<<"validation size is "<<validation_data.size()<<endl; + //getchar(); + if (validation_data.size() < validation_size) { + cerr<<"validation size is "<<validation_data.size()<<endl; + cerr << "error: requested validation size is greater than training data size" << endl; + exit(1); + } + + train_data_size -= validation_size; + cerr<<"Training data size is "<<train_data_size<<endl; + + // The items in the validation data have already been counted + // Decrementing the counts of those words before building the vocabulary + for(int i=0; i<validation_data.size(); i++){ + num_tokens -= (validation_data[i].size() +1); + for (int j=0; j<validation_data[i].size();j++){ + count[validation_data[i][j]] -= 1; + if (count[validation_data[i][j]] == 0) { + count.erase(validation_data[i][j]); } } + } - // Getting the top n frequent words for the vocabulary - if (words_file == "") { - vocab.insert_most_frequent(count, vocab_size); - if (vocab.size() < vocab_size) { - cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl; - } + // Getting the top n frequent words for the vocabulary + if (words_file == "") { + vocab.insert_most_frequent(count, vocab_size); + if (vocab.size() < vocab_size) { + cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl; } - //vector<vector<string> > validation_data; - if (validation_text != "") { - readSentFile(validation_text, validation_data); - for (int i=0; i<validation_data.size(); i++) { - // if data is already ngramized, set/check ngram_size - if (!ngramize) { - // if --ngram_size has been specified, check that it does not conflict with --ngram_size - if (ngram_size > 0) { - if (ngram_size != validation_data[i].size()) { - 
cerr << "Error: size of validation ngrams does not match specified value of --ngram_size!" << endl; - } - } - // else if --ngram_size has not been specified, set it now - else { - ngram_size=validation_data[i].size(); - } - } + } + //vector<vector<string> > validation_data; + if (validation_text != "") { + readSentFile(validation_text, validation_data); + for (int i=0; i<validation_data.size(); i++) { + // if data is already ngramized, set/check ngram_size + if (!ngramize) { + // if --ngram_size has been specified, check that it does not conflict with --ngram_size + if (ngram_size > 0) { + if (ngram_size != validation_data[i].size()) { + cerr << "Error: size of validation ngrams does not match specified value of --ngram_size!" << endl; + } } + // else if --ngram_size has not been specified, set it now + else { + ngram_size=validation_data[i].size(); + } + } } - //READING SENTENCE WEIGHTS IF THERE ARE ANY - vector<float> sent_weights; - if (sent_weights_text != "") { - cerr<<"Reading sentence weights from "<<sent_weights_text<<endl; - ifstream sent_weights_file(sent_weights_text.c_str()); - string line; - readWeightsFile(sent_weights_file,sent_weights); - sent_weights_file.close(); - if (sent_weights_text.size() != train_data_size) { - cerr<<"The number of sentence weights does not match the number of training sentences"<<endl; - } + } + //READING SENTENCE WEIGHTS IF THERE ARE ANY + vector<float> sent_weights; + if (sent_weights_text != "") { + cerr<<"Reading sentence weights from "<<sent_weights_text<<endl; + ifstream sent_weights_file(sent_weights_text.c_str()); + string line; + readWeightsFile(sent_weights_file,sent_weights); + sent_weights_file.close(); + if (sent_weights_text.size() != train_data_size) { + cerr<<"The number of sentence weights does not match the number of training sentences"<<endl; } - - /* + } + + /* else if (validation_size > 0) { - // Create validation data - if (validation_size > train_data.size()) - { - cerr << "error: requested validation 
size is greater than training data size" << endl; - exit(1); - } - validation_data.insert(validation_data.end(), train_data.end()-validation_size, train_data.end()); - train_data.resize(train_data.size() - validation_size); + // Create validation data + if (validation_size > train_data.size()) + { + cerr << "error: requested validation size is greater than training data size" << endl; + exit(1); } - */ - - // Construct vocabulary - //vocabulary vocab; - //int start, stop; - - // read vocabulary from file - if (words_file != "") { - vector<string> words; - readWordsFile(words_file,words); - for(vector<string>::iterator it = words.begin(); it != words.end(); ++it) { - vocab.insert_word(*it); - } - - // was vocab_size set? if so, verify that it does not conflict with size of vocabulary read from file - if (vocab_size > 0) { - if (vocab.size() != vocab_size) { - cerr << "Error: size of vocabulary file " << vocab.size() << " != --vocab_size " << vocab_size << endl; - } - } - // else, set it to the size of vocabulary read from file - else { - vocab_size = vocab.size(); - } - + validation_data.insert(validation_data.end(), train_data.end()-validation_size, train_data.end()); + train_data.resize(train_data.size() - validation_size); } - /* - // construct vocabulary to contain top <vocab_size> most frequent words; all other words replaced by <unk> - else { - vocab.insert_word("<s>"); - vocab.insert_word("</s>"); - vocab.insert_word("<null>"); - - // warn user that if --numberize is not set, there will be no vocabulary! - if (!numberize) { - cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" 
<< endl; - } - unordered_map<string,int> count; - for (int i=0; i<train_data.size(); i++) { - for (int j=0; j<train_data[i].size(); j++) { - count[train_data[i][j]] += 1; - } - } - - vocab.insert_most_frequent(count, vocab_size); - if (vocab.size() < vocab_size) { - cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl; - } + */ + + // Construct vocabulary + //vocabulary vocab; + //int start, stop; + + // read vocabulary from file + if (words_file != "") { + vector<string> words; + readWordsFile(words_file,words); + for(vector<string>::iterator it = words.begin(); it != words.end(); ++it) { + vocab.insert_word(*it); } - */ - // write vocabulary to file - if (write_words_file != "") { - cerr << "Writing vocabulary to " << write_words_file << endl; - writeWordsFile(vocab.words(), write_words_file); + // was vocab_size set? if so, verify that it does not conflict with size of vocabulary read from file + if (vocab_size > 0) { + if (vocab.size() != vocab_size) { + cerr << "Error: size of vocabulary file " << vocab.size() << " != --vocab_size " << vocab_size << endl; + } } - - // Write out numberized n-grams - if (train_file != "") - { - cerr << "Writing training data to " << train_file << endl; - if (mmap_file == true) { - writeMmapNgrams(train_text, - ngram_size, - vocab, - numberize, - add_start_stop, - ngramize, - train_file, - train_data_size, - num_tokens, - randomize); - } else { - writeNgrams(train_text, - ngram_size, - vocab, - numberize, - add_start_stop, - ngramize, - train_file, - train_data_size, - sent_weights, - output_sent_weights_text); - } + // else, set it to the size of vocabulary read from file + else { + vocab_size = vocab.size(); } - if (validation_file != "") - { - cerr << "Writing validation data to " << validation_file << endl; - writeNgrams(validation_data, - ngram_size, - vocab, - numberize, - add_start_stop, - ngramize, - validation_file); + + } + /* + // construct vocabulary 
to contain top <vocab_size> most frequent words; all other words replaced by <unk> + else { + vocab.insert_word("<s>"); + vocab.insert_word("</s>"); + vocab.insert_word("<null>"); + + // warn user that if --numberize is not set, there will be no vocabulary! + if (!numberize) { + cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" << endl; + } + unordered_map<string,int> count; + for (int i=0; i<train_data.size(); i++) { + for (int j=0; j<train_data[i].size(); j++) { + count[train_data[i][j]] += 1; + } + } + + vocab.insert_most_frequent(count, vocab_size); + if (vocab.size() < vocab_size) { + cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl; + } + } + */ + + // write vocabulary to file + if (write_words_file != "") { + cerr << "Writing vocabulary to " << write_words_file << endl; + writeWordsFile(vocab.words(), write_words_file); + } + + // Write out numberized n-grams + if (train_file != "") + { + cerr << "Writing training data to " << train_file << endl; + if (mmap_file == true) { + writeMmapNgrams(train_text, + ngram_size, + vocab, + numberize, + add_start_stop, + ngramize, + train_file, + train_data_size, + num_tokens, + randomize); + } else { + writeNgrams(train_text, + ngram_size, + vocab, + numberize, + add_start_stop, + ngramize, + train_file, + train_data_size, + sent_weights, + output_sent_weights_text); } + } + if (validation_file != "") + { + cerr << "Writing validation data to " << validation_file << endl; + writeNgrams(validation_data, + ngram_size, + vocab, + numberize, + add_start_stop, + ngramize, + validation_file); + } } diff --git a/src/propagator.h b/src/propagator.h index 9f214de..6344f2f 100644 --- a/src/propagator.h +++ b/src/propagator.h @@ -13,360 +13,359 @@ using Eigen::MatrixBase; using Eigen::Dynamic; class propagator { - int minibatch_size; - model *pnn; - -public: - Node<Input_word_embeddings> input_layer_node; - 
Node<Linear_layer> first_hidden_linear_node; - Node<Activation_function> first_hidden_activation_node; - Node<Linear_layer> second_hidden_linear_node; - Node<Activation_function> second_hidden_activation_node; - Node<Output_word_embeddings> output_layer_node; - bool skip_hidden; - -public: - propagator () : minibatch_size(0), pnn(0) { } - - propagator (model &nn, int minibatch_size) + int minibatch_size; + model *pnn; + + public: + Node<Input_word_embeddings> input_layer_node; + Node<Linear_layer> first_hidden_linear_node; + Node<Activation_function> first_hidden_activation_node; + Node<Linear_layer> second_hidden_linear_node; + Node<Activation_function> second_hidden_activation_node; + Node<Output_word_embeddings> output_layer_node; + bool skip_hidden; + + public: + propagator () : minibatch_size(0), pnn(0) { } + + propagator (model &nn, int minibatch_size) : - pnn(&nn), - input_layer_node(&nn.input_layer, minibatch_size), - first_hidden_linear_node(&nn.first_hidden_linear, minibatch_size), - first_hidden_activation_node(&nn.first_hidden_activation, minibatch_size), - second_hidden_linear_node(&nn.second_hidden_linear, minibatch_size), - second_hidden_activation_node(&nn.second_hidden_activation, minibatch_size), - output_layer_node(&nn.output_layer, minibatch_size), - minibatch_size(minibatch_size) - { - skip_hidden = (nn.num_hidden == 0); - } + pnn(&nn), + input_layer_node(&nn.input_layer, minibatch_size), + first_hidden_linear_node(&nn.first_hidden_linear, minibatch_size), + first_hidden_activation_node(&nn.first_hidden_activation, minibatch_size), + second_hidden_linear_node(&nn.second_hidden_linear, minibatch_size), + second_hidden_activation_node(&nn.second_hidden_activation, minibatch_size), + output_layer_node(&nn.output_layer, minibatch_size), + minibatch_size(minibatch_size) + { + skip_hidden = (nn.num_hidden == 0); + } - // This must be called if the underlying model is resized. 
- void resize(int minibatch_size) { - this->minibatch_size = minibatch_size; - input_layer_node.resize(minibatch_size); - first_hidden_linear_node.resize(minibatch_size); - first_hidden_activation_node.resize(minibatch_size); - second_hidden_linear_node.resize(minibatch_size); - second_hidden_activation_node.resize(minibatch_size); - output_layer_node.resize(minibatch_size); - } + // This must be called if the underlying model is resized. + void resize(int minibatch_size) { + this->minibatch_size = minibatch_size; + input_layer_node.resize(minibatch_size); + first_hidden_linear_node.resize(minibatch_size); + first_hidden_activation_node.resize(minibatch_size); + second_hidden_linear_node.resize(minibatch_size); + second_hidden_activation_node.resize(minibatch_size); + output_layer_node.resize(minibatch_size); + } - void resize() { resize(minibatch_size); } + void resize() { resize(minibatch_size); } - template <typename Derived> - void fProp(const MatrixBase<Derived> &data) + template <typename Derived> + void fProp(const MatrixBase<Derived> &data) + { + if (!pnn->premultiplied) { - if (!pnn->premultiplied) - { - start_timer(0); - input_layer_node.param->fProp(data, input_layer_node.fProp_matrix); - stop_timer(0); - - start_timer(1); - first_hidden_linear_node.param->fProp(input_layer_node.fProp_matrix, - first_hidden_linear_node.fProp_matrix); - } - else - { - int n_inputs = first_hidden_linear_node.param->n_inputs(); - USCMatrix<double> sparse_data; - input_layer_node.param->munge(data, sparse_data); - - start_timer(1); - first_hidden_linear_node.param->fProp(sparse_data, - first_hidden_linear_node.fProp_matrix); - } - first_hidden_activation_node.param->fProp(first_hidden_linear_node.fProp_matrix, - first_hidden_activation_node.fProp_matrix); - //std::cerr<<"in fprop first hidden activation node fprop is "<<first_hidden_activation_node.fProp_matrix<<std::endl; - //std::getchar(); - stop_timer(1); - - - if (!skip_hidden) { - start_timer(2); - 
second_hidden_linear_node.param->fProp(first_hidden_activation_node.fProp_matrix, - second_hidden_linear_node.fProp_matrix); - second_hidden_activation_node.param->fProp(second_hidden_linear_node.fProp_matrix, - second_hidden_activation_node.fProp_matrix); - stop_timer(2); - } - - // The propagation stops here because the last layer is very expensive. - } + start_timer(0); + input_layer_node.param->fProp(data, input_layer_node.fProp_matrix); + stop_timer(0); - // Dense version (for standard log-likelihood) - template <typename DerivedIn, typename DerivedOut> - void bProp(const MatrixBase<DerivedIn> &data, - const MatrixBase<DerivedOut> &output, - double learning_rate, - double momentum, - double L2_reg, - std::string ¶meter_update, - double conditioning_constant, - double decay) + start_timer(1); + first_hidden_linear_node.param->fProp(input_layer_node.fProp_matrix, + first_hidden_linear_node.fProp_matrix); + } + else { - // Output embedding layer - - start_timer(7); - output_layer_node.param->bProp(output, - output_layer_node.bProp_matrix); - stop_timer(7); - - start_timer(8); - Node<Activation_function> & final_hidden_activation_node = skip_hidden ? 
first_hidden_activation_node : second_hidden_activation_node; - if (parameter_update == "SGD") { - output_layer_node.param->computeGradient(final_hidden_activation_node.fProp_matrix, - output, - learning_rate, - momentum); - } else if (parameter_update == "ADA") { - output_layer_node.param->computeGradientAdagrad(final_hidden_activation_node.fProp_matrix, - output, - learning_rate); - } else if (parameter_update == "ADAD") { - //std::cerr<<"Adadelta gradient"<<endl; - int current_minibatch_size = final_hidden_activation_node.fProp_matrix.cols(); - output_layer_node.param->computeGradientAdadelta(final_hidden_activation_node.fProp_matrix, - output, - 1.0/current_minibatch_size, - conditioning_constant, - decay); - } else { - std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl; - } - stop_timer(8); - - bPropRest(data, - learning_rate, - momentum, - L2_reg, - parameter_update, - conditioning_constant, - decay); + int n_inputs = first_hidden_linear_node.param->n_inputs(); + USCMatrix<double> sparse_data; + input_layer_node.param->munge(data, sparse_data); + + start_timer(1); + first_hidden_linear_node.param->fProp(sparse_data, + first_hidden_linear_node.fProp_matrix); + } + first_hidden_activation_node.param->fProp(first_hidden_linear_node.fProp_matrix, + first_hidden_activation_node.fProp_matrix); + //std::cerr<<"in fprop first hidden activation node fprop is "<<first_hidden_activation_node.fProp_matrix<<std::endl; + //std::getchar(); + stop_timer(1); + + + if (!skip_hidden) { + start_timer(2); + second_hidden_linear_node.param->fProp(first_hidden_activation_node.fProp_matrix, + second_hidden_linear_node.fProp_matrix); + second_hidden_activation_node.param->fProp(second_hidden_linear_node.fProp_matrix, + second_hidden_activation_node.fProp_matrix); + stop_timer(2); } - // Sparse version (for NCE log-likelihood) - template <typename DerivedIn, typename DerivedOutI, typename DerivedOutV> - void bProp(const MatrixBase<DerivedIn> &data, - 
const MatrixBase<DerivedOutI> &samples, - const MatrixBase<DerivedOutV> &weights, - double learning_rate, - double momentum, - double L2_reg, - std::string ¶meter_update, - double conditioning_constant, - double decay) - { + // The propagation stops here because the last layer is very expensive. + } + + // Dense version (for standard log-likelihood) + template <typename DerivedIn, typename DerivedOut> + void bProp(const MatrixBase<DerivedIn> &data, + const MatrixBase<DerivedOut> &output, + double learning_rate, + double momentum, + double L2_reg, + std::string ¶meter_update, + double conditioning_constant, + double decay) + { + // Output embedding layer + + start_timer(7); + output_layer_node.param->bProp(output, + output_layer_node.bProp_matrix); + stop_timer(7); + + start_timer(8); + Node<Activation_function> & final_hidden_activation_node = skip_hidden ? first_hidden_activation_node : second_hidden_activation_node; + if (parameter_update == "SGD") { + output_layer_node.param->computeGradient(final_hidden_activation_node.fProp_matrix, + output, + learning_rate, + momentum); + } else if (parameter_update == "ADA") { + output_layer_node.param->computeGradientAdagrad(final_hidden_activation_node.fProp_matrix, + output, + learning_rate); + } else if (parameter_update == "ADAD") { + //std::cerr<<"Adadelta gradient"<<endl; + int current_minibatch_size = final_hidden_activation_node.fProp_matrix.cols(); + output_layer_node.param->computeGradientAdadelta(final_hidden_activation_node.fProp_matrix, + output, + 1.0/current_minibatch_size, + conditioning_constant, + decay); + } else { + std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl; + } + stop_timer(8); - // Output embedding layer - - start_timer(7); - output_layer_node.param->bProp(samples, - weights, - output_layer_node.bProp_matrix); - stop_timer(7); - - - start_timer(8); - Node<Activation_function> & final_hidden_activation_node = skip_hidden ? 
first_hidden_activation_node : second_hidden_activation_node; - if (parameter_update == "SGD") { - output_layer_node.param->computeGradient(final_hidden_activation_node.fProp_matrix, - samples, - weights, - learning_rate, - momentum); - } else if (parameter_update == "ADA") { - output_layer_node.param->computeGradientAdagrad(final_hidden_activation_node.fProp_matrix, - samples, - weights, - learning_rate); - } else if (parameter_update == "ADAD") { - int current_minibatch_size = final_hidden_activation_node.fProp_matrix.cols(); - //std::cerr<<"Adadelta gradient"<<endl; - output_layer_node.param->computeGradientAdadelta(final_hidden_activation_node.fProp_matrix, - samples, - weights, - 1.0/current_minibatch_size, - conditioning_constant, - decay); - } else { - std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl; + bPropRest(data, + learning_rate, + momentum, + L2_reg, + parameter_update, + conditioning_constant, + decay); } - stop_timer(8); + // Sparse version (for NCE log-likelihood) + template <typename DerivedIn, typename DerivedOutI, typename DerivedOutV> + void bProp(const MatrixBase<DerivedIn> &data, + const MatrixBase<DerivedOutI> &samples, + const MatrixBase<DerivedOutV> &weights, + double learning_rate, + double momentum, + double L2_reg, + std::string ¶meter_update, + double conditioning_constant, + double decay) + { - bPropRest(data, - learning_rate, - momentum, - L2_reg, - parameter_update, - conditioning_constant, - decay); + // Output embedding layer + + start_timer(7); + output_layer_node.param->bProp(samples, + weights, + output_layer_node.bProp_matrix); + stop_timer(7); + + + start_timer(8); + Node<Activation_function> & final_hidden_activation_node = skip_hidden ? 
first_hidden_activation_node : second_hidden_activation_node; + if (parameter_update == "SGD") { + output_layer_node.param->computeGradient(final_hidden_activation_node.fProp_matrix, + samples, + weights, + learning_rate, + momentum); + } else if (parameter_update == "ADA") { + output_layer_node.param->computeGradientAdagrad(final_hidden_activation_node.fProp_matrix, + samples, + weights, + learning_rate); + } else if (parameter_update == "ADAD") { + int current_minibatch_size = final_hidden_activation_node.fProp_matrix.cols(); + //std::cerr<<"Adadelta gradient"<<endl; + output_layer_node.param->computeGradientAdadelta(final_hidden_activation_node.fProp_matrix, + samples, + weights, + 1.0/current_minibatch_size, + conditioning_constant, + decay); + } else { + std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl; } -private: - template <typename DerivedIn> - void bPropRest(const MatrixBase<DerivedIn> &data, - double learning_rate, double momentum, double L2_reg, - std::string ¶meter_update, - double conditioning_constant, - double decay) - { - // Second hidden layer + stop_timer(8); + bPropRest(data, + learning_rate, + momentum, + L2_reg, + parameter_update, + conditioning_constant, + decay); + } - - // All the compute gradient functions are together and the backprop - // functions are together - ////////BACKPROP//////////// - start_timer(9); - if (skip_hidden) + private: + template <typename DerivedIn> + void bPropRest(const MatrixBase<DerivedIn> &data, + double learning_rate, double momentum, double L2_reg, + std::string ¶meter_update, + double conditioning_constant, + double decay) { - start_timer(9); - first_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix, + // Second hidden layer + + + + // All the compute gradient functions are together and the backprop + // functions are together + ////////BACKPROP//////////// + start_timer(9); + if (skip_hidden) + { + start_timer(9); + 
first_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix, first_hidden_activation_node.bProp_matrix, first_hidden_linear_node.fProp_matrix, first_hidden_activation_node.fProp_matrix); - first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix, - first_hidden_linear_node.bProp_matrix); - stop_timer(9); + first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix, + first_hidden_linear_node.bProp_matrix); + stop_timer(9); - } - else - { - second_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix, - second_hidden_activation_node.bProp_matrix, - second_hidden_linear_node.fProp_matrix, - second_hidden_activation_node.fProp_matrix); + } + else + { + second_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix, + second_hidden_activation_node.bProp_matrix, + second_hidden_linear_node.fProp_matrix, + second_hidden_activation_node.fProp_matrix); - second_hidden_linear_node.param->bProp(second_hidden_activation_node.bProp_matrix, - second_hidden_linear_node.bProp_matrix); - stop_timer(9); + second_hidden_linear_node.param->bProp(second_hidden_activation_node.bProp_matrix, + second_hidden_linear_node.bProp_matrix); + stop_timer(9); - start_timer(11); - first_hidden_activation_node.param->bProp(second_hidden_linear_node.bProp_matrix, - first_hidden_activation_node.bProp_matrix, - first_hidden_linear_node.fProp_matrix, - first_hidden_activation_node.fProp_matrix); + start_timer(11); + first_hidden_activation_node.param->bProp(second_hidden_linear_node.bProp_matrix, + first_hidden_activation_node.bProp_matrix, + first_hidden_linear_node.fProp_matrix, + first_hidden_activation_node.fProp_matrix); - first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix, - first_hidden_linear_node.bProp_matrix); - stop_timer(11); - } - //std::cerr<<"First hidden layer node backprop matrix is"<<first_hidden_linear_node.bProp_matrix<<std::endl; - //std::getchar(); - ////COMPUTE 
GRADIENT///////// - if (parameter_update == "SGD") { - if (!skip_hidden) - { - start_timer(10); - second_hidden_linear_node.param->computeGradient(second_hidden_activation_node.bProp_matrix, - first_hidden_activation_node.fProp_matrix, - learning_rate, - momentum, - L2_reg); - stop_timer(10); + first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix, + first_hidden_linear_node.bProp_matrix); + stop_timer(11); } - - // First hidden layer - - - start_timer(12); - first_hidden_linear_node.param->computeGradient(first_hidden_activation_node.bProp_matrix, - input_layer_node.fProp_matrix, - learning_rate, momentum, L2_reg); - stop_timer(12); - - // Input word embeddings - - start_timer(13); - input_layer_node.param->computeGradient(first_hidden_linear_node.bProp_matrix, - data, - learning_rate, momentum, L2_reg); - stop_timer(13); - } else if (parameter_update == "ADA") { - if (!skip_hidden) - { - start_timer(10); - second_hidden_linear_node.param->computeGradientAdagrad(second_hidden_activation_node.bProp_matrix, - first_hidden_activation_node.fProp_matrix, - learning_rate, - L2_reg); - stop_timer(10); + //std::cerr<<"First hidden layer node backprop matrix is"<<first_hidden_linear_node.bProp_matrix<<std::endl; + //std::getchar(); + ////COMPUTE GRADIENT///////// + if (parameter_update == "SGD") { + if (!skip_hidden) + { + start_timer(10); + second_hidden_linear_node.param->computeGradient(second_hidden_activation_node.bProp_matrix, + first_hidden_activation_node.fProp_matrix, + learning_rate, + momentum, + L2_reg); + stop_timer(10); + } + + // First hidden layer + + + start_timer(12); + first_hidden_linear_node.param->computeGradient(first_hidden_activation_node.bProp_matrix, + input_layer_node.fProp_matrix, + learning_rate, momentum, L2_reg); + stop_timer(12); + + // Input word embeddings + + start_timer(13); + input_layer_node.param->computeGradient(first_hidden_linear_node.bProp_matrix, + data, + learning_rate, momentum, L2_reg); + 
stop_timer(13); + } else if (parameter_update == "ADA") { + if (!skip_hidden) + { + start_timer(10); + second_hidden_linear_node.param->computeGradientAdagrad(second_hidden_activation_node.bProp_matrix, + first_hidden_activation_node.fProp_matrix, + learning_rate, + L2_reg); + stop_timer(10); + } + + // First hidden layer + + + start_timer(12); + first_hidden_linear_node.param->computeGradientAdagrad(first_hidden_activation_node.bProp_matrix, + input_layer_node.fProp_matrix, + learning_rate, + L2_reg); + stop_timer(12); + + // Input word embeddings + + start_timer(13); + input_layer_node.param->computeGradientAdagrad(first_hidden_linear_node.bProp_matrix, + data, + learning_rate, + L2_reg); + stop_timer(13); + } else if (parameter_update == "ADAD") { + int current_minibatch_size = first_hidden_activation_node.fProp_matrix.cols(); + //std::cerr<<"Adadelta gradient"<<endl; + if (!skip_hidden) + { + start_timer(10); + second_hidden_linear_node.param->computeGradientAdadelta(second_hidden_activation_node.bProp_matrix, + first_hidden_activation_node.fProp_matrix, + 1.0/current_minibatch_size, + L2_reg, + conditioning_constant, + decay); + stop_timer(10); + } + //std::cerr<<"Finished gradient for second hidden linear layer"<<std::endl; + + // First hidden layer + + + start_timer(12); + first_hidden_linear_node.param->computeGradientAdadelta(first_hidden_activation_node.bProp_matrix, + input_layer_node.fProp_matrix, + 1.0/current_minibatch_size, + L2_reg, + conditioning_constant, + decay); + stop_timer(12); + + //std::cerr<<"Finished gradient for first hidden linear layer"<<std::endl; + // Input word embeddings + + start_timer(13); + input_layer_node.param->computeGradientAdadelta(first_hidden_linear_node.bProp_matrix, + data, + 1.0/current_minibatch_size, + L2_reg, + conditioning_constant, + decay); + stop_timer(13); + + //std::cerr<<"Finished gradient for first input layer"<<std::endl; + } else { + std::cerr<<"Parameter update :"<<parameter_update<<" is 
unrecognized"<<std::endl; } - // First hidden layer - - - start_timer(12); - first_hidden_linear_node.param->computeGradientAdagrad(first_hidden_activation_node.bProp_matrix, - input_layer_node.fProp_matrix, - learning_rate, - L2_reg); - stop_timer(12); - - // Input word embeddings - - start_timer(13); - input_layer_node.param->computeGradientAdagrad(first_hidden_linear_node.bProp_matrix, - data, - learning_rate, - L2_reg); - stop_timer(13); - } else if (parameter_update == "ADAD") { - int current_minibatch_size = first_hidden_activation_node.fProp_matrix.cols(); - //std::cerr<<"Adadelta gradient"<<endl; - if (!skip_hidden) - { - start_timer(10); - second_hidden_linear_node.param->computeGradientAdadelta(second_hidden_activation_node.bProp_matrix, - first_hidden_activation_node.fProp_matrix, - 1.0/current_minibatch_size, - L2_reg, - conditioning_constant, - decay); - stop_timer(10); - } - //std::cerr<<"Finished gradient for second hidden linear layer"<<std::endl; - - // First hidden layer - - - start_timer(12); - first_hidden_linear_node.param->computeGradientAdadelta(first_hidden_activation_node.bProp_matrix, - input_layer_node.fProp_matrix, - 1.0/current_minibatch_size, - L2_reg, - conditioning_constant, - decay); - stop_timer(12); - - //std::cerr<<"Finished gradient for first hidden linear layer"<<std::endl; - // Input word embeddings - - start_timer(13); - input_layer_node.param->computeGradientAdadelta(first_hidden_linear_node.bProp_matrix, - data, - 1.0/current_minibatch_size, - L2_reg, - conditioning_constant, - decay); - stop_timer(13); - - //std::cerr<<"Finished gradient for first input layer"<<std::endl; - } else { - std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl; } - - } }; } // namespace nplm #endif - diff --git a/src/testNeuralLM.cpp b/src/testNeuralLM.cpp index 4f3713d..abaab34 100644 --- a/src/testNeuralLM.cpp +++ b/src/testNeuralLM.cpp @@ -6,7 +6,6 @@ #include <tclap/CmdLine.h> #include <Eigen/Core> -//#include 
<../3rdparty/Eigen/Dense> #include <Eigen/Dense> #include "param.h" @@ -21,174 +20,174 @@ using namespace Eigen; using namespace nplm; void score(neuralLM &lm, int minibatch_size, vector<int>& start, vector< vector<int> > &ngrams, - vector<double> &out) { - if (ngrams.size() == 0) return; - int ngram_size = ngrams[0].size(); - - if (minibatch_size == 0) + vector<double> &out) { + if (ngrams.size() == 0) return; + int ngram_size = ngrams[0].size(); + + if (minibatch_size == 0) + { + // Score one n-gram at a time. This is how the LM would be queried from a decoder. + for (int sent_id=0; sent_id<start.size()-1; sent_id++) { - // Score one n-gram at a time. This is how the LM would be queried from a decoder. - for (int sent_id=0; sent_id<start.size()-1; sent_id++) - { - double sent_log_prob = 0.0; - for (int j=start[sent_id]; j<start[sent_id+1]; j++) - sent_log_prob += lm.lookup_ngram(ngrams[j]); - out.push_back(sent_log_prob); - } + double sent_log_prob = 0.0; + for (int j=start[sent_id]; j<start[sent_id+1]; j++) + sent_log_prob += lm.lookup_ngram(ngrams[j]); + out.push_back(sent_log_prob); } - else + } + else + { + // Score a whole minibatch at a time. + Matrix<double,1,Dynamic> log_probs(ngrams.size()); + + Matrix<int,Dynamic,Dynamic> minibatch(ngram_size, minibatch_size); + minibatch.setZero(); + for (int test_id = 0; test_id < ngrams.size(); test_id += minibatch_size) { - // Score a whole minibatch at a time. - Matrix<double,1,Dynamic> log_probs(ngrams.size()); - - Matrix<int,Dynamic,Dynamic> minibatch(ngram_size, minibatch_size); - minibatch.setZero(); - for (int test_id = 0; test_id < ngrams.size(); test_id += minibatch_size) - { - int current_minibatch_size = minibatch_size<ngrams.size()-test_id ? 
minibatch_size : ngrams.size()-test_id; - for (int j=0; j<current_minibatch_size; j++) - minibatch.col(j) = Map< Matrix<int,Dynamic,1> > (ngrams[test_id+j].data(), ngram_size); - lm.lookup_ngram(minibatch.leftCols(current_minibatch_size), log_probs.middleCols(test_id, current_minibatch_size)); - } - - for (int sent_id=0; sent_id<start.size()-1; sent_id++) - { - double sent_log_prob = 0.0; - for (int j=start[sent_id]; j<start[sent_id+1]; j++) - sent_log_prob += log_probs[j]; - out.push_back(sent_log_prob); - } + int current_minibatch_size = minibatch_size<ngrams.size()-test_id ? minibatch_size : ngrams.size()-test_id; + for (int j=0; j<current_minibatch_size; j++) + minibatch.col(j) = Map< Matrix<int,Dynamic,1> > (ngrams[test_id+j].data(), ngram_size); + lm.lookup_ngram(minibatch.leftCols(current_minibatch_size), log_probs.middleCols(test_id, current_minibatch_size)); } + + for (int sent_id=0; sent_id<start.size()-1; sent_id++) + { + double sent_log_prob = 0.0; + for (int j=start[sent_id]; j<start[sent_id+1]; j++) + sent_log_prob += log_probs[j]; + out.push_back(sent_log_prob); + } + } } -int main (int argc, char *argv[]) +int main (int argc, char *argv[]) { - param myParam; - bool normalization; - bool numberize, ngramize, add_start_stop; + param myParam; + bool normalization; + bool numberize, ngramize, add_start_stop; - try { - // program options // - CmdLine cmd("Tests a two-layer neural probabilistic language model.", ' ' , "0.1"); + try { + // program options // + CmdLine cmd("Tests a two-layer neural probabilistic language model.", ' ' , "0.1"); - ValueArg<int> num_threads("", "num_threads", "Number of threads. Default: maximum.", false, 0, "int", cmd); - ValueArg<int> minibatch_size("", "minibatch_size", "Minibatch size. Default: none.", false, 0, "int", cmd); + ValueArg<int> num_threads("", "num_threads", "Number of threads. Default: maximum.", false, 0, "int", cmd); + ValueArg<int> minibatch_size("", "minibatch_size", "Minibatch size. 
Default: none.", false, 0, "int", cmd); - ValueArg<bool> arg_ngramize("", "ngramize", "If true, convert lines to ngrams. Default: true.", false, true, "bool", cmd); - ValueArg<bool> arg_numberize("", "numberize", "If true, convert words to numbers. Default: true.", false, true, "bool", cmd); - ValueArg<bool> arg_add_start_stop("", "add_start_stop", "If true, prepend <s> and append </s>. Default: true.", false, true, "bool", cmd); + ValueArg<bool> arg_ngramize("", "ngramize", "If true, convert lines to ngrams. Default: true.", false, true, "bool", cmd); + ValueArg<bool> arg_numberize("", "numberize", "If true, convert words to numbers. Default: true.", false, true, "bool", cmd); + ValueArg<bool> arg_add_start_stop("", "add_start_stop", "If true, prepend <s> and append </s>. Default: true.", false, true, "bool", cmd); - ValueArg<bool> arg_normalization("", "normalization", "Normalize probabilities. 1 = yes, 0 = no. Default: 0.", false, 0, "bool", cmd); + ValueArg<bool> arg_normalization("", "normalization", "Normalize probabilities. 1 = yes, 0 = no. 
Default: 0.", false, 0, "bool", cmd); - ValueArg<string> arg_test_file("", "test_file", "Test file (one tokenized sentence per line).", true, "", "string", cmd); + ValueArg<string> arg_test_file("", "test_file", "Test file (one tokenized sentence per line).", true, "", "string", cmd); - ValueArg<string> arg_model_file("", "model_file", "Language model file.", true, "", "string", cmd); + ValueArg<string> arg_model_file("", "model_file", "Language model file.", true, "", "string", cmd); - cmd.parse(argc, argv); + cmd.parse(argc, argv); - myParam.model_file = arg_model_file.getValue(); - myParam.test_file = arg_test_file.getValue(); + myParam.model_file = arg_model_file.getValue(); + myParam.test_file = arg_test_file.getValue(); - normalization = arg_normalization.getValue(); - numberize = arg_numberize.getValue(); - ngramize = arg_ngramize.getValue(); - add_start_stop = arg_add_start_stop.getValue(); + normalization = arg_normalization.getValue(); + numberize = arg_numberize.getValue(); + ngramize = arg_ngramize.getValue(); + add_start_stop = arg_add_start_stop.getValue(); - myParam.minibatch_size = minibatch_size.getValue(); - myParam.num_threads = num_threads.getValue(); + myParam.minibatch_size = minibatch_size.getValue(); + myParam.num_threads = num_threads.getValue(); - cerr << "Command line: " << endl; - cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl; - - const string sep(" Value: "); - cerr << arg_test_file.getDescription() << sep << arg_test_file.getValue() << endl; - cerr << arg_model_file.getDescription() << sep << arg_model_file.getValue() << endl; + cerr << "Command line: " << endl; + cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl; - cerr << arg_normalization.getDescription() << sep << arg_normalization.getValue() << endl; - cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl; - cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << 
endl; + const string sep(" Value: "); + cerr << arg_test_file.getDescription() << sep << arg_test_file.getValue() << endl; + cerr << arg_model_file.getDescription() << sep << arg_model_file.getValue() << endl; - cerr << minibatch_size.getDescription() << sep << minibatch_size.getValue() << endl; - cerr << num_threads.getDescription() << sep << num_threads.getValue() << endl; - } - catch (TCLAP::ArgException &e) - { - cerr << "error: " << e.error() << " for arg " << e.argId() << endl; - exit(1); - } + cerr << arg_normalization.getDescription() << sep << arg_normalization.getValue() << endl; + cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl; + cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl; - myParam.num_threads = setup_threads(myParam.num_threads); + cerr << minibatch_size.getDescription() << sep << minibatch_size.getValue() << endl; + cerr << num_threads.getDescription() << sep << num_threads.getValue() << endl; + } + catch (TCLAP::ArgException &e) + { + cerr << "error: " << e.error() << " for arg " << e.argId() << endl; + exit(1); + } - ///// Create language model + myParam.num_threads = setup_threads(myParam.num_threads); - neuralLM lm; - lm.read(myParam.model_file); - lm.set_normalization(normalization); - lm.set_log_base(10); - lm.set_cache(1048576); - int ngram_size = lm.get_order(); - int minibatch_size = myParam.minibatch_size; - if (minibatch_size) - lm.set_width(minibatch_size); + ///// Create language model - ///// Read test data - - ifstream test_file(myParam.test_file.c_str()); - if (!test_file) - { - cerr << "error: could not open " << myParam.test_file << endl; - exit(1); - } - string line; + neuralLM lm; + lm.read(myParam.model_file); + lm.set_normalization(normalization); + lm.set_log_base(10); + lm.set_cache(1048576); + int ngram_size = lm.get_order(); + int minibatch_size = myParam.minibatch_size; + if (minibatch_size) + lm.set_width(minibatch_size); - vector<int> start; 
- vector<vector<int> > ngrams; + ///// Read test data - while (getline(test_file, line)) - { - vector<string> words; - splitBySpace(line, words); + ifstream test_file(myParam.test_file.c_str()); + if (!test_file) + { + cerr << "error: could not open " << myParam.test_file << endl; + exit(1); + } + string line; - vector<vector<int> > sent_ngrams; - preprocessWords(words, sent_ngrams, ngram_size, lm.get_vocabulary(), numberize, add_start_stop, ngramize); + vector<int> start; + vector<vector<int> > ngrams; - start.push_back(ngrams.size()); - copy(sent_ngrams.begin(), sent_ngrams.end(), back_inserter(ngrams)); - } - start.push_back(ngrams.size()); + while (getline(test_file, line)) + { + vector<string> words; + splitBySpace(line, words); - int num_threads = 1; - vector< vector<double> > sent_log_probs(num_threads); + vector<vector<int> > sent_ngrams; + preprocessWords(words, sent_ngrams, ngram_size, lm.get_vocabulary(), numberize, add_start_stop, ngramize); - /* - // Test thread safety - boost::thread_group tg; - for (int t=0; t < num_threads; t++) { - tg.create_thread(boost::bind(score, lm, minibatch_size, boost::ref(start), boost::ref(ngrams), boost::ref(sent_log_probs[t]))); // copy lm - } - tg.join_all(); - */ - score(lm, minibatch_size, start, ngrams, sent_log_probs[0]); - - vector<double> log_likelihood(num_threads); - std::fill(log_likelihood.begin(), log_likelihood.end(), 0.0); - for (int i=0; i<sent_log_probs[0].size(); i++) { - for (int t=0; t<num_threads; t++) - cout << sent_log_probs[t][i] << "\t"; - cout << endl; - for (int t=0; t<num_threads; t++) - log_likelihood[t] += sent_log_probs[t][i]; - } - - cerr << "Test log10-likelihood: "; + start.push_back(ngrams.size()); + copy(sent_ngrams.begin(), sent_ngrams.end(), back_inserter(ngrams)); + } + start.push_back(ngrams.size()); + + int num_threads = 1; + vector< vector<double> > sent_log_probs(num_threads); + + /* + // Test thread safety + boost::thread_group tg; + for (int t=0; t < num_threads; t++) { + 
tg.create_thread(boost::bind(score, lm, minibatch_size, boost::ref(start), boost::ref(ngrams), boost::ref(sent_log_probs[t]))); // copy lm + } + tg.join_all(); + */ + score(lm, minibatch_size, start, ngrams, sent_log_probs[0]); + + vector<double> log_likelihood(num_threads); + std::fill(log_likelihood.begin(), log_likelihood.end(), 0.0); + for (int i=0; i<sent_log_probs[0].size(); i++) { for (int t=0; t<num_threads; t++) - cerr << log_likelihood[t] << " "; - cerr << endl; - #ifdef USE_CHRONO - cerr << "Propagation times:"; - for (int i=0; i<timer.size(); i++) - cerr << " " << timer.get(i); - cerr << endl; - #endif - + cout << sent_log_probs[t][i] << "\t"; + cout << endl; + for (int t=0; t<num_threads; t++) + log_likelihood[t] += sent_log_probs[t][i]; + } + + cerr << "Test log10-likelihood: "; + for (int t=0; t<num_threads; t++) + cerr << log_likelihood[t] << " "; + cerr << endl; +#ifdef USE_CHRONO + cerr << "Propagation times:"; + for (int i=0; i<timer.size(); i++) + cerr << " " << timer.get(i); + cerr << endl; +#endif + } diff --git a/src/trainNeuralNetwork.cpp b/src/trainNeuralNetwork.cpp index 63ee27d..d4720ef 100644 --- a/src/trainNeuralNetwork.cpp +++ b/src/trainNeuralNetwork.cpp @@ -6,17 +6,16 @@ #include <vector> #include <algorithm> -#include <boost/unordered_map.hpp> +#include <boost/unordered_map.hpp> #include <boost/functional.hpp> #include <boost/lexical_cast.hpp> #include <boost/random/mersenne_twister.hpp> #include <boost/algorithm/string/join.hpp> -# include <boost/interprocess/managed_shared_memory.hpp> -# include <boost/interprocess/allocators/allocator.hpp> -# include <boost/interprocess/managed_mapped_file.hpp> +#include <boost/interprocess/managed_shared_memory.hpp> +#include <boost/interprocess/allocators/allocator.hpp> +#include <boost/interprocess/managed_mapped_file.hpp> #include <boost/interprocess/containers/vector.hpp> -//#include <../3rdparty/Eigen/Dense> #include <Eigen/Dense> #include <Eigen/Sparse> #include "maybe_omp.h" @@ -29,7 
+28,6 @@ #include "graphClasses.h" #include "util.h" #include "multinomial.h" -//#include "gradientCheck.h" //#define EIGEN_DONT_PARALLELIZE @@ -65,7 +63,7 @@ void compute_validation_perplexity(int ngram_size, int output_vocab_size, int va int validation_minibatch_start_index = validation_minibatch_size * validation_batch; int current_minibatch_size = min(validation_minibatch_size, validation_data_size - validation_minibatch_start_index); - minibatch.leftCols(current_minibatch_size) = validation_data.middleCols(validation_minibatch_start_index, + minibatch.leftCols(current_minibatch_size) = validation_data.middleCols(validation_minibatch_start_index, current_minibatch_size); prop_validation.fProp(minibatch.topRows(ngram_size-1)); @@ -80,7 +78,7 @@ void compute_validation_perplexity(int ngram_size, int output_vocab_size, int va // And softmax and loss. Be careful of short minibatch double minibatch_log_likelihood; start_timer(5); - SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size), + SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size), minibatch.row(ngram_size-1), output_probs, minibatch_log_likelihood); @@ -93,7 +91,7 @@ void compute_validation_perplexity(int ngram_size, int output_vocab_size, int va // If the validation perplexity decreases, halve the learning rate. 
if (current_validation_ll != 0.0 && log_likelihood < current_validation_ll && myParam.parameter_update != "ADA") - { + { current_learning_rate /= 2; } current_validation_ll = log_likelihood; @@ -101,7 +99,7 @@ void compute_validation_perplexity(int ngram_size, int output_vocab_size, int va int main(int argc, char** argv) -{ +{ ios::sync_with_stdio(false); bool use_mmap_file, randomize; param myParam; @@ -183,7 +181,7 @@ int main(int argc, char** argv) myParam.input_words_file = input_words_file.getValue(); myParam.output_words_file = output_words_file.getValue(); if (words_file.getValue() != "") - myParam.input_words_file = myParam.output_words_file = words_file.getValue(); + myParam.input_words_file = myParam.output_words_file = words_file.getValue(); myParam.model_prefix = model_prefix.getValue(); @@ -192,7 +190,7 @@ int main(int argc, char** argv) myParam.input_vocab_size = input_vocab_size.getValue(); myParam.output_vocab_size = output_vocab_size.getValue(); if (vocab_size.getValue() > 0) { - myParam.input_vocab_size = myParam.output_vocab_size = vocab_size.getValue(); + myParam.input_vocab_size = myParam.output_vocab_size = vocab_size.getValue(); } myParam.num_hidden = num_hidden.getValue(); myParam.activation_function = activation_function.getValue(); @@ -205,7 +203,7 @@ int main(int argc, char** argv) myParam.input_embedding_dimension = input_embedding_dimension.getValue(); myParam.output_embedding_dimension = output_embedding_dimension.getValue(); if (embedding_dimension.getValue() >= 0) { - myParam.input_embedding_dimension = myParam.output_embedding_dimension = embedding_dimension.getValue(); + myParam.input_embedding_dimension = myParam.output_embedding_dimension = embedding_dimension.getValue(); } myParam.minibatch_size = minibatch_size.getValue(); @@ -243,33 +241,33 @@ int main(int argc, char** argv) if (embedding_dimension.getValue() >= 0) { - cerr << embedding_dimension.getDescription() << sep << embedding_dimension.getValue() << endl; + cerr << 
embedding_dimension.getDescription() << sep << embedding_dimension.getValue() << endl; } else { - cerr << input_embedding_dimension.getDescription() << sep << input_embedding_dimension.getValue() << endl; - cerr << output_embedding_dimension.getDescription() << sep << output_embedding_dimension.getValue() << endl; + cerr << input_embedding_dimension.getDescription() << sep << input_embedding_dimension.getValue() << endl; + cerr << output_embedding_dimension.getDescription() << sep << output_embedding_dimension.getValue() << endl; } cerr << share_embeddings.getDescription() << sep << share_embeddings.getValue() << endl; if (share_embeddings.getValue() && input_embedding_dimension.getValue() != output_embedding_dimension.getValue()) { - cerr << "error: sharing input and output embeddings requires that input and output embeddings have same dimension" << endl; - exit(1); + cerr << "error: sharing input and output embeddings requires that input and output embeddings have same dimension" << endl; + exit(1); } cerr << num_hidden.getDescription() << sep << num_hidden.getValue() << endl; if (string_to_activation_function(activation_function.getValue()) == InvalidFunction) { - cerr << "error: invalid activation function: " << activation_function.getValue() << endl; - exit(1); + cerr << "error: invalid activation function: " << activation_function.getValue() << endl; + exit(1); } cerr << activation_function.getDescription() << sep << activation_function.getValue() << endl; if (string_to_loss_function(loss_function.getValue()) == InvalidLoss) { - cerr << "error: invalid loss function: " << loss_function.getValue() << endl; - exit(1); + cerr << "error: invalid loss function: " << loss_function.getValue() << endl; + exit(1); } cerr << loss_function.getDescription() << sep << loss_function.getValue() << endl; @@ -279,7 +277,7 @@ int main(int argc, char** argv) cerr << num_epochs.getDescription() << sep << num_epochs.getValue() << endl; cerr << minibatch_size.getDescription() << 
sep << minibatch_size.getValue() << endl; if (myParam.validation_file != "") { - cerr << validation_minibatch_size.getDescription() << sep << validation_minibatch_size.getValue() << endl; + cerr << validation_minibatch_size.getDescription() << sep << validation_minibatch_size.getValue() << endl; } cerr << learning_rate.getDescription() << sep << learning_rate.getValue() << endl; cerr << L2_reg.getDescription() << sep << L2_reg.getValue() << endl; @@ -288,7 +286,7 @@ int main(int argc, char** argv) cerr << normalization.getDescription() << sep << normalization.getValue() << endl; if (myParam.normalization){ - cerr << normalization_init.getDescription() << sep << normalization_init.getValue() << endl; + cerr << normalization_init.getDescription() << sep << normalization_init.getValue() << endl; } cerr << use_momentum.getDescription() << sep << use_momentum.getValue() << endl; @@ -302,7 +300,7 @@ int main(int argc, char** argv) if (unigram_probs_file.getValue() != "") { - cerr << "Note: --unigram_probs_file is deprecated and ignored." << endl; + cerr << "Note: --unigram_probs_file is deprecated and ignored." << endl; } } catch (TCLAP::ArgException &e) @@ -337,7 +335,7 @@ int main(int argc, char** argv) training_data_flat_mmap = mmap_file.find<vec>("vector").first; cerr<<"Size of mmaped vector is "<<training_data_flat_mmap->size()<<endl; training_data_size = training_data_flat_mmap->size()/myParam.ngram_size; - //randomly shuffle the data for better learning. The shuffling will + //randomly shuffle the data for better learning. 
The shuffling will //be different for a standard stl vector // Randomly shuffle training data to improve learning if (randomize == true) { @@ -413,10 +411,10 @@ int main(int argc, char** argv) //cerr<<"Num tokens "<<num_tokens<<endl; //data_size_t training_data_size = num_tokens / myParam.ngram_size; cerr << "Number of training instances: "<< training_data_size << endl; - + Matrix<int,Dynamic,Dynamic> training_data; //(training_data_flat.data(), myParam.ngram_size, training_data_size); - + #ifdef MAP cerr<<"Setting up eigen map"<<endl; if (use_mmap_file == false) { @@ -425,11 +423,11 @@ int main(int argc, char** argv) training_data = Map< Matrix<int,Dynamic,Dynamic> >(training_data_flat_mmap->data().get(), myParam.ngram_size, training_data_size); } cerr<<"Created eigen map"<<endl; - #else + #else if (use_mmap_file == false) { training_data = Map< Matrix<int,Dynamic,Dynamic> >(training_data_flat.data(), myParam.ngram_size, training_data_size); } - #endif + #endif // If neither --input_vocab_size nor --input_words_file is given, set input_vocab_size to the maximum word index if (myParam.input_vocab_size == 0 and myParam.input_words_file == "") { @@ -454,7 +452,7 @@ int main(int argc, char** argv) // Read validation data vector<int> validation_data_flat; int validation_data_size = 0; - + if (myParam.validation_file != "") { readDataFile(myParam.validation_file, myParam.ngram_size, validation_data_flat); @@ -470,16 +468,16 @@ int main(int argc, char** argv) if (myParam.input_words_file != "") { readWordsFile(myParam.input_words_file, input_words); - if (myParam.input_vocab_size == 0) - myParam.input_vocab_size = input_words.size(); + if (myParam.input_vocab_size == 0) + myParam.input_vocab_size = input_words.size(); } vector<string> output_words; if (myParam.output_words_file != "") { readWordsFile(myParam.output_words_file, output_words); - if (myParam.output_vocab_size == 0) - myParam.output_vocab_size = output_words.size(); + if (myParam.output_vocab_size == 0) + 
myParam.output_vocab_size = output_words.size(); } ///// Construct unigram model and sampler that will be used for NCE @@ -491,17 +489,17 @@ int main(int argc, char** argv) if (use_mmap_file == false) { output_word = training_data(myParam.ngram_size-1, train_id); } else { - //cerr<<"mmap word is "<<training_data_flat_mmap->at((train_id+1)*myParam.ngram_size - 1)<<endl; + //cerr<<"mmap word is "<<training_data_flat_mmap->at((train_id+1)*myParam.ngram_size - 1)<<endl; output_word = training_data_flat_mmap->at((train_id+1)*myParam.ngram_size - 1); } - //cerr<<"output word is "<<output_word<<endl; - unigram_counts[output_word] += 1; + //cerr<<"output word is "<<output_word<<endl; + unigram_counts[output_word] += 1; } multinomial<data_size_t> unigram (unigram_counts); ///// Create and initialize the neural network and associated propagators. model nn; - // IF THE MODEL FILE HAS BEEN DEFINED, THEN + // IF THE MODEL FILE HAS BEEN DEFINED, THEN // LOAD THE NEURAL NETWORK MODEL if (myParam.model_file != ""){ nn.read(myParam.model_file); @@ -529,7 +527,7 @@ int main(int argc, char** argv) SoftmaxNCELoss<multinomial<data_size_t> > softmax_loss(unigram); // normalization parameters vector_map c_h, c_h_running_gradient; - + ///////////////////////TRAINING THE NEURAL NETWORK//////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////// @@ -540,8 +538,8 @@ int main(int argc, char** argv) if (validation_data_size > 0) { num_validation_batches = (validation_data_size-1)/myParam.validation_minibatch_size+1; - cerr<<"Number of validation minibatches: "<<num_validation_batches<<endl; - } + cerr<<"Number of validation minibatches: "<<num_validation_batches<<endl; + } double current_momentum = myParam.initial_momentum; double momentum_delta = (myParam.final_momentum - myParam.initial_momentum)/(myParam.num_epochs-1); @@ -568,36 +566,36 @@ int main(int argc, char** argv) } for (int epoch=0; epoch<myParam.num_epochs; epoch++) - { 
+ { cerr << "Epoch " << epoch+1 << endl; cerr << "Current learning rate: " << current_learning_rate << endl; - if (myParam.use_momentum) - cerr << "Current momentum: " << current_momentum << endl; - else + if (myParam.use_momentum) + cerr << "Current momentum: " << current_momentum << endl; + else current_momentum = -1; - cerr << "Training minibatches: "; + cerr << "Training minibatches: "; - double log_likelihood = 0.0; + double log_likelihood = 0.0; - int num_samples = 0; - if (loss_function == LogLoss) - num_samples = output_vocab_size; - else if (loss_function == NCELoss) - num_samples = 1+num_noise_samples; + int num_samples = 0; + if (loss_function == LogLoss) + num_samples = output_vocab_size; + else if (loss_function == NCELoss) + num_samples = 1+num_noise_samples; - Matrix<double,Dynamic,Dynamic> minibatch_weights(num_samples, minibatch_size); - Matrix<int,Dynamic,Dynamic> minibatch_samples(num_samples, minibatch_size); - Matrix<double,Dynamic,Dynamic> scores(num_samples, minibatch_size); - Matrix<double,Dynamic,Dynamic> probs(num_samples, minibatch_size); + Matrix<double,Dynamic,Dynamic> minibatch_weights(num_samples, minibatch_size); + Matrix<int,Dynamic,Dynamic> minibatch_samples(num_samples, minibatch_size); + Matrix<double,Dynamic,Dynamic> scores(num_samples, minibatch_size); + Matrix<double,Dynamic,Dynamic> probs(num_samples, minibatch_size); for(data_size_t batch=0;batch<num_batches;batch++) { if (batch > 0 && batch % 10000 == 0) { - cerr << batch <<"..."; - } + cerr << batch <<"..."; + } if (batch > 0 && batch % 500000 == 0) { @@ -605,31 +603,31 @@ int main(int argc, char** argv) compute_validation_perplexity(ngram_size, output_vocab_size, validation_minibatch_size, validation_data_size, num_validation_batches, myParam, prop_validation, validation_data, current_learning_rate, current_validation_ll); cerr << "Current learning rate: " << current_learning_rate << endl; } - + data_size_t minibatch_start_index = minibatch_size * batch; int 
current_minibatch_size = min(static_cast<data_size_t>(minibatch_size), training_data_size - minibatch_start_index); #ifdef MAP - Matrix<int,Dynamic,Dynamic> minibatch = training_data.middleCols(minibatch_start_index, current_minibatch_size); - #else + Matrix<int,Dynamic,Dynamic> minibatch = training_data.middleCols(minibatch_start_index, current_minibatch_size); + #else //ALTERNATIVE OPTION IF YOU'RE NOT USING eigen map interface on the mmapped file - Matrix<int,Dynamic,Dynamic> minibatch;// = training_data.middleCols(minibatch_start_index, current_minibatch_size); - //cerr<<"Minibatch start index "<<minibatch_start_index<<endl; - //cerr<<"Minibatch size "<<current_minibatch_size<<endl; + Matrix<int,Dynamic,Dynamic> minibatch;// = training_data.middleCols(minibatch_start_index, current_minibatch_size); + //cerr<<"Minibatch start index "<<minibatch_start_index<<endl; + //cerr<<"Minibatch size "<<current_minibatch_size<<endl; if (use_mmap_file == true) { minibatch.setZero(ngram_size,current_minibatch_size); //now reading the ngrams from the mmaped file for (int k=0; k<ngram_size; k++){ for (data_size_t index = 0 ; index<current_minibatch_size; index++) { - data_size_t current_index = index + minibatch_start_index; - //cerr<<"the value in the mmap file "<<index<<" "<<k<<" is "<<training_data_flat_mmap->at(current_index*ngram_size+k)<<endl; + data_size_t current_index = index + minibatch_start_index; + //cerr<<"the value in the mmap file "<<index<<" "<<k<<" is "<<training_data_flat_mmap->at(current_index*ngram_size+k)<<endl; minibatch(k,index) = training_data_flat_mmap->at(current_index*ngram_size+k); } } } else { minibatch = training_data.middleCols(minibatch_start_index, current_minibatch_size); } - #endif + #endif double adjusted_learning_rate = current_learning_rate/minibatch_size; //cerr<<"Adjusted learning rate: "<<adjusted_learning_rate<<endl; @@ -648,20 +646,20 @@ int main(int argc, char** argv) prop.fProp(minibatch.topRows(ngram_size-1)); - if (loss_function 
== NCELoss) - { - ///// Noise-contrastive estimation + if (loss_function == NCELoss) + { + ///// Noise-contrastive estimation - // Generate noise samples. Gather positive and negative samples into matrix. + // Generate noise samples. Gather positive and negative samples into matrix. - start_timer(3); + start_timer(3); minibatch_samples.block(0, 0, 1, current_minibatch_size) = minibatch.bottomRows(1); - + for (int sample_id = 1; sample_id < num_noise_samples+1; sample_id++) for (int train_id = 0; train_id < current_minibatch_size; train_id++) minibatch_samples(sample_id, train_id) = unigram.sample(rng); - + stop_timer(3); // Final forward propagation step (sparse) @@ -686,7 +684,7 @@ int main(int argc, char** argv) double minibatch_log_likelihood; start_timer(5); - softmax_loss.fProp(scores.leftCols(current_minibatch_size), + softmax_loss.fProp(scores.leftCols(current_minibatch_size), minibatch_samples, probs, minibatch_log_likelihood); stop_timer(5); @@ -697,9 +695,9 @@ int main(int argc, char** argv) start_timer(6); softmax_loss.bProp(probs, minibatch_weights); stop_timer(6); - + // Update the normalization parameters - + if (myParam.normalization) { for (int train_id = 0;train_id < current_minibatch_size;train_id++) @@ -711,19 +709,19 @@ int main(int argc, char** argv) // Be careful of short minibatch prop.bProp(minibatch.topRows(ngram_size-1), - minibatch_samples.leftCols(current_minibatch_size), + minibatch_samples.leftCols(current_minibatch_size), minibatch_weights.leftCols(current_minibatch_size), - adjusted_learning_rate, + adjusted_learning_rate, current_momentum, myParam.L2_reg, myParam.parameter_update, myParam.conditioning_constant, myParam.decay); - } - else if (loss_function == LogLoss) - { - ///// Standard log-likelihood - start_timer(4); + } + else if (loss_function == LogLoss) + { + ///// Standard log-likelihood + start_timer(4); if (prop.skip_hidden) prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores); else 
@@ -732,21 +730,21 @@ int main(int argc, char** argv) double minibatch_log_likelihood; start_timer(5); - SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size), - minibatch.row(ngram_size-1), - probs, + SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size), + minibatch.row(ngram_size-1), + probs, minibatch_log_likelihood); stop_timer(5); log_likelihood += minibatch_log_likelihood; ///// Backward propagation - + start_timer(6); - SoftmaxLogLoss().bProp(minibatch.row(ngram_size-1).leftCols(current_minibatch_size), - probs.leftCols(current_minibatch_size), + SoftmaxLogLoss().bProp(minibatch.row(ngram_size-1).leftCols(current_minibatch_size), + probs.leftCols(current_minibatch_size), minibatch_weights); stop_timer(6); - + prop.bProp(minibatch.topRows(ngram_size-1).leftCols(current_minibatch_size), minibatch_weights, adjusted_learning_rate, @@ -757,33 +755,33 @@ int main(int argc, char** argv) myParam.decay); } } - cerr << "done." << endl; + cerr << "done." << endl; - if (loss_function == LogLoss) - { - cerr << "Training log-likelihood: " << log_likelihood << endl; + if (loss_function == LogLoss) + { + cerr << "Training log-likelihood: " << log_likelihood << endl; cerr << " perplexity: "<< exp(-log_likelihood/training_data_size) << endl; - } - else if (loss_function == NCELoss) - cerr << "Training NCE log-likelihood: " << log_likelihood << endl; + } + else if (loss_function == NCELoss) + cerr << "Training NCE log-likelihood: " << log_likelihood << endl; current_momentum += momentum_delta; - #ifdef USE_CHRONO - cerr << "Propagation times:"; - for (int i=0; i<timer.size(); i++) - cerr << " " << timer.get(i); - cerr << endl; - #endif - - if (myParam.model_prefix != "") - { - cerr << "Writing model" << endl; - if (myParam.input_words_file != "") - nn.write(myParam.model_prefix + "." + lexical_cast<string>(epoch+1), input_words, output_words); - else - nn.write(myParam.model_prefix + "." 
+ lexical_cast<string>(epoch+1)); - } + #ifdef USE_CHRONO + cerr << "Propagation times:"; + for (int i=0; i<timer.size(); i++) + cerr << " " << timer.get(i); + cerr << endl; + #endif + + if (myParam.model_prefix != "") + { + cerr << "Writing model" << endl; + if (myParam.input_words_file != "") + nn.write(myParam.model_prefix + "." + lexical_cast<string>(epoch+1), input_words, output_words); + else + nn.write(myParam.model_prefix + "." + lexical_cast<string>(epoch+1)); + } if (epoch % 1 == 0 && validation_data_size > 0) { @@ -793,4 +791,3 @@ int main(int argc, char** argv) } return 0; } - @@ -15,7 +15,6 @@ #include <boost/chrono.hpp> #endif -//#include <../3rdparty/Eigen/Dense> #include <Eigen/Dense> #include "maybe_omp.h" @@ -23,15 +22,15 @@ // Make matrices hashable namespace Eigen { - template <typename Derived> - size_t hash_value(const DenseBase<Derived> &m) - { - size_t h=0; - for (int i=0; i<m.rows(); i++) - for (int j=0; j<m.cols(); j++) - boost::hash_combine(h, m(i,j)); - return h; - } +template <typename Derived> +size_t hash_value(const DenseBase<Derived> &m) +{ + size_t h=0; + for (int i=0; i<m.rows(); i++) + for (int j=0; j<m.cols(); j++) + boost::hash_combine(h, m(i,j)); + return h; +} } namespace nplm @@ -73,9 +72,9 @@ void readSentFile(const std::string &file, T &sentences) } inline void intgerize(std::vector<std::string> &ngram,std::vector<int> &int_ngram){ - int ngram_size = ngram.size(); - for (int i=0;i<ngram_size;i++) - int_ngram.push_back(boost::lexical_cast<int>(ngram[i])); + int ngram_size = ngram.size(); + for (int i=0;i<ngram_size;i++) + int_ngram.push_back(boost::lexical_cast<int>(ngram[i])); } // Functions that take non-const matrices as arguments @@ -85,186 +84,186 @@ inline void intgerize(std::vector<std::string> &ngram,std::vector<int> &int_ngra template <typename Derived> void initMatrix(boost::random::mt19937 &engine, - const Eigen::MatrixBase<Derived> &p_const, - bool init_normal, double range) + const Eigen::MatrixBase<Derived> 
&p_const, + bool init_normal, double range) { - UNCONST(Derived, p_const, p); - if (init_normal == 0) - // initialize with uniform distribution in [-range, range] + UNCONST(Derived, p_const, p); + if (init_normal == 0) + // initialize with uniform distribution in [-range, range] + { + boost::random::uniform_real_distribution<> unif_real(-range, range); + for (int i = 0; i < p.rows(); i++) { - boost::random::uniform_real_distribution<> unif_real(-range, range); - for (int i = 0; i < p.rows(); i++) - { - for (int j = 0; j< p.cols(); j++) - { - p(i,j) = unif_real(engine); - } - } - + for (int j = 0; j< p.cols(); j++) + { + p(i,j) = unif_real(engine); + } } - else - // initialize with gaussian distribution with mean 0 and stdev range + + } + else + // initialize with gaussian distribution with mean 0 and stdev range + { + boost::random::normal_distribution<double> unif_normal(0., range); + for (int i = 0; i < p.rows(); i++) { - boost::random::normal_distribution<double> unif_normal(0., range); - for (int i = 0; i < p.rows(); i++) - { - for (int j = 0; j < p.cols(); j++) - { - p(i,j) = unif_normal(engine); - } - } + for (int j = 0; j < p.cols(); j++) + { + p(i,j) = unif_normal(engine); + } } + } } template <typename Derived> void initBias(boost::random::mt19937 &engine, - const Eigen::MatrixBase<Derived> &p_const, - bool init_normal, double range) + const Eigen::MatrixBase<Derived> &p_const, + bool init_normal, double range) { - UNCONST(Derived, p_const, p); - if (init_normal == 0) - // initialize with uniform distribution in [-range, range] + UNCONST(Derived, p_const, p); + if (init_normal == 0) + // initialize with uniform distribution in [-range, range] + { + boost::random::uniform_real_distribution<> unif_real(-range, range); + for (int i = 0; i < p.size(); i++) { - boost::random::uniform_real_distribution<> unif_real(-range, range); - for (int i = 0; i < p.size(); i++) - { - p(i) = unif_real(engine); - } - + p(i) = unif_real(engine); } - else - // initialize with 
gaussian distribution with mean 0 and stdev range + + } + else + // initialize with gaussian distribution with mean 0 and stdev range + { + boost::random::normal_distribution<double> unif_normal(0., range); + for (int i = 0; i < p.size(); i++) { - boost::random::normal_distribution<double> unif_normal(0., range); - for (int i = 0; i < p.size(); i++) - { - p(i) = unif_normal(engine); - } + p(i) = unif_normal(engine); } + } } template <typename Derived> void readMatrix(std::ifstream &TRAININ, Eigen::MatrixBase<Derived> ¶m_const) { - UNCONST(Derived, param_const, param); + UNCONST(Derived, param_const, param); + + int i = 0; + std::string line; + std::vector<std::string> fields; + + while (std::getline(TRAININ, line) && line != "") + { + splitBySpace(line, fields); + if (fields.size() != param.cols()) + { + std::ostringstream err; + err << "error: wrong number of columns (expected " << param.cols() << ", found " << fields.size() << ")"; + throw std::runtime_error(err.str()); + } - int i = 0; - std::string line; - std::vector<std::string> fields; - - while (std::getline(TRAININ, line) && line != "") + if (i >= param.rows()) { - splitBySpace(line, fields); - if (fields.size() != param.cols()) - { - std::ostringstream err; - err << "error: wrong number of columns (expected " << param.cols() << ", found " << fields.size() << ")"; - throw std::runtime_error(err.str()); - } - - if (i >= param.rows()) - { - std::ostringstream err; - err << "error: wrong number of rows (expected " << param.rows() << ", found " << i << ")"; - throw std::runtime_error(err.str()); - } - - for (int j=0; j<fields.size(); j++) - { - param(i,j) = boost::lexical_cast<typename Derived::Scalar>(fields[j]); - } - i++; + std::ostringstream err; + err << "error: wrong number of rows (expected " << param.rows() << ", found " << i << ")"; + throw std::runtime_error(err.str()); } - - if (i != param.rows()) + + for (int j=0; j<fields.size(); j++) { - std::ostringstream err; - err << "error: wrong number of 
rows (expected " << param.rows() << ", found more)"; - throw std::runtime_error(err.str()); + param(i,j) = boost::lexical_cast<typename Derived::Scalar>(fields[j]); } + i++; + } + + if (i != param.rows()) + { + std::ostringstream err; + err << "error: wrong number of rows (expected " << param.rows() << ", found more)"; + throw std::runtime_error(err.str()); + } } template <typename Derived> void readMatrix(const std::string ¶m_file, const Eigen::MatrixBase<Derived> ¶m_const) { - UNCONST(Derived, param_const, param); - std::cerr << "Reading data from file: " << param_file << std::endl; - - std::ifstream TRAININ(param_file.c_str()); - if (!TRAININ) - { - std::cerr << "Error: can't read training data from file " << param_file << std::endl; - exit(-1); - } - readMatrix(TRAININ, param); - TRAININ.close(); + UNCONST(Derived, param_const, param); + std::cerr << "Reading data from file: " << param_file << std::endl; + + std::ifstream TRAININ(param_file.c_str()); + if (!TRAININ) + { + std::cerr << "Error: can't read training data from file " << param_file << std::endl; + exit(-1); + } + readMatrix(TRAININ, param); + TRAININ.close(); } template <typename Derived> void writeMatrix(const Eigen::MatrixBase<Derived> ¶m, const std::string &filename) { - std::cerr << "Writing parameters to " << filename << std::endl; + std::cerr << "Writing parameters to " << filename << std::endl; - std::ofstream OUT; - OUT.precision(16); - OUT.open(filename.c_str()); - if (! OUT) - { - std::cerr << "Error: can't write to file " << filename<< std::endl; - exit(-1); - } - writeMatrix(param, OUT); - OUT.close(); + std::ofstream OUT; + OUT.precision(16); + OUT.open(filename.c_str()); + if (! 
OUT) + { + std::cerr << "Error: can't write to file " << filename<< std::endl; + exit(-1); + } + writeMatrix(param, OUT); + OUT.close(); } template <typename Derived> void writeMatrix(const Eigen::MatrixBase<Derived> ¶m, std::ofstream &OUT) { - for (int row = 0;row < param.rows();row++) + for (int row = 0;row < param.rows();row++) + { + int col; + for (col = 0;col < param.cols()-1;col++) { - int col; - for (col = 0;col < param.cols()-1;col++) - { - OUT<<param(row,col)<<"\t"; - } - //dont want an extra tab at the end - OUT<<param(row,col)<<std::endl; + OUT<<param(row,col)<<"\t"; } + //dont want an extra tab at the end + OUT<<param(row,col)<<std::endl; + } } template <typename Derived> double logsum(const Eigen::MatrixBase<Derived> &v) { - int mi; - double m = v.maxCoeff(&mi); - double logz = 0.0; - for (int i=0; i<v.rows(); i++) - if (i != mi) - logz += std::exp(v(i) - m); - logz = log1p(logz) + m; - return logz; + int mi; + double m = v.maxCoeff(&mi); + double logz = 0.0; + for (int i=0; i<v.rows(); i++) + if (i != mi) + logz += std::exp(v(i) - m); + logz = log1p(logz) + m; + return logz; } double logadd(double x, double y); #ifdef USE_CHRONO -class Timer +class Timer { - typedef boost::chrono::high_resolution_clock clock_type; - typedef clock_type::time_point time_type; - typedef clock_type::duration duration_type; - std::vector<time_type> m_start; - std::vector<duration_type> m_total; -public: - Timer() { } - Timer(int n) { resize(n); } - void resize(int n) { m_start.resize(n); m_total.resize(n); } - int size() const { return m_start.size(); } - void start(int i); - void stop(int i); - void reset(int i); - double get(int i) const; + typedef boost::chrono::high_resolution_clock clock_type; + typedef clock_type::time_point time_type; + typedef clock_type::duration duration_type; + std::vector<time_type> m_start; + std::vector<duration_type> m_total; + public: + Timer() { } + Timer(int n) { resize(n); } + void resize(int n) { m_start.resize(n); m_total.resize(n); } 
+ int size() const { return m_start.size(); } + void start(int i); + void stop(int i); + void reset(int i); + double get(int i) const; }; extern Timer timer; |