author    | Rico Sennrich <rico.sennrich@gmx.ch> | 2015-07-17 21:39:42 +0300
committer | Rico Sennrich <rico.sennrich@gmx.ch> | 2015-07-17 21:39:42 +0300
commit    | a7da1b618082964152054b00c142e5962e4ca692 (patch)
tree      | 45872fc848d3729e8632af0ffdc431726e39e7a2
parent    | 28bdadf328c63ee086e8aa5de23cfe0c11728c5b (diff)
parent    | c461c4ad7232274dab8405b736bb1ac55cc7874d (diff)
Merge pull request #5 from graehl/master
c++11
-rw-r--r-- | .gitignore                 |    2
-rw-r--r-- | src/Activation_function.h  |  129
-rw-r--r-- | src/Makefile               |    4
-rw-r--r-- | src/SoftmaxLoss.h          |  159
-rw-r--r-- | src/USCMatrix.h            |  227
-rw-r--r-- | src/find_string.hpp        |   89
-rw-r--r-- | src/graphClasses.h         |   89
-rw-r--r-- | src/model.cpp              |  482
-rw-r--r-- | src/neuralClasses.h        | 1794
-rw-r--r-- | src/neuralLM.h             |  213
-rw-r--r-- | src/neuralNetwork.h        |  319
-rw-r--r-- | src/neuralTM.h             |  222
-rw-r--r-- | src/prepareNeuralLM.cpp    | 1057
-rw-r--r-- | src/propagator.h           |  641
-rw-r--r-- | src/replace_digits.hpp     |   62
-rw-r--r-- | src/testNeuralLM.cpp       |  279
-rw-r--r-- | src/trainNeuralNetwork.cpp |  229
-rw-r--r-- | src/types.hpp              |   41
-rw-r--r-- | src/util.h                 |  281
-rw-r--r-- | src/vocabulary.h           |  130
20 files changed, 3304 insertions, 3145 deletions
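
Among the changes below, the new src/find_string.hpp lets a boost::unordered_map keyed by std::string be probed with a [begin, end) character range, avoiding a temporary std::string per lookup. A minimal usage sketch (the map contents and token buffer here are illustrative, and Boost headers are assumed to be on the include path):

    #include <iostream>
    #include <string>
    #include <utility>
    #include <boost/unordered_map.hpp>
    #include "find_string.hpp"

    int main() {
        boost::unordered_map<std::string, int> vocab;
        vocab["hello"] = 42;

        const char* buf = "hello world";
        // Probe with the slice [buf, buf+5) -- no temporary std::string is built.
        std::pair<const char*, const char*> token(buf, buf + 5);
        boost::unordered_map<std::string, int>::const_iterator it = find_string(vocab, token);
        if (it != vocab.end())
            std::cout << it->first << " -> " << it->second << "\n";
        return 0;
    }

This relies on Boost.Unordered's compatible-key find(key, hash, equal) overload, which the header wraps with slice_or_string_hash and slice_or_string_eq.
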
@@ -8,3 +8,5 @@ src/prepareNeuralTM src/testNeuralLM src/testNeuralNetwork src/trainNeuralNetwork +.history +src/make.sh diff --git a/src/Activation_function.h b/src/Activation_function.h index 66342bb..742c2fc 100644 --- a/src/Activation_function.h +++ b/src/Activation_function.h @@ -3,7 +3,6 @@ #include <cmath> #include <string> -//#include <../3rdparty/Eigen/Dense> #include <Eigen/Dense> #include "util.h" @@ -19,28 +18,28 @@ enum activation_function_type { Tanh, HardTanh, Rectifier, Identity, InvalidFunc inline activation_function_type string_to_activation_function (const std::string &s) { - if (s == "identity") - return Identity; - else if (s == "rectifier") - return Rectifier; - else if (s == "tanh") - return Tanh; - else if (s == "hardtanh") - return HardTanh; - else - return InvalidFunction; + if (s == "identity") + return Identity; + else if (s == "rectifier") + return Rectifier; + else if (s == "tanh") + return Tanh; + else if (s == "hardtanh") + return HardTanh; + else + return InvalidFunction; } inline std::string activation_function_to_string (activation_function_type f) { - if (f == Identity) - return "identity"; - else if (f == Rectifier) - return "rectifier"; - else if (f == Tanh) - return "tanh"; - else if (f == HardTanh) - return "hardtanh"; + if (f == Identity) + return "identity"; + else if (f == Rectifier) + return "rectifier"; + else if (f == Tanh) + return "tanh"; + else if (f == HardTanh) + return "hardtanh"; } struct hardtanh_functor { @@ -69,51 +68,53 @@ struct drectifier_functor { class Activation_function { - int size; - activation_function_type f; - - public: - Activation_function() : size(0), f(Rectifier) { } - - void resize(int size) { this->size = size; } - void set_activation_function(activation_function_type f) { this->f = f; } - - template <typename Engine> - void initialize(Engine &engine, bool init_normal, double init_range) { } - - int n_inputs () const { return size; } - int n_outputs () const { return size; } - - template <typename DerivedIn, typename DerivedOut> - void fProp(const MatrixBase<DerivedIn> &input, const MatrixBase<DerivedOut> &output) const - { - UNCONST(DerivedOut, output, my_output); - - switch (f) - { - case Identity: my_output = input; break; - case Rectifier: my_output = input.unaryExpr(rectifier_functor()); break; - case Tanh: my_output = input.unaryExpr(tanh_functor()); break; - case HardTanh: my_output = input.unaryExpr(hardtanh_functor()); break; - } - } - - template <typename DerivedGOut, typename DerivedGIn, typename DerivedIn, typename DerivedOut> - void bProp(const MatrixBase<DerivedGOut> &input, - MatrixBase<DerivedGIn> &output, - const MatrixBase<DerivedIn> &finput, - const MatrixBase<DerivedOut> &foutput) const - { - UNCONST(DerivedGIn, output, my_output); - - switch (f) - { - case Identity: my_output = input; break; - case Rectifier: my_output = finput.array().unaryExpr(drectifier_functor()) * input.array(); break; - case Tanh: my_output = foutput.array().unaryExpr(tanh_functor()) * input.array(); break; - case HardTanh: my_output = finput.array().unaryExpr(hardtanh_functor()) * input.array(); break; - } - } + int size; + activation_function_type f; + + public: + Activation_function() : size(0), f(Rectifier) { } + + void resize(int size) { this->size = size; } + void set_activation_function(activation_function_type f) { this->f = f; } + + template <typename Engine> + void initialize(Engine &engine, bool init_normal, double init_range) { } + + int n_inputs () const { return size; } + int n_outputs () const { return size; 
} + + template <typename DerivedIn, typename DerivedOut> + void fProp(const MatrixBase<DerivedIn> &input, const MatrixBase<DerivedOut> &output) const + { + UNCONST(DerivedOut, output, my_output); + + switch (f) + { + case Identity: my_output = input; break; + case Rectifier: my_output = input.unaryExpr(rectifier_functor()); break; + case Tanh: my_output = input.unaryExpr(tanh_functor()); break; + case HardTanh: my_output = input.unaryExpr(hardtanh_functor()); break; + case InvalidFunction: std::abort(); + } + } + + template <typename DerivedGOut, typename DerivedGIn, typename DerivedIn, typename DerivedOut> + void bProp(const MatrixBase<DerivedGOut> &input, + MatrixBase<DerivedGIn> &output, + const MatrixBase<DerivedIn> &finput, + const MatrixBase<DerivedOut> &foutput) const + { + UNCONST(DerivedGIn, output, my_output); + + switch (f) + { + case Identity: my_output = input; break; + case Rectifier: my_output = finput.array().unaryExpr(drectifier_functor()) * input.array(); break; + case Tanh: my_output = foutput.array().unaryExpr(tanh_functor()) * input.array(); break; + case HardTanh: my_output = finput.array().unaryExpr(hardtanh_functor()) * input.array(); break; + case InvalidFunction: std::abort(); + } + } }; } // namespace nplm diff --git a/src/Makefile b/src/Makefile index 1611ccb..2a27405 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,12 +1,12 @@ ### Compilation options. # C++ compiler. Tested with g++ and Intel icpc. -CXX=/usr/bin/g++ +CXX=g++ #CXX=icpc # Compiler options. Note that -DEIGEN_NO_DEBUG is essential for good performance! #CFLAGS=-g -CFLAGS=-O3 -DEIGEN_NO_DEBUG -DNDEBUG +CFLAGS=-O3 -DEIGEN_NO_DEBUG -DNDEBUG $(CXXFLAGS) # Architecture. Set to x86_64 or i686 to override. ARCH:=$(shell uname -m) diff --git a/src/SoftmaxLoss.h b/src/SoftmaxLoss.h index bc55762..d89cde6 100644 --- a/src/SoftmaxLoss.h +++ b/src/SoftmaxLoss.h @@ -1,7 +1,6 @@ - #ifndef SOFTMAXLOSS_H +#ifndef SOFTMAXLOSS_H #define SOFTMAXLOSS_H -//#include <../3rdparty/Eigen/Dense> #include <Eigen/Dense> #include "multinomial.h" #include "util.h" @@ -20,55 +19,55 @@ enum loss_function_type { LogLoss, NCELoss, InvalidLoss }; inline loss_function_type string_to_loss_function (const std::string &s) { - if (s == "log") - return LogLoss; - else if (s == "nce") - return NCELoss; - else - return InvalidLoss; + if (s == "log") + return LogLoss; + else if (s == "nce") + return NCELoss; + else + return InvalidLoss; } inline std::string loss_function_to_string (loss_function_type f) { - if (f == LogLoss) - return "log"; - else if (f == NCELoss) - return "nce"; + if (f == LogLoss) + return "log"; + else if (f == NCELoss) + return "nce"; } /// Note: Outputs log-probabilities. 
struct SoftmaxLogLoss { - template <typename DerivedI, typename DerivedW, typename DerivedO> - void fProp(const MatrixBase<DerivedI> &input, const MatrixBase<DerivedW> &output_words, const MatrixBase<DerivedO> &output_const, double &loss) + template <typename DerivedI, typename DerivedW, typename DerivedO> + void fProp(const MatrixBase<DerivedI> &input, const MatrixBase<DerivedW> &output_words, const MatrixBase<DerivedO> &output_const, double &loss) + { + UNCONST(DerivedO, output_const, output); + + double log_likelihood = 0.0; + +#pragma omp parallel for reduction(+:log_likelihood) + for (int train_id = 0; train_id < input.cols(); train_id++) { - UNCONST(DerivedO, output_const, output); - - double log_likelihood = 0.0; - - #pragma omp parallel for reduction(+:log_likelihood) - for (int train_id = 0; train_id < input.cols(); train_id++) - { - double normalization = logsum(input.col(train_id)); - output.col(train_id).array() = input.col(train_id).array() - normalization; - log_likelihood += output(output_words(train_id), train_id); - } - loss = log_likelihood; + double normalization = logsum(input.col(train_id)); + output.col(train_id).array() = input.col(train_id).array() - normalization; + log_likelihood += output(output_words(train_id), train_id); } - - template <typename DerivedW, typename DerivedO, typename DerivedI> - void bProp(const MatrixBase<DerivedW> &output_words, const MatrixBase<DerivedO> &output, const MatrixBase<DerivedI> &grad_input_const) + loss = log_likelihood; + } + + template <typename DerivedW, typename DerivedO, typename DerivedI> + void bProp(const MatrixBase<DerivedW> &output_words, const MatrixBase<DerivedO> &output, const MatrixBase<DerivedI> &grad_input_const) + { + UNCONST(DerivedI, grad_input_const, grad_input); + grad_input.setZero(); +#pragma omp parallel for + for (int train_id = 0; train_id < output.cols(); train_id++) { - UNCONST(DerivedI, grad_input_const, grad_input); - grad_input.setZero(); - #pragma omp parallel for - for (int train_id = 0; train_id < output.cols(); train_id++) - { - grad_input(output_words(train_id), train_id) += 1.; - grad_input.col(train_id) -= output.col(train_id).array().exp().matrix(); - } + grad_input(output_words(train_id), train_id) += 1.; + grad_input.col(train_id) -= output.col(train_id).array().exp().matrix(); } + } }; ///// Softmax layer plus NCE loss function. @@ -81,55 +80,55 @@ struct SoftmaxLogLoss template <typename Multinomial> class SoftmaxNCELoss { - const Multinomial &unigram; + const Multinomial &unigram; -public: - SoftmaxNCELoss(const Multinomial &unigram) + public: + SoftmaxNCELoss(const Multinomial &unigram) : unigram(unigram) + { + } + + template <typename DerivedI, typename DerivedW, typename DerivedO> + void fProp(const MatrixBase<DerivedI> &scores, + const MatrixBase<DerivedW> &minibatch_samples, + const MatrixBase<DerivedO> &output_const, double &loss) + { + UNCONST(DerivedO, output_const, output); + double log_likelihood = 0.0; + int num_noise_samples = minibatch_samples.rows()-1; + double log_num_noise_samples = std::log(num_noise_samples); +#pragma omp parallel for reduction(+:log_likelihood) schedule(static) + for (int train_id = 0; train_id < scores.cols(); train_id++) { + for (int sample_id = 0;sample_id < minibatch_samples.rows(); sample_id++) + { + int sample = minibatch_samples(sample_id, train_id); + // To avoid zero or infinite probabilities, + // never take exp of score without normalizing first, + // even if it's a little slower... 
+ double score = scores(sample_id, train_id); + double score_noise = log_num_noise_samples + unigram.logprob(sample); + double z = logadd(score, score_noise); + double logprob = score - z; + double logprob_noise = score_noise - z; + output(sample_id, train_id) = std::exp(logprob); + log_likelihood += sample_id == 0 ? logprob : logprob_noise; + } } - - template <typename DerivedI, typename DerivedW, typename DerivedO> - void fProp(const MatrixBase<DerivedI> &scores, - const MatrixBase<DerivedW> &minibatch_samples, - const MatrixBase<DerivedO> &output_const, double &loss) - { - UNCONST(DerivedO, output_const, output); - double log_likelihood = 0.0; - int num_noise_samples = minibatch_samples.rows()-1; - double log_num_noise_samples = std::log(num_noise_samples); - #pragma omp parallel for reduction(+:log_likelihood) schedule(static) - for (int train_id = 0; train_id < scores.cols(); train_id++) - { - for (int sample_id = 0;sample_id < minibatch_samples.rows(); sample_id++) - { - int sample = minibatch_samples(sample_id, train_id); - // To avoid zero or infinite probabilities, - // never take exp of score without normalizing first, - // even if it's a little slower... - double score = scores(sample_id, train_id); - double score_noise = log_num_noise_samples + unigram.logprob(sample); - double z = logadd(score, score_noise); - double logprob = score - z; - double logprob_noise = score_noise - z; - output(sample_id, train_id) = std::exp(logprob); - log_likelihood += sample_id == 0 ? logprob : logprob_noise; - } - } - loss = log_likelihood; - } - - template <typename DerivedO, typename DerivedI> - void bProp(const MatrixBase<DerivedO> &probs, const MatrixBase<DerivedI> &output_const) + loss = log_likelihood; + } + + template <typename DerivedO, typename DerivedI> + void bProp(const MatrixBase<DerivedO> &probs, const MatrixBase<DerivedI> &output_const) + { + UNCONST(DerivedI, output_const, output); +#pragma omp parallel for schedule(static) + for (int train_id = 0; train_id < probs.cols(); train_id++) { - UNCONST(DerivedI, output_const, output); - #pragma omp parallel for schedule(static) - for (int train_id = 0; train_id < probs.cols(); train_id++) - { - output.col(train_id) = -probs.col(train_id); - output(0, train_id) += 1.0; - } + output.col(train_id) = -probs.col(train_id); + output(0, train_id) += 1.0; } + } }; } // namespace nplm diff --git a/src/USCMatrix.h b/src/USCMatrix.h index 02aeb33..784fa1b 100644 --- a/src/USCMatrix.h +++ b/src/USCMatrix.h @@ -1,7 +1,6 @@ #ifndef USCMATRIX_H #define USCMATRIX_H -//#include <../3rdparty/Eigen/Dense> #include <Eigen/Dense> #include "maybe_omp.h" #include "util.h" @@ -34,108 +33,108 @@ template <typename Scalar, typename Index=int> // should be EIGEN_DEFAULT_DENSE_ class USCMatrix { -public: - Matrix<Index,Dynamic,Dynamic> indexes; - Matrix<Scalar,Dynamic,Dynamic> values; - int m_rows; + public: + Matrix<Index,Dynamic,Dynamic> indexes; + Matrix<Scalar,Dynamic,Dynamic> values; + int m_rows; - USCMatrix() : m_rows(0) { } + USCMatrix() : m_rows(0) { } - template <typename Indexes, typename Values> - USCMatrix(Index rows, const MatrixBase<Indexes> &indexes, const MatrixBase<Values> &values) - : - indexes(indexes), - values(values), - m_rows(rows) - { } + template <typename Indexes, typename Values> + USCMatrix(Index rows, const MatrixBase<Indexes> &indexes, const MatrixBase<Values> &values) + : + indexes(indexes), + values(values), + m_rows(rows) + { } - USCMatrix(Index rows, Index nnz, Index cols) - : - indexes(Matrix<Index,Dynamic,Dynamic>(nnz, cols)), 
+ USCMatrix(Index rows, Index nnz, Index cols) + : + indexes(Matrix<Index,Dynamic,Dynamic>(nnz, cols)), values(Matrix<Scalar,Dynamic,Dynamic>(nnz, cols)), m_rows(rows) - { - this->indexes.fill(-1); - } - - Index rows() const { return m_rows; } - Index cols() const { return indexes.cols(); } - - void resize(Index rows, Index nnz, Index cols) { - indexes.resize(nnz, cols); - values.resize(nnz, cols); - m_rows = rows; - } + { + this->indexes.fill(-1); + } + + Index rows() const { return m_rows; } + Index cols() const { return indexes.cols(); } + + void resize(Index rows, Index nnz, Index cols) { + indexes.resize(nnz, cols); + values.resize(nnz, cols); + m_rows = rows; + } }; // Dense matrix - sparse matrix product // a is presumably very wide template <typename DerivedA, typename ScalarB, typename Index, typename DerivedC> -void uscgemm(double alpha, const MatrixBase<DerivedA> &a, - const USCMatrix<ScalarB,Index> &b, - const MatrixBase<DerivedC> &c_const) +void uscgemm(double alpha, const MatrixBase<DerivedA> &a, + const USCMatrix<ScalarB,Index> &b, + const MatrixBase<DerivedC> &c_const) { - UNCONST(DerivedC, c_const, c); - eigen_assert(a.rows() == c.rows()); - eigen_assert(a.cols() == b.rows()); - eigen_assert(b.cols() == c.cols()); - - #pragma omp parallel for - for (Index k=0; k<b.cols(); k++) - for (Index r=0; r<b.indexes.rows(); r++) - { - Index j = b.indexes(r,k); - eigen_assert(j >= 0); - eigen_assert(j < a.cols()); - c.col(k) += alpha * a.col(j) * b.values(r,k); - } + UNCONST(DerivedC, c_const, c); + eigen_assert(a.rows() == c.rows()); + eigen_assert(a.cols() == b.rows()); + eigen_assert(b.cols() == c.cols()); + +#pragma omp parallel for + for (Index k=0; k<b.cols(); k++) + for (Index r=0; r<b.indexes.rows(); r++) + { + Index j = b.indexes(r,k); + eigen_assert(j >= 0); + eigen_assert(j < a.cols()); + c.col(k) += alpha * a.col(j) * b.values(r,k); + } } // sparse matrix - dense matrix product template <typename ScalarA, typename Index, typename DerivedB, typename DerivedC> -void uscgemm(double alpha, - const USCMatrix<ScalarA,Index> &a, - const MatrixBase<DerivedB> &b, - const MatrixBase<DerivedC> &c_const) +void uscgemm(double alpha, + const USCMatrix<ScalarA,Index> &a, + const MatrixBase<DerivedB> &b, + const MatrixBase<DerivedC> &c_const) { - UNCONST(DerivedC, c_const, c); - eigen_assert(a.rows() == c.rows()); - eigen_assert(a.cols() == b.rows()); - eigen_assert(b.cols() == c.cols()); - - // This needs to be tuned for each system, unfortunately, - // and seems to vary a lot. A lot. - int i_blocks = omp_get_num_threads()*16; - - // Assume only one block in k direction. - // We don't need to explicitly block in the j direction. - #pragma omp parallel for - for (Index ib=0; ib<i_blocks; ib++) - for (Index j=0; j<a.cols(); j++) - for (Index r=0; r<a.indexes.rows(); r++) - { - Index i = a.indexes(r,j); - eigen_assert(i >= 0); - eigen_assert(i < c.rows()); - if (i % i_blocks == ib) - c.row(i) += alpha * a.values(r,j) * b.row(j); - } - - /* + UNCONST(DerivedC, c_const, c); + eigen_assert(a.rows() == c.rows()); + eigen_assert(a.cols() == b.rows()); + eigen_assert(b.cols() == c.cols()); + + // This needs to be tuned for each system, unfortunately, + // and seems to vary a lot. A lot. + int i_blocks = omp_get_num_threads()*16; + + // Assume only one block in k direction. + // We don't need to explicitly block in the j direction. 
+#pragma omp parallel for + for (Index ib=0; ib<i_blocks; ib++) + for (Index j=0; j<a.cols(); j++) + for (Index r=0; r<a.indexes.rows(); r++) + { + Index i = a.indexes(r,j); + eigen_assert(i >= 0); + eigen_assert(i < c.rows()); + if (i % i_blocks == ib) + c.row(i) += alpha * a.values(r,j) * b.row(j); + } + + /* If c.cols() is really large, then theoretically it seems like we should do: parallel for blocks in i direction - for blocks in j direction - pack block of a into smaller sparse matrix - for blocks in k direction - for k - for i (sparse) - for j - c(i,k) += a(i,j) * b(j,k) + for blocks in j direction + pack block of a into smaller sparse matrix + for blocks in k direction + for k + for i (sparse) + for j + c(i,k) += a(i,j) * b(j,k) However, the copying of blocks of a doesn't seem practical for any realistic sizes of c.cols(). - */ + */ } // Dense matrix - dense matrix product, but masked by a sparse matrix, @@ -147,45 +146,45 @@ void uscgemm(double alpha, template <typename DerivedA, typename DerivedB, typename ScalarC, typename Index> void uscgemm_masked(double alpha, - const MatrixBase<DerivedA> &a, - const MatrixBase<DerivedB> &b, - USCMatrix<ScalarC,Index> &c) + const MatrixBase<DerivedA> &a, + const MatrixBase<DerivedB> &b, + USCMatrix<ScalarC,Index> &c) { - eigen_assert(a.rows() == c.rows()); - eigen_assert(a.cols() == b.rows()); - eigen_assert(b.cols() == c.cols()); - - #pragma omp parallel for - for (Index k=0; k<b.cols(); k++) - for (Index r=0; r<c.indexes.rows(); r++) - { - Index i = c.indexes(r, k); - eigen_assert(i >= 0); - eigen_assert(i < a.rows()); - c.values(r, k) += alpha * a.row(i) * b.col(k); - } + eigen_assert(a.rows() == c.rows()); + eigen_assert(a.cols() == b.rows()); + eigen_assert(b.cols() == c.cols()); + +#pragma omp parallel for + for (Index k=0; k<b.cols(); k++) + for (Index r=0; r<c.indexes.rows(); r++) + { + Index i = c.indexes(r, k); + eigen_assert(i >= 0); + eigen_assert(i < a.rows()); + c.values(r, k) += alpha * a.row(i) * b.col(k); + } } // sparse matrix - dense vector product template <typename ScalarA, typename Index, typename DerivedB, typename DerivedC> -void uscgemv(double alpha, - const USCMatrix<ScalarA,Index> &a, - const MatrixBase<DerivedB> &b, - const MatrixBase<DerivedC> &c_const) +void uscgemv(double alpha, + const USCMatrix<ScalarA,Index> &a, + const MatrixBase<DerivedB> &b, + const MatrixBase<DerivedC> &c_const) { - UNCONST(DerivedC, c_const, c); - eigen_assert(a.rows() == c.rows()); - eigen_assert(a.cols() == b.rows()); - eigen_assert(b.cols() == 1 && c.cols() == 1); - - for (Index j=0; j<a.cols(); j++) - for (Index r=0; r<a.indexes.rows(); r++) - { - Index i = a.indexes(r,j); - eigen_assert(i >= 0); - eigen_assert(i < c.rows()); - c(i) += alpha * a.values(r,j) * b(j); - } + UNCONST(DerivedC, c_const, c); + eigen_assert(a.rows() == c.rows()); + eigen_assert(a.cols() == b.rows()); + eigen_assert(b.cols() == 1 && c.cols() == 1); + + for (Index j=0; j<a.cols(); j++) + for (Index r=0; r<a.indexes.rows(); r++) + { + Index i = a.indexes(r,j); + eigen_assert(i >= 0); + eigen_assert(i < c.rows()); + c(i) += alpha * a.values(r,j) * b(j); + } } } diff --git a/src/find_string.hpp b/src/find_string.hpp new file mode 100644 index 0000000..d26f6fe --- /dev/null +++ b/src/find_string.hpp @@ -0,0 +1,89 @@ +/** \file \author Jonathan Graehl <graehl@gmail.com> + + find_string(boost::unordered_map<std::string, ...>, pair<char const*, char + const*>) pair is [begin, end), a key: map.find(std:string(key.first, + key.second)) read-only since unordered_map 
doesn't support lazy construction + of string from a pair key. + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. +*/ + +#ifndef FIND_STRING_GRAEHL_2015_06_24_HPP +#define FIND_STRING_GRAEHL_2015_06_24_HPP +#pragma once + +#include <utility> +#include <algorithm> +#include <cstddef> +#include <boost/functional/hash.hpp> + +namespace std { +/// we do not change standard semantics of any supported comparison e.g. pair vs +/// pair, but simply allow string to be compared against pair of char pointers. +inline bool operator==(std::string const& str, std::pair<char const*, char const*> slice) { + return str.size() == (slice.second - slice.first) && std::equal(slice.first, slice.second, str.begin()); +} +inline bool operator==(std::pair<char const*, char const*> slice, std::string const& str) { + return str.size() == (slice.second - slice.first) && std::equal(slice.first, slice.second, str.begin()); +} +inline bool operator==(std::string const& str, std::pair<char*, char*> slice) { + return str.size() == (slice.second - slice.first) && std::equal(slice.first, slice.second, str.begin()); +} +inline bool operator==(std::pair<char*, char*> slice, std::string const& str) { + return str.size() == (slice.second - slice.first) && std::equal(slice.first, slice.second, str.begin()); +} +/// techinically not allowed but easiest route to ADL. we could rename these instead. +inline std::size_t hash_value(std::pair<char const*, char const*> slice) { + return boost::hash_range(slice.first, slice.second); +} +inline std::size_t hash_value(std::pair<char*, char*> slice) { + return boost::hash_range(slice.first, slice.second); +} +inline std::size_t hash_value(std::string const& str) { + return boost::hash_range(str.begin(), str.end()); +} +} + +struct slice_or_string_eq { + typedef bool result_type; + template <class A, class B> + bool operator()(A const& a, B const& b) const { + return a == b; + } +}; + +struct slice_or_string_hash { + typedef std::size_t result_type; + template <class Slice> + std::size_t operator()(Slice const& slice) const { + return hash_value(slice); + } +}; + +/// \return map.find(std:string(key.first, key.second)) but faster +template <class UnorderedMap, class Slice> +typename UnorderedMap::const_iterator find_string(UnorderedMap const& map, Slice const& key) { + return map.find(key, slice_or_string_hash(), slice_or_string_eq()); +} + +/// \return map.find(std:string(key.first, key.second)) but faster +template <class UnorderedMap, class Slice> +typename UnorderedMap::iterator find_string(UnorderedMap& map, Slice const& key) { + return map.find(key, slice_or_string_hash(), slice_or_string_eq()); +} + + +template <class UnorderedMap, class Slice> +typename UnorderedMap::const_iterator find_string(UnorderedMap const& map, char const* key) { + return map.find(std::pair<char const*, char const*>(key, key+std::strlen(key)), slice_or_string_hash(), slice_or_string_eq()); +} + +/// \return map.find(std:string(key.first, key.second)) but faster +template <class UnorderedMap, class Slice> +typename UnorderedMap::iterator find_string(UnorderedMap& map, char const* key) { + return map.find(std::pair<char const*, char const*>(key, key+std::strlen(key)), slice_or_string_hash(), slice_or_string_eq()); +} + +#endif diff --git a/src/graphClasses.h b/src/graphClasses.h index d3c0c4a..cd80a4c 100644 --- a/src/graphClasses.h +++ 
b/src/graphClasses.h @@ -3,7 +3,6 @@ #include <cstdlib> #include "neuralClasses.h" -//#include <../3rdparty/Eigen/Dense> #include <Eigen/Dense> namespace nplm @@ -11,50 +10,50 @@ namespace nplm template <class X> class Node { - public: - X * param; //what parameter is this - //vector <void *> children; - //vector <void *> parents; - Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> fProp_matrix; - Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> bProp_matrix; - int minibatch_size; - - public: - Node() : param(NULL), minibatch_size(0) { } - - Node(X *input_param, int minibatch_size) - : param(input_param), - minibatch_size(minibatch_size) - { - resize(minibatch_size); - } - - void resize(int minibatch_size) - { - this->minibatch_size = minibatch_size; - if (param->n_outputs() != -1) - { - fProp_matrix.setZero(param->n_outputs(), minibatch_size); - } - if (param->n_inputs() != -1) - { - bProp_matrix.setZero(param->n_inputs(), minibatch_size); - } - } - - void resize() { resize(minibatch_size); } - - /* - void Fprop(Matrix<double,Dynamic,Dynamic> & input,int n_cols) - { - param->fProp(input,fProp_matrix,0,0,n_cols); - } - void Fprop(Matrix<double,1,Dynamic> & input,int n_cols) - { - param->fProp(input,fProp_matrix,0,0,n_cols); - } - */ - //for f prop, just call the fProp node of the particular parameter. + public: + X * param; //what parameter is this + //vector <void *> children; + //vector <void *> parents; + Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> fProp_matrix; + Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> bProp_matrix; + int minibatch_size; + + public: + Node() : param(NULL), minibatch_size(0) { } + + Node(X *input_param, int minibatch_size) + : param(input_param), + minibatch_size(minibatch_size) + { + resize(minibatch_size); + } + + void resize(int minibatch_size) + { + this->minibatch_size = minibatch_size; + if (param->n_outputs() != -1) + { + fProp_matrix.setZero(param->n_outputs(), minibatch_size); + } + if (param->n_inputs() != -1) + { + bProp_matrix.setZero(param->n_inputs(), minibatch_size); + } + } + + void resize() { resize(minibatch_size); } + + /* + void Fprop(Matrix<double,Dynamic,Dynamic> & input,int n_cols) + { + param->fProp(input,fProp_matrix,0,0,n_cols); + } + void Fprop(Matrix<double,1,Dynamic> & input,int n_cols) + { + param->fProp(input,fProp_matrix,0,0,n_cols); + } + */ + //for f prop, just call the fProp node of the particular parameter. 
}; diff --git a/src/model.cpp b/src/model.cpp index 3767f4b..db7f006 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -13,295 +13,295 @@ namespace nplm { void model::resize(int ngram_size, - int input_vocab_size, - int output_vocab_size, - int input_embedding_dimension, - int num_hidden, - int output_embedding_dimension) + int input_vocab_size, + int output_vocab_size, + int input_embedding_dimension, + int num_hidden, + int output_embedding_dimension) { - input_layer.resize(input_vocab_size, input_embedding_dimension, ngram_size-1); - if (num_hidden == 0) - { - first_hidden_linear.resize(output_embedding_dimension, input_embedding_dimension*(ngram_size-1)); - first_hidden_activation.resize(output_embedding_dimension); - second_hidden_linear.resize(1,1); - second_hidden_activation.resize(1); - } - else - { - first_hidden_linear.resize(num_hidden, input_embedding_dimension*(ngram_size-1)); - first_hidden_activation.resize(num_hidden); - second_hidden_linear.resize(output_embedding_dimension, num_hidden); - second_hidden_activation.resize(output_embedding_dimension); - } - output_layer.resize(output_vocab_size, output_embedding_dimension); - this->ngram_size = ngram_size; - this->input_vocab_size = input_vocab_size; - this->output_vocab_size = output_vocab_size; - this->input_embedding_dimension = input_embedding_dimension; - this->num_hidden = num_hidden; - this->output_embedding_dimension = output_embedding_dimension; - premultiplied = false; + input_layer.resize(input_vocab_size, input_embedding_dimension, ngram_size-1); + if (num_hidden == 0) + { + first_hidden_linear.resize(output_embedding_dimension, input_embedding_dimension*(ngram_size-1)); + first_hidden_activation.resize(output_embedding_dimension); + second_hidden_linear.resize(1,1); + second_hidden_activation.resize(1); + } + else + { + first_hidden_linear.resize(num_hidden, input_embedding_dimension*(ngram_size-1)); + first_hidden_activation.resize(num_hidden); + second_hidden_linear.resize(output_embedding_dimension, num_hidden); + second_hidden_activation.resize(output_embedding_dimension); + } + output_layer.resize(output_vocab_size, output_embedding_dimension); + this->ngram_size = ngram_size; + this->input_vocab_size = input_vocab_size; + this->output_vocab_size = output_vocab_size; + this->input_embedding_dimension = input_embedding_dimension; + this->num_hidden = num_hidden; + this->output_embedding_dimension = output_embedding_dimension; + premultiplied = false; } - -void model::initialize(mt19937 &init_engine, - bool init_normal, - double init_range, - double init_bias, - string ¶meter_update, - double adagrad_epsilon) + +void model::initialize(boost::random::mt19937 &init_engine, + bool init_normal, + double init_range, + double init_bias, + string ¶meter_update, + double adagrad_epsilon) { - input_layer.initialize(init_engine, - init_normal, - init_range, - parameter_update, - adagrad_epsilon); - output_layer.initialize(init_engine, - init_normal, - init_range, - init_bias, - parameter_update, - adagrad_epsilon); - first_hidden_linear.initialize(init_engine, - init_normal, - init_range, - parameter_update, - adagrad_epsilon); - second_hidden_linear.initialize(init_engine, - init_normal, - init_range, - parameter_update, - adagrad_epsilon); + input_layer.initialize(init_engine, + init_normal, + init_range, + parameter_update, + adagrad_epsilon); + output_layer.initialize(init_engine, + init_normal, + init_range, + init_bias, + parameter_update, + adagrad_epsilon); + first_hidden_linear.initialize(init_engine, + 
init_normal, + init_range, + parameter_update, + adagrad_epsilon); + second_hidden_linear.initialize(init_engine, + init_normal, + init_range, + parameter_update, + adagrad_epsilon); } void model::premultiply() { - // Since input and first_hidden_linear are both linear, - // we can multiply them into a single linear layer *if* we are not training - int context_size = ngram_size-1; - Matrix<double,Dynamic,Dynamic> U = first_hidden_linear.U; - if (num_hidden == 0) - { - first_hidden_linear.U.resize(output_embedding_dimension, input_vocab_size * context_size); - } - else - { - first_hidden_linear.U.resize(num_hidden, input_vocab_size * context_size); - } - for (int i=0; i<context_size; i++) - first_hidden_linear.U.middleCols(i*input_vocab_size, input_vocab_size) = U.middleCols(i*input_embedding_dimension, input_embedding_dimension) * input_layer.W->transpose(); - input_layer.W->resize(1,1); // try to save some memory - premultiplied = true; + // Since input and first_hidden_linear are both linear, + // we can multiply them into a single linear layer *if* we are not training + int context_size = ngram_size-1; + Matrix<double,Dynamic,Dynamic> U = first_hidden_linear.U; + if (num_hidden == 0) + { + first_hidden_linear.U.resize(output_embedding_dimension, input_vocab_size * context_size); + } + else + { + first_hidden_linear.U.resize(num_hidden, input_vocab_size * context_size); + } + for (int i=0; i<context_size; i++) + first_hidden_linear.U.middleCols(i*input_vocab_size, input_vocab_size) = U.middleCols(i*input_embedding_dimension, input_embedding_dimension) * input_layer.W->transpose(); + input_layer.W->resize(1,1); // try to save some memory + premultiplied = true; } void model::readConfig(ifstream &config_file) { - string line; - vector<string> fields; - int ngram_size, vocab_size, input_embedding_dimension, num_hidden, output_embedding_dimension; - activation_function_type activation_function = this->activation_function; - while (getline(config_file, line) && line != "") + string line; + vector<string> fields; + int ngram_size, vocab_size, input_embedding_dimension, num_hidden, output_embedding_dimension; + activation_function_type activation_function = this->activation_function; + while (getline(config_file, line) && line != "") + { + splitBySpace(line, fields); + if (fields[0] == "ngram_size") + ngram_size = lexical_cast<int>(fields[1]); + else if (fields[0] == "vocab_size") + input_vocab_size = output_vocab_size = lexical_cast<int>(fields[1]); + else if (fields[0] == "input_vocab_size") + input_vocab_size = lexical_cast<int>(fields[1]); + else if (fields[0] == "output_vocab_size") + output_vocab_size = lexical_cast<int>(fields[1]); + else if (fields[0] == "input_embedding_dimension") + input_embedding_dimension = lexical_cast<int>(fields[1]); + else if (fields[0] == "num_hidden") + num_hidden = lexical_cast<int>(fields[1]); + else if (fields[0] == "output_embedding_dimension") + output_embedding_dimension = lexical_cast<int>(fields[1]); + else if (fields[0] == "activation_function") + activation_function = string_to_activation_function(fields[1]); + else if (fields[0] == "version") { - splitBySpace(line, fields); - if (fields[0] == "ngram_size") - ngram_size = lexical_cast<int>(fields[1]); - else if (fields[0] == "vocab_size") - input_vocab_size = output_vocab_size = lexical_cast<int>(fields[1]); - else if (fields[0] == "input_vocab_size") - input_vocab_size = lexical_cast<int>(fields[1]); - else if (fields[0] == "output_vocab_size") - output_vocab_size = lexical_cast<int>(fields[1]); - 
else if (fields[0] == "input_embedding_dimension") - input_embedding_dimension = lexical_cast<int>(fields[1]); - else if (fields[0] == "num_hidden") - num_hidden = lexical_cast<int>(fields[1]); - else if (fields[0] == "output_embedding_dimension") - output_embedding_dimension = lexical_cast<int>(fields[1]); - else if (fields[0] == "activation_function") - activation_function = string_to_activation_function(fields[1]); - else if (fields[0] == "version") - { - int version = lexical_cast<int>(fields[1]); - if (version != 1) - { - cerr << "error: file format mismatch (expected 1, found " << version << ")" << endl; - exit(1); - } - } - else - cerr << "warning: unrecognized field in config: " << fields[0] << endl; + int version = lexical_cast<int>(fields[1]); + if (version != 1) + { + cerr << "error: file format mismatch (expected 1, found " << version << ")" << endl; + exit(1); + } } - resize(ngram_size, - input_vocab_size, - output_vocab_size, - input_embedding_dimension, - num_hidden, - output_embedding_dimension); - set_activation_function(activation_function); + else + cerr << "warning: unrecognized field in config: " << fields[0] << endl; + } + resize(ngram_size, + input_vocab_size, + output_vocab_size, + input_embedding_dimension, + num_hidden, + output_embedding_dimension); + set_activation_function(activation_function); } void model::readConfig(const string &filename) { - ifstream config_file(filename.c_str()); - if (!config_file) - { - cerr << "error: could not open config file " << filename << endl; - exit(1); - } - readConfig(config_file); - config_file.close(); + ifstream config_file(filename.c_str()); + if (!config_file) + { + cerr << "error: could not open config file " << filename << endl; + exit(1); + } + readConfig(config_file); + config_file.close(); } - + void model::read(const string &filename) { - vector<string> input_words; - vector<string> output_words; - read(filename, input_words, output_words); + vector<string> input_words; + vector<string> output_words; + read(filename, input_words, output_words); } void model::read(const string &filename, vector<string> &words) { - vector<string> output_words; - read(filename, words, output_words); + vector<string> output_words; + read(filename, words, output_words); } void model::read(const string &filename, vector<string> &input_words, vector<string> &output_words) { - ifstream file(filename.c_str()); - if (!file) throw runtime_error("Could not open file " + filename); - - param myParam; - string line; - - while (getline(file, line)) + ifstream file(filename.c_str()); + if (!file) throw runtime_error("Could not open file " + filename); + + param myParam; + string line; + + while (getline(file, line)) + { + if (line == "\\config") + { + readConfig(file); + } + + else if (line == "\\vocab") + { + input_words.clear(); + readWordsFile(file, input_words); + output_words = input_words; + } + + else if (line == "\\input_vocab") { - if (line == "\\config") - { - readConfig(file); - } - - else if (line == "\\vocab") - { - input_words.clear(); - readWordsFile(file, input_words); - output_words = input_words; - } - - else if (line == "\\input_vocab") - { - input_words.clear(); - readWordsFile(file, input_words); - } - - else if (line == "\\output_vocab") - { - output_words.clear(); - readWordsFile(file, output_words); - } - - else if (line == "\\input_embeddings") - input_layer.read(file); - else if (line == "\\hidden_weights 1") - first_hidden_linear.read_weights(file); - else if (line == "\\hidden_biases 1") - 
first_hidden_linear.read_biases (file); - else if (line == "\\hidden_weights 2") - second_hidden_linear.read_weights(file); - else if (line == "\\hidden_biases 2") - second_hidden_linear.read_biases (file); - else if (line == "\\output_weights") - output_layer.read_weights(file); - else if (line == "\\output_biases") - output_layer.read_biases(file); - else if (line == "\\end") - break; - else if (line == "") - continue; - else - { - cerr << "warning: unrecognized section: " << line << endl; - // skip over section - while (getline(file, line) && line != "") { } - } + input_words.clear(); + readWordsFile(file, input_words); } - file.close(); + + else if (line == "\\output_vocab") + { + output_words.clear(); + readWordsFile(file, output_words); + } + + else if (line == "\\input_embeddings") + input_layer.read(file); + else if (line == "\\hidden_weights 1") + first_hidden_linear.read_weights(file); + else if (line == "\\hidden_biases 1") + first_hidden_linear.read_biases (file); + else if (line == "\\hidden_weights 2") + second_hidden_linear.read_weights(file); + else if (line == "\\hidden_biases 2") + second_hidden_linear.read_biases (file); + else if (line == "\\output_weights") + output_layer.read_weights(file); + else if (line == "\\output_biases") + output_layer.read_biases(file); + else if (line == "\\end") + break; + else if (line == "") + continue; + else + { + cerr << "warning: unrecognized section: " << line << endl; + // skip over section + while (getline(file, line) && line != "") { } + } + } + file.close(); } void model::write(const string &filename, const vector<string> &input_words, const vector<string> &output_words) -{ - write(filename, &input_words, &output_words); +{ + write(filename, &input_words, &output_words); } void model::write(const string &filename, const vector<string> &words) -{ - write(filename, &words, NULL); +{ + write(filename, &words, NULL); } -void model::write(const string &filename) -{ - write(filename, NULL, NULL); +void model::write(const string &filename) +{ + write(filename, NULL, NULL); } void model::write(const string &filename, const vector<string> *input_pwords, const vector<string> *output_pwords) { - ofstream file(filename.c_str()); - if (!file) throw runtime_error("Could not open file " + filename); - - file << "\\config" << endl; - file << "version 1" << endl; - file << "ngram_size " << ngram_size << endl; - file << "input_vocab_size " << input_vocab_size << endl; - file << "output_vocab_size " << output_vocab_size << endl; - file << "input_embedding_dimension " << input_embedding_dimension << endl; - file << "num_hidden " << num_hidden << endl; - file << "output_embedding_dimension " << output_embedding_dimension << endl; - file << "activation_function " << activation_function_to_string(activation_function) << endl; - file << endl; - - if (input_pwords) - { - file << "\\input_vocab" << endl; - writeWordsFile(*input_pwords, file); - file << endl; - } + ofstream file(filename.c_str()); + if (!file) throw runtime_error("Could not open file " + filename); - if (output_pwords) - { - file << "\\output_vocab" << endl; - writeWordsFile(*output_pwords, file); - file << endl; - } + file << "\\config" << endl; + file << "version 1" << endl; + file << "ngram_size " << ngram_size << endl; + file << "input_vocab_size " << input_vocab_size << endl; + file << "output_vocab_size " << output_vocab_size << endl; + file << "input_embedding_dimension " << input_embedding_dimension << endl; + file << "num_hidden " << num_hidden << endl; + file << 
"output_embedding_dimension " << output_embedding_dimension << endl; + file << "activation_function " << activation_function_to_string(activation_function) << endl; + file << endl; - file << "\\input_embeddings" << endl; - input_layer.write(file); - file << endl; - - file << "\\hidden_weights 1" << endl; - first_hidden_linear.write_weights(file); + if (input_pwords) + { + file << "\\input_vocab" << endl; + writeWordsFile(*input_pwords, file); file << endl; + } - file << "\\hidden_biases 1" << endl; - first_hidden_linear.write_biases(file); - file <<endl; - - file << "\\hidden_weights 2" << endl; - second_hidden_linear.write_weights(file); + if (output_pwords) + { + file << "\\output_vocab" << endl; + writeWordsFile(*output_pwords, file); file << endl; + } - file << "\\hidden_biases 2" << endl; - second_hidden_linear.write_biases(file); - file << endl; - - file << "\\output_weights" << endl; - output_layer.write_weights(file); - file << endl; - - file << "\\output_biases" << endl; - output_layer.write_biases(file); - file << endl; - - file << "\\end" << endl; - file.close(); + file << "\\input_embeddings" << endl; + input_layer.write(file); + file << endl; + + file << "\\hidden_weights 1" << endl; + first_hidden_linear.write_weights(file); + file << endl; + + file << "\\hidden_biases 1" << endl; + first_hidden_linear.write_biases(file); + file <<endl; + + file << "\\hidden_weights 2" << endl; + second_hidden_linear.write_weights(file); + file << endl; + + file << "\\hidden_biases 2" << endl; + second_hidden_linear.write_biases(file); + file << endl; + + file << "\\output_weights" << endl; + output_layer.write_weights(file); + file << endl; + + file << "\\output_biases" << endl; + output_layer.write_biases(file); + file << endl; + + file << "\\end" << endl; + file.close(); } diff --git a/src/neuralClasses.h b/src/neuralClasses.h index 26dae06..458f80e 100644 --- a/src/neuralClasses.h +++ b/src/neuralClasses.h @@ -6,8 +6,7 @@ #include <cmath> #include <vector> -#include <boost/unordered_map.hpp> -//#include <../3rdparty/Eigen/Dense> +#include <boost/unordered_map.hpp> #include <Eigen/Dense> #include "maybe_omp.h" @@ -35,7 +34,7 @@ using Eigen::Dynamic; typedef boost::unordered_map<int,bool> int_map; struct Clipper{ - double operator() (double x) const { + double operator() (double x) const { return std::min(0.5, std::max(x,-0.5)); //return(x); } @@ -44,978 +43,997 @@ struct Clipper{ class Linear_layer { - private: - Matrix<double,Dynamic,Dynamic> U; - Matrix<double,Dynamic,Dynamic> U_gradient; - Matrix<double,Dynamic,Dynamic> U_velocity; - Matrix<double,Dynamic,Dynamic> U_running_gradient; - Matrix<double,Dynamic,Dynamic> U_running_parameter_update; - // Biases - Matrix<double,Dynamic,1> b; - Matrix<double,Dynamic,1> b_velocity; - Matrix<double,Dynamic,1> b_running_gradient; - Matrix<double,Dynamic,1> b_running_parameter_update; - Matrix<double,Dynamic,1> b_gradient; - - friend class model; - - public: - Linear_layer() { } - Linear_layer(int rows, int cols) { resize(rows, cols); } - - void resize(int rows, int cols) - { - U.setZero(rows, cols); - U_gradient.setZero(rows, cols); - //U_running_gradient.setZero(rows, cols); - //U_running_parameter_updates.setZero(rows, cols); - //U_velocity.setZero(rows, cols); - b.resize(rows); - b_gradient.setZero(rows); - //b_running_gradient.resize(rows); - //b_velocity.resize(rows); - } - - void read_weights(std::ifstream &U_file) { readMatrix(U_file, U); } - void write_weights(std::ofstream &U_file) { writeMatrix(U, U_file); } + private: + 
Matrix<double,Dynamic,Dynamic> U; + Matrix<double,Dynamic,Dynamic> U_gradient; + Matrix<double,Dynamic,Dynamic> U_velocity; + Matrix<double,Dynamic,Dynamic> U_running_gradient; + Matrix<double,Dynamic,Dynamic> U_running_parameter_update; + // Biases + Matrix<double,Dynamic,1> b; + Matrix<double,Dynamic,1> b_velocity; + Matrix<double,Dynamic,1> b_running_gradient; + Matrix<double,Dynamic,1> b_running_parameter_update; + Matrix<double,Dynamic,1> b_gradient; + + friend class model; + + public: + Linear_layer() { } + Linear_layer(int rows, int cols) { resize(rows, cols); } + + void resize(int rows, int cols) + { + U.setZero(rows, cols); + U_gradient.setZero(rows, cols); + //U_running_gradient.setZero(rows, cols); + //U_running_parameter_updates.setZero(rows, cols); + //U_velocity.setZero(rows, cols); + b.resize(rows); + b_gradient.setZero(rows); + //b_running_gradient.resize(rows); + //b_velocity.resize(rows); + } + + void read_weights(std::ifstream &U_file) { readMatrix(U_file, U); } + void write_weights(std::ofstream &U_file) { writeMatrix(U, U_file); } void read_biases(std::ifstream &b_file) { readMatrix(b_file, b); } void write_biases(std::ofstream &b_file) { writeMatrix(b, b_file); } - template <typename Engine> - void initialize(Engine &engine, - bool init_normal, - double init_range, - string ¶meter_update, - double adagrad_epsilon) - { - if (parameter_update == "ADA") { - U_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(U.rows(),U.cols())*adagrad_epsilon; - b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon; - } - if (parameter_update == "ADAD") { - U_running_gradient.setZero(U.rows(),U.cols()); - b_running_gradient.setZero(b.size()); - U_running_parameter_update.setZero(U.rows(),U.cols()); - b_running_parameter_update.setZero(b.size()); - } + template <typename Engine> + void initialize(Engine &engine, + bool init_normal, + double init_range, + string ¶meter_update, + double adagrad_epsilon) + { + if (parameter_update == "ADA") { + U_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(U.rows(),U.cols())*adagrad_epsilon; + b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon; + } + if (parameter_update == "ADAD") { + U_running_gradient.setZero(U.rows(),U.cols()); + b_running_gradient.setZero(b.size()); + U_running_parameter_update.setZero(U.rows(),U.cols()); + b_running_parameter_update.setZero(b.size()); + } - initMatrix(engine, U, init_normal, init_range); - initBias(engine, b, init_normal, init_range); - } + initMatrix(engine, U, init_normal, init_range); + initBias(engine, b, init_normal, init_range); + } - int n_inputs () const { return U.cols(); } - int n_outputs () const { return U.rows(); } + int n_inputs () const { return U.cols(); } + int n_outputs () const { return U.rows(); } template <typename DerivedIn, typename DerivedOut> - void fProp(const MatrixBase<DerivedIn> &input, - const MatrixBase<DerivedOut> &output) const + void fProp(const MatrixBase<DerivedIn> &input, + const MatrixBase<DerivedOut> &output) const { - UNCONST(DerivedOut, output, my_output); - my_output.leftCols(input.cols()).noalias() = U*input; - int num_examples = input.cols(); - for (int example = 0;example < num_examples;example++) - { - my_output.leftCols(input.cols()).col(example) += b; - } + UNCONST(DerivedOut, output, my_output); + my_output.leftCols(input.cols()).noalias() = U*input; + int num_examples = input.cols(); + for (int example = 0;example < num_examples;example++) + { + my_output.leftCols(input.cols()).col(example) += b; + } } 
- // Sparse input + // Sparse input template <typename ScalarIn, typename DerivedOut> - void fProp(const USCMatrix<ScalarIn> &input, - const MatrixBase<DerivedOut> &output_const) const - { - UNCONST(DerivedOut, output_const, output); - output.setZero(); - uscgemm(1.0, U, input, output.leftCols(input.cols())); - // Each column corresponds to a training example. We - // parallelize the adding of biases per dimension. - int num_examples = input.cols(); - for (int example = 0;example < num_examples;example++) - { - output.leftCols(input.cols()).col(example) += b; - } + void fProp(const USCMatrix<ScalarIn> &input, + const MatrixBase<DerivedOut> &output_const) const + { + UNCONST(DerivedOut, output_const, output); + output.setZero(); + uscgemm(1.0, U, input, output.leftCols(input.cols())); + // Each column corresponds to a training example. We + // parallelize the adding of biases per dimension. + int num_examples = input.cols(); + for (int example = 0;example < num_examples;example++) + { + output.leftCols(input.cols()).col(example) += b; + } } template <typename DerivedGOut, typename DerivedGIn> - void bProp(const MatrixBase<DerivedGOut> &input, - MatrixBase<DerivedGIn> &output) const + void bProp(const MatrixBase<DerivedGOut> &input, + MatrixBase<DerivedGIn> &output) const { - UNCONST(DerivedGIn, output, my_output); - my_output.noalias() = U.transpose()*input; - } + UNCONST(DerivedGIn, output, my_output); + my_output.noalias() = U.transpose()*input; + } template <typename DerivedGOut, typename DerivedIn> - void computeGradient( const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &fProp_input, - double learning_rate, double momentum, double L2_reg) - { - U_gradient.noalias() = bProp_input*fProp_input.transpose(); - - // get the bias gradient for all dimensions in parallel - int size = b.size(); - b_gradient = bProp_input.rowwise().sum(); - // This used to be multithreaded, but there was no measureable difference - if (L2_reg > 0.0) - { - U_gradient -= 2*L2_reg*U; - b_gradient -= 2*L2_reg*b; - } - if (momentum > 0.0) - { - U_velocity = momentum*U_velocity + U_gradient; - U += learning_rate * U_velocity; - b_velocity = momentum*b_velocity + b_gradient; - b += learning_rate * b_velocity; - } - else - { - U += learning_rate * U_gradient; - b += learning_rate * b_gradient; - /* - //UPDATE CLIPPING - U += (learning_rate*U_gradient).array().unaryExpr(Clipper()).matrix(); - b += (learning_rate*b_gradient).array().unaryExpr(Clipper()).matrix(); - //GRADIENT CLIPPING - //U += learning_rate*(U_gradient.array().unaryExpr(Clipper())).matrix(); - //b += learning_rate*(b_gradient.array().unaryExpr(Clipper())).matrix(); - */ - } - } + void computeGradient( const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &fProp_input, + double learning_rate, double momentum, double L2_reg) + { + U_gradient.noalias() = bProp_input*fProp_input.transpose(); + + // get the bias gradient for all dimensions in parallel + int size = b.size(); + b_gradient = bProp_input.rowwise().sum(); + // This used to be multithreaded, but there was no measureable difference + if (L2_reg > 0.0) + { + U_gradient -= 2*L2_reg*U; + b_gradient -= 2*L2_reg*b; + } + if (momentum > 0.0) + { + U_velocity = momentum*U_velocity + U_gradient; + U += learning_rate * U_velocity; + b_velocity = momentum*b_velocity + b_gradient; + b += learning_rate * b_velocity; + } + else + { + U += learning_rate * U_gradient; + b += learning_rate * b_gradient; + /* + //UPDATE CLIPPING + U += 
(learning_rate*U_gradient).array().unaryExpr(Clipper()).matrix(); + b += (learning_rate*b_gradient).array().unaryExpr(Clipper()).matrix(); + //GRADIENT CLIPPING + //U += learning_rate*(U_gradient.array().unaryExpr(Clipper())).matrix(); + //b += learning_rate*(b_gradient.array().unaryExpr(Clipper())).matrix(); + */ + } + } template <typename DerivedGOut, typename DerivedIn> - void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &fProp_input, - double learning_rate, - double L2_reg) + void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &fProp_input, + double learning_rate, + double L2_reg) { - U_gradient.noalias() = bProp_input*fProp_input.transpose(); + U_gradient.noalias() = bProp_input*fProp_input.transpose(); - - // get the bias gradient for all dimensions in parallel - int size = b.size(); - b_gradient.noalias() = bProp_input.rowwise().sum(); - if (L2_reg != 0) - { - U_gradient -= 2*L2_reg*U; - b_gradient -= 2*L2_reg*b; - } + // get the bias gradient for all dimensions in parallel + int size = b.size(); + b_gradient.noalias() = bProp_input.rowwise().sum(); - // ignore momentum? - #pragma omp parallel for - for (int col=0; col<U.cols(); col++) { - U_running_gradient.col(col) += U_gradient.col(col).array().square().matrix(); - U.col(col) += learning_rate * (U_gradient.col(col).array() / - U_running_gradient.col(col).array().sqrt()).matrix(); - /* - //UPDATE CLIPPING - U.col(col) += (learning_rate * (U_gradient.col(col).array() / U_running_gradient.col(col).array().sqrt())). - unaryExpr(Clipper()).matrix(); - */ - } - b_running_gradient += b_gradient.array().square().matrix(); - b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix(); + if (L2_reg != 0) + { + U_gradient -= 2*L2_reg*U; + b_gradient -= 2*L2_reg*b; + } + + // ignore momentum? +#pragma omp parallel for + for (int col=0; col<U.cols(); col++) { + U_running_gradient.col(col) += U_gradient.col(col).array().square().matrix(); + U.col(col) += learning_rate * (U_gradient.col(col).array() / + U_running_gradient.col(col).array().sqrt()).matrix(); /* //UPDATE CLIPPING - b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); + U.col(col) += (learning_rate * (U_gradient.col(col).array() / U_running_gradient.col(col).array().sqrt())). 
+ unaryExpr(Clipper()).matrix(); */ + } + b_running_gradient += b_gradient.array().square().matrix(); + b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix(); + /* + //UPDATE CLIPPING + b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); + */ } template <typename DerivedGOut, typename DerivedIn> - void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &fProp_input, - double learning_rate, - double L2_reg, - double conditioning_constant, - double decay) - { - //cerr<<"decay is "<<decay<<" and conditioning constant is "<<conditioning_constant<<endl; - U_gradient.noalias() = bProp_input*fProp_input.transpose(); - - Array<double,Dynamic,1> b_current_parameter_update; - - // get the bias gradient for all dimensions in parallel - int size = b.size(); - b_gradient.noalias() = bProp_input.rowwise().sum(); - - if (L2_reg != 0) - { - U_gradient -= 2*L2_reg*U; - b_gradient -= 2*L2_reg*b; - } + void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &fProp_input, + double learning_rate, + double L2_reg, + double conditioning_constant, + double decay) + { + //cerr<<"decay is "<<decay<<" and conditioning constant is "<<conditioning_constant<<endl; + U_gradient.noalias() = bProp_input*fProp_input.transpose(); - // ignore momentum? - #pragma omp parallel for - //cerr<<"U gradient is "<<U_gradient<<endl; - for (int col=0; col<U.cols(); col++) { - Array<double,Dynamic,1> U_current_parameter_update; - U_running_gradient.col(col) = decay*U_running_gradient.col(col) + - (1-decay)*U_gradient.col(col).array().square().matrix(); - //cerr<<"U running gradient is "<<U_running_gradient.col(col)<<endl; - //getchar(); - U_current_parameter_update = ((U_running_parameter_update.col(col).array()+conditioning_constant).sqrt()/ - (U_running_gradient.col(col).array()+conditioning_constant).sqrt()) * - U_gradient.col(col).array(); - //cerr<<"U current parameter update is "<<U_current_parameter_update<<endl; - //getchar(); - //update the running parameter update - U_running_parameter_update.col(col) = decay*U_running_parameter_update.col(col) + - (1.-decay)*U_current_parameter_update.square().matrix(); - U.col(col) += learning_rate*U_current_parameter_update.matrix(); - } - b_running_gradient = decay*b_running_gradient + - (1.-decay)*b_gradient.array().square().matrix(); - b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/ - (b_running_gradient.array()+conditioning_constant).sqrt()) * - b_gradient.array(); - b_running_parameter_update = decay*(b_running_parameter_update) + - (1.-decay)*b_current_parameter_update.square().matrix(); - b += learning_rate*b_current_parameter_update.matrix(); + Array<double,Dynamic,1> b_current_parameter_update; + + // get the bias gradient for all dimensions in parallel + int size = b.size(); + b_gradient.noalias() = bProp_input.rowwise().sum(); + + if (L2_reg != 0) + { + U_gradient -= 2*L2_reg*U; + b_gradient -= 2*L2_reg*b; + } + + // ignore momentum? 
+#pragma omp parallel for + //cerr<<"U gradient is "<<U_gradient<<endl; + for (int col=0; col<U.cols(); col++) { + Array<double,Dynamic,1> U_current_parameter_update; + U_running_gradient.col(col) = decay*U_running_gradient.col(col) + + (1-decay)*U_gradient.col(col).array().square().matrix(); + //cerr<<"U running gradient is "<<U_running_gradient.col(col)<<endl; + //getchar(); + U_current_parameter_update = ((U_running_parameter_update.col(col).array()+conditioning_constant).sqrt()/ + (U_running_gradient.col(col).array()+conditioning_constant).sqrt()) * + U_gradient.col(col).array(); + //cerr<<"U current parameter update is "<<U_current_parameter_update<<endl; + //getchar(); + //update the running parameter update + U_running_parameter_update.col(col) = decay*U_running_parameter_update.col(col) + + (1.-decay)*U_current_parameter_update.square().matrix(); + U.col(col) += learning_rate*U_current_parameter_update.matrix(); + } + b_running_gradient = decay*b_running_gradient + + (1.-decay)*b_gradient.array().square().matrix(); + b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/ + (b_running_gradient.array()+conditioning_constant).sqrt()) * + b_gradient.array(); + b_running_parameter_update = decay*(b_running_parameter_update) + + (1.-decay)*b_current_parameter_update.square().matrix(); + b += learning_rate*b_current_parameter_update.matrix(); } template <typename DerivedGOut, typename DerivedIn, typename DerivedGW> - void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &fProp_input, - const MatrixBase<DerivedGW> &gradient) const + void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &fProp_input, + const MatrixBase<DerivedGW> &gradient) const { - UNCONST(DerivedGW, gradient, my_gradient); - my_gradient.noalias() = bProp_input*fProp_input.transpose(); + UNCONST(DerivedGW, gradient, my_gradient); + my_gradient.noalias() = bProp_input*fProp_input.transpose(); } }; class Output_word_embeddings { - private: - // row-major is better for uscgemm - //Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W; - // Having W be a pointer to a matrix allows ease of sharing - // input and output word embeddings - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W; - std::vector<double> W_data; - Matrix<double,Dynamic,1> b; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update; - Matrix<double,Dynamic,1> b_running_gradient; - Matrix<double,Dynamic,1> b_gradient; - Matrix<double,Dynamic,1> b_running_parameter_update; - - public: - Output_word_embeddings() { } - Output_word_embeddings(int rows, int cols) { resize(rows, cols); } - - void resize(int rows, int cols) - { - W->setZero(rows, cols); - b.setZero(rows); - } - void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) { - W = input_W; - } - void read_weights(std::ifstream &W_file) { readMatrix(W_file, *W); } - void write_weights(std::ofstream &W_file) { writeMatrix(*W, W_file); } - void read_biases(std::ifstream &b_file) { readMatrix(b_file, b); } - void write_biases(std::ofstream &b_file) { writeMatrix(b, b_file); } - - template <typename Engine> - void initialize(Engine &engine, - bool init_normal, - double init_range, - double init_bias, - string ¶meter_update, - double adagrad_epsilon) - { + private: + // row-major is better for uscgemm + 
//Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W; + // Having W be a pointer to a matrix allows ease of sharing + // input and output word embeddings + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W; + std::vector<double> W_data; + Matrix<double,Dynamic,1> b; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update; + Matrix<double,Dynamic,1> b_running_gradient; + Matrix<double,Dynamic,1> b_gradient; + Matrix<double,Dynamic,1> b_running_parameter_update; + + public: + Output_word_embeddings() { } + Output_word_embeddings(int rows, int cols) { resize(rows, cols); } + + void resize(int rows, int cols) + { + W->setZero(rows, cols); + b.setZero(rows); + } + void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) { + W = input_W; + } + void read_weights(std::ifstream &W_file) { readMatrix(W_file, *W); } + void write_weights(std::ofstream &W_file) { writeMatrix(*W, W_file); } + void read_biases(std::ifstream &b_file) { readMatrix(b_file, b); } + void write_biases(std::ofstream &b_file) { writeMatrix(b, b_file); } - W_gradient.setZero(W->rows(),W->cols()); - b_gradient.setZero(b.size()); - if (parameter_update == "ADA") { - W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon; - b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon; - //W_gradient.setZero(W->rows(),W->cols()); - //b_gradient.setZero(b.size()); - } - if (parameter_update == "ADAD") { - W_running_gradient.setZero(W->rows(),W->cols()); - b_running_gradient.setZero(b.size()); - W_gradient.setZero(W->rows(),W->cols()); - //b_gradient.setZero(b.size()); - //W_running_parameter_update.setZero(W->rows(),W->cols()); - b_running_parameter_update.setZero(b.size()); - } - - initMatrix(engine, *W, init_normal, init_range); - b.fill(init_bias); - } - - int n_inputs () const { return W->cols(); } - int n_outputs () const { return W->rows(); } - - template <typename DerivedIn, typename DerivedOut> - void fProp(const MatrixBase<DerivedIn> &input, - const MatrixBase<DerivedOut> &output) const - { - UNCONST(DerivedOut, output, my_output); - my_output = ((*W) * input).colwise() + b; - } - - // Sparse output version - template <typename DerivedIn, typename DerivedOutI, typename DerivedOutV> - void fProp(const MatrixBase<DerivedIn> &input, - const MatrixBase<DerivedOutI> &samples, - const MatrixBase<DerivedOutV> &output) const - { - UNCONST(DerivedOutV, output, my_output); - #pragma omp parallel for - for (int instance_id = 0; instance_id < samples.cols(); instance_id++) - { - for (int sample_id = 0; sample_id < samples.rows(); sample_id++) - { - my_output(sample_id, instance_id) = b(samples(sample_id, instance_id)); - } - } - USCMatrix<double> sparse_output(W->rows(), samples, my_output); - uscgemm_masked(1.0, *W, input, sparse_output); - my_output = sparse_output.values; // too bad, so much copying - } - - // Return single element of output matrix - template <typename DerivedIn> - double fProp(const MatrixBase<DerivedIn> &input, - int word, - int instance) const - { - return W->row(word).dot(input.col(instance)) + b(word); + template <typename Engine> + void initialize(Engine &engine, + bool init_normal, + double init_range, + double init_bias, + string ¶meter_update, + double adagrad_epsilon) + { + + W_gradient.setZero(W->rows(),W->cols()); + b_gradient.setZero(b.size()); + if (parameter_update == "ADA") { + 
W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon; + b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon; + //W_gradient.setZero(W->rows(),W->cols()); + //b_gradient.setZero(b.size()); + } + if (parameter_update == "ADAD") { + W_running_gradient.setZero(W->rows(),W->cols()); + b_running_gradient.setZero(b.size()); + W_gradient.setZero(W->rows(),W->cols()); + //b_gradient.setZero(b.size()); + //W_running_parameter_update.setZero(W->rows(),W->cols()); + b_running_parameter_update.setZero(b.size()); } - // Dense versions (for log-likelihood loss) + initMatrix(engine, *W, init_normal, init_range); + b.fill(init_bias); + } + + int n_inputs () const { return W->cols(); } + int n_outputs () const { return W->rows(); } - template <typename DerivedGOut, typename DerivedGIn> - void bProp(const MatrixBase<DerivedGOut> &input_bProp_matrix, - const MatrixBase<DerivedGIn> &bProp_matrix) const + template <typename DerivedIn, typename DerivedOut> + void fProp(const MatrixBase<DerivedIn> &input, + const MatrixBase<DerivedOut> &output) const + { + UNCONST(DerivedOut, output, my_output); + my_output = ((*W) * input).colwise() + b; + /* TODO: without EIGEN_NO_DEBUG - is this a bug? + ProductBase.h:102: Eigen::ProductBase<Derived, Lhs, Rhs>::ProductBase(const Lhs& + , const Rhs&) [with Derived = Eigen::GeneralProduct<Eigen::Matrix<double, -1, -1 + , 1>, Eigen::Matrix<double, -1, -1>, 5>; Lhs = Eigen::Matrix<double, -1, -1, 1>; + Rhs = Eigen::Matrix<double, -1, -1>]: Assertion `a_lhs.cols() == a_rhs.rows() & + & "invalid matrix product" && "if you wanted a coeff-wise or a dot product use t + he respective explicit functions"' failed. + + (gdb) p a_lhs.cols() + $3 = 50 + (gdb) p a_rhs.rows() + $4 = 100 + + (gdb) p a_lhs.rows() + $5 = 2 + (gdb) p a_rhs.cols() + $6 = 1 + + from lookup_ngram normalization prop.skip_hidden in neuralNetwork.h:100 + */ + } + + // Sparse output version + template <typename DerivedIn, typename DerivedOutI, typename DerivedOutV> + void fProp(const MatrixBase<DerivedIn> &input, + const MatrixBase<DerivedOutI> &samples, + const MatrixBase<DerivedOutV> &output) const + { + UNCONST(DerivedOutV, output, my_output); +#pragma omp parallel for + for (int instance_id = 0; instance_id < samples.cols(); instance_id++) { - // W is vocab_size x output_embedding_dimension - // input_bProp_matrix is vocab_size x minibatch_size - // bProp_matrix is output_embedding_dimension x minibatch_size - UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix); - my_bProp_matrix.leftCols(input_bProp_matrix.cols()).noalias() = + for (int sample_id = 0; sample_id < samples.rows(); sample_id++) + { + my_output(sample_id, instance_id) = b(samples(sample_id, instance_id)); + } + } + USCMatrix<double> sparse_output(W->rows(), samples, my_output); + uscgemm_masked(1.0, *W, input, sparse_output); + my_output = sparse_output.values; // too bad, so much copying + } + + // Return single element of output matrix + template <typename DerivedIn> + double fProp(const MatrixBase<DerivedIn> &input, + int word, + int instance) const + { + return W->row(word).dot(input.col(instance)) + b(word); + } + + // Dense versions (for log-likelihood loss) + + template <typename DerivedGOut, typename DerivedGIn> + void bProp(const MatrixBase<DerivedGOut> &input_bProp_matrix, + const MatrixBase<DerivedGIn> &bProp_matrix) const + { + // W is vocab_size x output_embedding_dimension + // input_bProp_matrix is vocab_size x minibatch_size + // bProp_matrix is output_embedding_dimension 
x minibatch_size + UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix); + my_bProp_matrix.leftCols(input_bProp_matrix.cols()).noalias() = W->transpose() * input_bProp_matrix; - } + } - template <typename DerivedIn, typename DerivedGOut> - void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOut> &bProp_input, - double learning_rate, - double momentum) //not sure if we want to use momentum here - { - // W is vocab_size x output_embedding_dimension - // b is vocab_size x 1 - // predicted_embeddings is output_embedding_dimension x minibatch_size - // bProp_input is vocab_size x minibatch_size - W->noalias() += learning_rate * bProp_input * predicted_embeddings.transpose(); - b += learning_rate * bProp_input.rowwise().sum(); - - /* - //GRADIENT CLIPPING - W->noalias() += learning_rate * - ((bProp_input * predicted_embeddings.transpose()).array().unaryExpr(Clipper())).matrix(); - b += learning_rate * (bProp_input.rowwise().sum().array().unaryExpr(Clipper())).matrix(); - //UPDATE CLIPPING - W->noalias() += (learning_rate * - (bProp_input * predicted_embeddings.transpose())).array().unaryExpr(Clipper()).matrix(); - b += (learning_rate * (bProp_input.rowwise().sum())).array().unaryExpr(Clipper()).matrix(); - */ - } - - template <typename DerivedIn, typename DerivedGOut> - void computeGradientAdagrad( - const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOut> &bProp_input, - double learning_rate) //not sure if we want to use momentum here - { - // W is vocab_size x output_embedding_dimension - // b is vocab_size x 1 - // predicted_embeddings is output_embedding_dimension x minibatch_size - // bProp_input is vocab_size x minibatch_sizea - W_gradient.setZero(W->rows(), W->cols()); - b_gradient.setZero(b.size()); - W_gradient.noalias() = bProp_input * predicted_embeddings.transpose(); - b_gradient.noalias() = bProp_input.rowwise().sum(); - W_running_gradient += W_gradient.array().square().matrix(); - b_running_gradient += b_gradient.array().square().matrix(); - W->noalias() += learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt()).matrix(); - b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix(); - /* - //UPDATE CLIPPING - *W += (learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); - b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); - */ - } - - template <typename DerivedIn, typename DerivedGOut> - void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOut> &bProp_input, - double learning_rate, - double conditioning_constant, - double decay) //not sure if we want to use momentum here - { - // W is vocab_size x output_embedding_dimension - // b is vocab_size x 1 - // predicted_embeddings is output_embedding_dimension x minibatch_size - // bProp_input is vocab_size x minibatch_size - Array<double,Dynamic,Dynamic> W_current_parameter_update; - Array<double,Dynamic,1> b_current_parameter_update; - W_gradient.setZero(W->rows(), W->cols()); - b_gradient.setZero(b.size()); - W_gradient.noalias() = bProp_input * predicted_embeddings.transpose(); - b_gradient.noalias() = bProp_input.rowwise().sum(); - W_running_gradient = decay*W_running_gradient + - (1.-decay)*W_gradient.array().square().matrix(); - b_running_gradient = decay*b_running_gradient+ - (1.-decay)*b_gradient.array().square().matrix(); - W_current_parameter_update = 
((W_running_parameter_update.array()+conditioning_constant).sqrt()/ - (W_running_gradient.array()+conditioning_constant).sqrt())* - W_gradient.array(); - b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/ - (b_running_gradient.array()+conditioning_constant).sqrt())* - b_gradient.array(); - W_running_parameter_update = decay*W_running_parameter_update + - (1.-decay)*W_current_parameter_update.square().matrix(); - b_running_parameter_update = decay*b_running_parameter_update + - (1.-decay)*b_current_parameter_update.square().matrix(); - - *W += learning_rate*W_current_parameter_update.matrix(); - b += learning_rate*b_current_parameter_update.matrix(); - } - - // Sparse versions - - template <typename DerivedGOutI, typename DerivedGOutV, typename DerivedGIn> - void bProp(const MatrixBase<DerivedGOutI> &samples, - const MatrixBase<DerivedGOutV> &weights, - const MatrixBase<DerivedGIn> &bProp_matrix) const - { - UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix); - my_bProp_matrix.setZero(); - uscgemm(1.0, - W->transpose(), + template <typename DerivedIn, typename DerivedGOut> + void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOut> &bProp_input, + double learning_rate, + double momentum) //not sure if we want to use momentum here + { + // W is vocab_size x output_embedding_dimension + // b is vocab_size x 1 + // predicted_embeddings is output_embedding_dimension x minibatch_size + // bProp_input is vocab_size x minibatch_size + W->noalias() += learning_rate * bProp_input * predicted_embeddings.transpose(); + b += learning_rate * bProp_input.rowwise().sum(); + + /* + //GRADIENT CLIPPING + W->noalias() += learning_rate * + ((bProp_input * predicted_embeddings.transpose()).array().unaryExpr(Clipper())).matrix(); + b += learning_rate * (bProp_input.rowwise().sum().array().unaryExpr(Clipper())).matrix(); + //UPDATE CLIPPING + W->noalias() += (learning_rate * + (bProp_input * predicted_embeddings.transpose())).array().unaryExpr(Clipper()).matrix(); + b += (learning_rate * (bProp_input.rowwise().sum())).array().unaryExpr(Clipper()).matrix(); + */ + } + + template <typename DerivedIn, typename DerivedGOut> + void computeGradientAdagrad( + const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOut> &bProp_input, + double learning_rate) //not sure if we want to use momentum here + { + // W is vocab_size x output_embedding_dimension + // b is vocab_size x 1 + // predicted_embeddings is output_embedding_dimension x minibatch_size + // bProp_input is vocab_size x minibatch_sizea + W_gradient.setZero(W->rows(), W->cols()); + b_gradient.setZero(b.size()); + W_gradient.noalias() = bProp_input * predicted_embeddings.transpose(); + b_gradient.noalias() = bProp_input.rowwise().sum(); + W_running_gradient += W_gradient.array().square().matrix(); + b_running_gradient += b_gradient.array().square().matrix(); + W->noalias() += learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt()).matrix(); + b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix(); + /* + //UPDATE CLIPPING + *W += (learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); + b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); + */ + } + + template <typename DerivedIn, typename DerivedGOut> + void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings, + const 
MatrixBase<DerivedGOut> &bProp_input, + double learning_rate, + double conditioning_constant, + double decay) //not sure if we want to use momentum here + { + // W is vocab_size x output_embedding_dimension + // b is vocab_size x 1 + // predicted_embeddings is output_embedding_dimension x minibatch_size + // bProp_input is vocab_size x minibatch_size + Array<double,Dynamic,Dynamic> W_current_parameter_update; + Array<double,Dynamic,1> b_current_parameter_update; + W_gradient.setZero(W->rows(), W->cols()); + b_gradient.setZero(b.size()); + W_gradient.noalias() = bProp_input * predicted_embeddings.transpose(); + b_gradient.noalias() = bProp_input.rowwise().sum(); + W_running_gradient = decay*W_running_gradient + + (1.-decay)*W_gradient.array().square().matrix(); + b_running_gradient = decay*b_running_gradient+ + (1.-decay)*b_gradient.array().square().matrix(); + W_current_parameter_update = ((W_running_parameter_update.array()+conditioning_constant).sqrt()/ + (W_running_gradient.array()+conditioning_constant).sqrt())* + W_gradient.array(); + b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/ + (b_running_gradient.array()+conditioning_constant).sqrt())* + b_gradient.array(); + W_running_parameter_update = decay*W_running_parameter_update + + (1.-decay)*W_current_parameter_update.square().matrix(); + b_running_parameter_update = decay*b_running_parameter_update + + (1.-decay)*b_current_parameter_update.square().matrix(); + + *W += learning_rate*W_current_parameter_update.matrix(); + b += learning_rate*b_current_parameter_update.matrix(); + } + + // Sparse versions + + template <typename DerivedGOutI, typename DerivedGOutV, typename DerivedGIn> + void bProp(const MatrixBase<DerivedGOutI> &samples, + const MatrixBase<DerivedGOutV> &weights, + const MatrixBase<DerivedGIn> &bProp_matrix) const + { + UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix); + my_bProp_matrix.setZero(); + uscgemm(1.0, + W->transpose(), USCMatrix<double>(W->rows(), samples, weights), my_bProp_matrix.leftCols(samples.cols())); // narrow bProp_matrix for possible short minibatch + } + + template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV> + void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOutI> &samples, + const MatrixBase<DerivedGOutV> &weights, + double learning_rate, double momentum) //not sure if we want to use momentum here + { + //cerr<<"in gradient"<<endl; + USCMatrix<double> gradient_output(W->rows(), samples, weights); + uscgemm(learning_rate, + gradient_output, + predicted_embeddings.leftCols(gradient_output.cols()).transpose(), + *W); // narrow predicted_embeddings for possible short minibatch + uscgemv(learning_rate, + gradient_output, + Matrix<double,Dynamic,1>::Ones(gradient_output.cols()), + b); + /* + //IN ORDER TO IMPLEMENT CLIPPING, WE HAVE TO COMPUTE THE GRADIENT + //FIRST + USCMatrix<double> gradient_output(W->rows(), samples, weights); + uscgemm(1.0, + gradient_output, + predicted_embeddings.leftCols(samples.cols()).transpose(), + W_gradient); + uscgemv(1.0, + gradient_output, + Matrix<double,Dynamic,1>::Ones(weights.cols()), + b_gradient); + + int_map update_map; //stores all the parameters that have been updated + for (int sample_id=0; sample_id<samples.rows(); sample_id++) + for (int train_id=0; train_id<samples.cols(); train_id++) + update_map[samples(sample_id, train_id)] = 1; + + // Convert to std::vector for parallelization + std::vector<int> update_items; + for (int_map::iterator 
it = update_map.begin(); it != update_map.end(); ++it) + update_items.push_back(it->first); + int num_items = update_items.size(); + + //#pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) + { + int update_item = update_items[item_id]; + //W->row(update_item) += learning_rate * W_gradient.row(update_item); + //b(update_item) += learning_rate * b_gradient(update_item); + //UPDATE CLIPPING + W->row(update_item) += (learning_rate * W_gradient.row(update_item)).array().unaryExpr(Clipper()).matrix(); + double update = learning_rate * b_gradient(update_item); + b(update_item) += std::min(0.5, std::max(update,-0.5)); + //GRADIENT CLIPPING + W_gradient.row(update_item).setZero(); + b_gradient(update_item) = 0.; } + */ + //cerr<<"Finished gradient"<<endl; + } - template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV> - void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOutI> &samples, - const MatrixBase<DerivedGOutV> &weights, - double learning_rate, double momentum) //not sure if we want to use momentum here - { - //cerr<<"in gradient"<<endl; - USCMatrix<double> gradient_output(W->rows(), samples, weights); - uscgemm(learning_rate, - gradient_output, - predicted_embeddings.leftCols(gradient_output.cols()).transpose(), - *W); // narrow predicted_embeddings for possible short minibatch - uscgemv(learning_rate, - gradient_output, - Matrix<double,Dynamic,1>::Ones(gradient_output.cols()), - b); + template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV> + void computeGradientAdagrad(const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOutI> &samples, + const MatrixBase<DerivedGOutV> &weights, + double learning_rate) //not sure if we want to use momentum here + { + //W_gradient.setZero(W->rows(), W->cols()); + //b_gradient.setZero(b.size()); + //FOR CLIPPING, WE DO NOT MULTIPLY THE GRADIENT WITH THE LEARNING RATE + USCMatrix<double> gradient_output(W->rows(), samples, weights); + uscgemm(1.0, + gradient_output, + predicted_embeddings.leftCols(samples.cols()).transpose(), + W_gradient); + uscgemv(1.0, + gradient_output, + Matrix<double,Dynamic,1>::Ones(weights.cols()), + b_gradient); + + int_map update_map; //stores all the parameters that have been updated + for (int sample_id=0; sample_id<samples.rows(); sample_id++) + for (int train_id=0; train_id<samples.cols(); train_id++) + update_map[samples(sample_id, train_id)] = 1; + + // Convert to std::vector for parallelization + std::vector<int> update_items; + for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) + update_items.push_back(it->first); + int num_items = update_items.size(); + + //#pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) + { + int update_item = update_items[item_id]; + W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix(); + b_running_gradient(update_item) += b_gradient(update_item) * b_gradient(update_item); + W->row(update_item) += learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix(); + b(update_item) += learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item)); /* - //IN ORDER TO IMPLEMENT CLIPPING, WE HAVE TO COMPUTE THE GRADIENT - //FIRST - USCMatrix<double> gradient_output(W->rows(), samples, weights); - uscgemm(1.0, - gradient_output, - predicted_embeddings.leftCols(samples.cols()).transpose(), - W_gradient); - uscgemv(1.0, 
- gradient_output, - Matrix<double,Dynamic,1>::Ones(weights.cols()), - b_gradient); - - int_map update_map; //stores all the parameters that have been updated - for (int sample_id=0; sample_id<samples.rows(); sample_id++) - for (int train_id=0; train_id<samples.cols(); train_id++) - update_map[samples(sample_id, train_id)] = 1; - - // Convert to std::vector for parallelization - std::vector<int> update_items; - for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) - update_items.push_back(it->first); - int num_items = update_items.size(); - - //#pragma omp parallel for - for (int item_id=0; item_id<num_items; item_id++) - { - int update_item = update_items[item_id]; - //W->row(update_item) += learning_rate * W_gradient.row(update_item); - //b(update_item) += learning_rate * b_gradient(update_item); - //UPDATE CLIPPING - W->row(update_item) += (learning_rate * W_gradient.row(update_item)).array().unaryExpr(Clipper()).matrix(); - double update = learning_rate * b_gradient(update_item); - b(update_item) += std::min(0.5, std::max(update,-0.5)); - //GRADIENT CLIPPING - W_gradient.row(update_item).setZero(); - b_gradient(update_item) = 0.; - } - */ - //cerr<<"Finished gradient"<<endl; - } - - template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV> - void computeGradientAdagrad(const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOutI> &samples, - const MatrixBase<DerivedGOutV> &weights, - double learning_rate) //not sure if we want to use momentum here - { - //W_gradient.setZero(W->rows(), W->cols()); - //b_gradient.setZero(b.size()); - //FOR CLIPPING, WE DO NOT MULTIPLY THE GRADIENT WITH THE LEARNING RATE - USCMatrix<double> gradient_output(W->rows(), samples, weights); - uscgemm(1.0, - gradient_output, - predicted_embeddings.leftCols(samples.cols()).transpose(), - W_gradient); - uscgemv(1.0, - gradient_output, - Matrix<double,Dynamic,1>::Ones(weights.cols()), - b_gradient); - - int_map update_map; //stores all the parameters that have been updated - for (int sample_id=0; sample_id<samples.rows(); sample_id++) - for (int train_id=0; train_id<samples.cols(); train_id++) - update_map[samples(sample_id, train_id)] = 1; - - // Convert to std::vector for parallelization - std::vector<int> update_items; - for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) - update_items.push_back(it->first); - int num_items = update_items.size(); - - //#pragma omp parallel for - for (int item_id=0; item_id<num_items; item_id++) - { - int update_item = update_items[item_id]; - W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix(); - b_running_gradient(update_item) += b_gradient(update_item) * b_gradient(update_item); - W->row(update_item) += learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix(); - b(update_item) += learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item)); - /* - //UPDATE CLIPPING - W->row(update_item) += (learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt())).unaryExpr(Clipper()).matrix(); - double update = learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item)); - b(update_item) += Clipper(update);//std::min(0.5, std::max(update,-0.5)); - */ - W_gradient.row(update_item).setZero(); - b_gradient(update_item) = 0.; - } - } + //UPDATE CLIPPING + W->row(update_item) += (learning_rate * 
(W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt())).unaryExpr(Clipper()).matrix(); + double update = learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item)); + b(update_item) += Clipper(update);//std::min(0.5, std::max(update,-0.5)); + */ + W_gradient.row(update_item).setZero(); + b_gradient(update_item) = 0.; + } + } - template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV> - void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOutI> &samples, - const MatrixBase<DerivedGOutV> &weights, - double learning_rate, - double conditioning_constant, - double decay) //not sure if we want to use momentum here - { - //cerr<<"decay is "<<decay<<" and constant is "<<conditioning_constant<<endl; - //W_gradient.setZero(W->rows(), W->cols()); - //b_gradient.setZero(b.size()); - - USCMatrix<double> gradient_output(W->rows(), samples, weights); - uscgemm(1.0, - gradient_output, - predicted_embeddings.leftCols(samples.cols()).transpose(), - W_gradient); - uscgemv(1.0, - gradient_output, - Matrix<double,Dynamic,1>::Ones(weights.cols()), - b_gradient); - - int_map update_map; //stores all the parameters that have been updated - for (int sample_id=0; sample_id<samples.rows(); sample_id++) - for (int train_id=0; train_id<samples.cols(); train_id++) - update_map[samples(sample_id, train_id)] = 1; - - // Convert to std::vector for parallelization - std::vector<int> update_items; - for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) - update_items.push_back(it->first); - int num_items = update_items.size(); - - #pragma omp parallel for - for (int item_id=0; item_id<num_items; item_id++) - { - Array<double,1,Dynamic> W_current_parameter_update; - double b_current_parameter_update; - - int update_item = update_items[item_id]; - W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+ - (1.-decay)*W_gradient.row(update_item).array().square().matrix(); - b_running_gradient(update_item) = decay*b_running_gradient(update_item)+ - (1.-decay)*b_gradient(update_item)*b_gradient(update_item); - //cerr<<"Output: W gradient is "<<W_gradient.row(update_item)<<endl; - //getchar(); - - //cerr<<"Output: W running gradient is "<<W_running_gradient.row(update_item)<<endl; - //getchar(); - W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/ - (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())* - W_gradient.row(update_item).array(); - b_current_parameter_update = (sqrt(b_running_parameter_update(update_item)+conditioning_constant)/ - sqrt(b_running_gradient(update_item)+conditioning_constant))* - b_gradient(update_item); - //cerr<<"Output: W current parameter update is "<<W_current_parameter_update<<endl; - //getchar(); - //cerr<<"Output: W running parameter update before is "<<W_running_parameter_update.row(update_item)<<endl; - //getchar(); - //cerr<<"the second term is "<<(1.-decay)*W_current_parameter_update.square().matrix()<<endl; - W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+ - (1.-decay)*(W_current_parameter_update.square().matrix()); - b_running_parameter_update(update_item) = decay*b_running_parameter_update(update_item)+ - (1.-decay)*b_current_parameter_update*b_current_parameter_update; - //cerr<<"Output: W running parameter update is "<<W_running_parameter_update.row(update_item)<<endl; - //getchar(); 
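The AdaDelta paths in this file, both the dense computeGradientAdadelta above and this per-row sparse version, keep two decayed accumulators per parameter: one of squared gradients and one of squared parameter updates. The raw gradient is rescaled by the ratio of their square roots, with conditioning_constant added inside each square root, before being applied. Below is a minimal Eigen sketch of that rule with illustrative names; note that this code also retains an extra learning_rate factor on top of the standard AdaDelta step.

#include <Eigen/Dense>

// Per-vector AdaDelta step (sketch; names are illustrative).
void adadelta_step(Eigen::VectorXd &param,
                   const Eigen::VectorXd &grad,
                   Eigen::VectorXd &running_grad_sq,
                   Eigen::VectorXd &running_update_sq,
                   double learning_rate,  // extra factor retained by this code
                   double decay,
                   double epsilon)        // plays the role of conditioning_constant
{
  // Decayed average of squared gradients.
  running_grad_sq.array() = decay * running_grad_sq.array()
                          + (1.0 - decay) * grad.array().square();
  // Rescale the gradient by RMS(previous updates) / RMS(gradients).
  Eigen::ArrayXd update = ((running_update_sq.array() + epsilon).sqrt()
                         / (running_grad_sq.array() + epsilon).sqrt())
                        * grad.array();
  // Decayed average of squared updates, then apply the step.
  running_update_sq.array() = decay * running_update_sq.array()
                            + (1.0 - decay) * update.square();
  param.array() += learning_rate * update;
}

In the sparse paths the same rule is applied only to the vocabulary rows touched by the current minibatch, which is why W_gradient.row(update_item) and b_gradient(update_item) are zeroed again after each update.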
- W->row(update_item) += learning_rate*W_current_parameter_update.matrix(); - b(update_item) += learning_rate*b_current_parameter_update; - W_gradient.row(update_item).setZero(); - b_gradient(update_item) = 0.; - } - } + template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV> + void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOutI> &samples, + const MatrixBase<DerivedGOutV> &weights, + double learning_rate, + double conditioning_constant, + double decay) //not sure if we want to use momentum here + { + //cerr<<"decay is "<<decay<<" and constant is "<<conditioning_constant<<endl; + //W_gradient.setZero(W->rows(), W->cols()); + //b_gradient.setZero(b.size()); + + USCMatrix<double> gradient_output(W->rows(), samples, weights); + uscgemm(1.0, + gradient_output, + predicted_embeddings.leftCols(samples.cols()).transpose(), + W_gradient); + uscgemv(1.0, + gradient_output, + Matrix<double,Dynamic,1>::Ones(weights.cols()), + b_gradient); + + int_map update_map; //stores all the parameters that have been updated + for (int sample_id=0; sample_id<samples.rows(); sample_id++) + for (int train_id=0; train_id<samples.cols(); train_id++) + update_map[samples(sample_id, train_id)] = 1; + + // Convert to std::vector for parallelization + std::vector<int> update_items; + for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) + update_items.push_back(it->first); + int num_items = update_items.size(); + +#pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) + { + Array<double,1,Dynamic> W_current_parameter_update; + double b_current_parameter_update; + + int update_item = update_items[item_id]; + W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+ + (1.-decay)*W_gradient.row(update_item).array().square().matrix(); + b_running_gradient(update_item) = decay*b_running_gradient(update_item)+ + (1.-decay)*b_gradient(update_item)*b_gradient(update_item); + //cerr<<"Output: W gradient is "<<W_gradient.row(update_item)<<endl; + //getchar(); + + //cerr<<"Output: W running gradient is "<<W_running_gradient.row(update_item)<<endl; + //getchar(); + W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/ + (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())* + W_gradient.row(update_item).array(); + b_current_parameter_update = (sqrt(b_running_parameter_update(update_item)+conditioning_constant)/ + sqrt(b_running_gradient(update_item)+conditioning_constant))* + b_gradient(update_item); + //cerr<<"Output: W current parameter update is "<<W_current_parameter_update<<endl; + //getchar(); + //cerr<<"Output: W running parameter update before is "<<W_running_parameter_update.row(update_item)<<endl; + //getchar(); + //cerr<<"the second term is "<<(1.-decay)*W_current_parameter_update.square().matrix()<<endl; + W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+ + (1.-decay)*(W_current_parameter_update.square().matrix()); + b_running_parameter_update(update_item) = decay*b_running_parameter_update(update_item)+ + (1.-decay)*b_current_parameter_update*b_current_parameter_update; + //cerr<<"Output: W running parameter update is "<<W_running_parameter_update.row(update_item)<<endl; + //getchar(); + W->row(update_item) += learning_rate*W_current_parameter_update.matrix(); + b(update_item) += learning_rate*b_current_parameter_update; + 
W_gradient.row(update_item).setZero(); + b_gradient(update_item) = 0.; + } + } - template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV, typename DerivedGW, typename DerivedGb> - void computeGradientCheck(const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOutI> &samples, - const MatrixBase<DerivedGOutV> &weights, - const MatrixBase<DerivedGW> &gradient_W, - const MatrixBase<DerivedGb> &gradient_b) const - { - UNCONST(DerivedGW, gradient_W, my_gradient_W); - UNCONST(DerivedGb, gradient_b, my_gradient_b); - my_gradient_W.setZero(); - my_gradient_b.setZero(); - USCMatrix<double> gradient_output(W->rows(), samples, weights); - uscgemm(1.0, - gradient_output, - predicted_embeddings.leftCols(samples.cols()).transpose(), - my_gradient_W); - uscgemv(1.0, gradient_output, - Matrix<double,Dynamic,1>::Ones(weights.cols()), my_gradient_b); + template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV, typename DerivedGW, typename DerivedGb> + void computeGradientCheck(const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOutI> &samples, + const MatrixBase<DerivedGOutV> &weights, + const MatrixBase<DerivedGW> &gradient_W, + const MatrixBase<DerivedGb> &gradient_b) const + { + UNCONST(DerivedGW, gradient_W, my_gradient_W); + UNCONST(DerivedGb, gradient_b, my_gradient_b); + my_gradient_W.setZero(); + my_gradient_b.setZero(); + USCMatrix<double> gradient_output(W->rows(), samples, weights); + uscgemm(1.0, + gradient_output, + predicted_embeddings.leftCols(samples.cols()).transpose(), + my_gradient_W); + uscgemv(1.0, gradient_output, + Matrix<double,Dynamic,1>::Ones(weights.cols()), my_gradient_b); } }; class Input_word_embeddings { - private: - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W; - int context_size, vocab_size; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient; - - friend class model; - - public: - Input_word_embeddings() : context_size(0), vocab_size(0) { } - Input_word_embeddings(int rows, int cols, int context) { resize(rows, cols, context); } - - void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) { - W = input_W; - } + private: + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W; + int context_size, vocab_size; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient; - void resize(int rows, int cols, int context) - { - context_size = context; - vocab_size = rows; - W->setZero(rows, cols); - } + friend class model; - void read(std::ifstream &W_file) { readMatrix(W_file, *W); } - void write(std::ofstream &W_file) { writeMatrix(*W, W_file); } + public: + Input_word_embeddings() : context_size(0), vocab_size(0) { } + Input_word_embeddings(int rows, int cols, int context) { resize(rows, cols, context); } - template <typename Engine> - void initialize(Engine &engine, - bool init_normal, - double init_range, - string ¶meter_update, - double adagrad_epsilon) - { - W_gradient.setZero(W->rows(),W->cols()); - - if (parameter_update == "ADA") { - W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon; - //W_gradient.setZero(W->rows(),W->cols()); - } - if (parameter_update == "ADAD") { - W_running_gradient.setZero(W->rows(),W->cols()); - 
//W_gradient.setZero(W->rows(),W->cols()); - W_running_parameter_update.setZero(W->rows(),W->cols()); - } - initMatrix(engine, - *W, - init_normal, - init_range); - } + void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) { + W = input_W; + } - int n_inputs() const { return -1; } - int n_outputs() const { return W->cols() * context_size; } - - // set output_id's embedding to the weighted average of all embeddings - template <typename Dist> - void average(const Dist &dist, int output_id) - { - W->row(output_id).setZero(); - for (int i=0; i < W->rows(); i++) - if (i != output_id) - W->row(output_id) += dist.prob(i) * W->row(i); - } - - template <typename DerivedIn, typename DerivedOut> - void fProp(const MatrixBase<DerivedIn> &input, - const MatrixBase<DerivedOut> &output) const - { - int embedding_dimension = W->cols(); - - // W is vocab_size x embedding_dimension - // input is ngram_size*vocab_size x minibatch_size - // output is ngram_size*embedding_dimension x minibatch_size - - /* - // Dense version: - for (int ngram=0; ngram<context_size; ngram++) - output.middleRows(ngram*embedding_dimension, embedding_dimension) = W.transpose() * input.middleRows(ngram*vocab_size, vocab_size); - */ - - UNCONST(DerivedOut, output, my_output); - my_output.setZero(); - for (int ngram=0; ngram<context_size; ngram++) - { - // input might be narrower than expected due to a short minibatch, - // so narrow output to match - uscgemm(1.0, - W->transpose(), - USCMatrix<double>(W->rows(),input.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input.cols())), - my_output.block(ngram*embedding_dimension, 0, embedding_dimension, input.cols())); - } - } - - // When model is premultiplied, this layer doesn't get used, - // but this method is used to get the input into a sparse matrix. 
- // Hopefully this can get eliminated someday - template <typename DerivedIn, typename ScalarOut> - void munge(const MatrixBase<DerivedIn> &input, USCMatrix<ScalarOut> &output) const - { - output.resize(vocab_size*context_size, context_size, input.cols()); - for (int i=0; i < context_size; i++) - output.indexes.row(i).array() = input.row(i).array() + i*vocab_size; - output.values.fill(1.0); - } + void resize(int rows, int cols, int context) + { + context_size = context; + vocab_size = rows; + W->setZero(rows, cols); + } + + void read(std::ifstream &W_file) { readMatrix(W_file, *W); } + void write(std::ofstream &W_file) { writeMatrix(*W, W_file); } + + template <typename Engine> + void initialize(Engine &engine, + bool init_normal, + double init_range, + string ¶meter_update, + double adagrad_epsilon) + { + W_gradient.setZero(W->rows(),W->cols()); + + if (parameter_update == "ADA") { + W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon; + //W_gradient.setZero(W->rows(),W->cols()); + } + if (parameter_update == "ADAD") { + W_running_gradient.setZero(W->rows(),W->cols()); + //W_gradient.setZero(W->rows(),W->cols()); + W_running_parameter_update.setZero(W->rows(),W->cols()); + } + initMatrix(engine, + *W, + init_normal, + init_range); + } + + int n_inputs() const { return -1; } + int n_outputs() const { return W->cols() * context_size; } + + // set output_id's embedding to the weighted average of all embeddings + template <typename Dist> + void average(const Dist &dist, int output_id) + { + W->row(output_id).setZero(); + for (int i=0; i < W->rows(); i++) + if (i != output_id) + W->row(output_id) += dist.prob(i) * W->row(i); + } + + template <typename DerivedIn, typename DerivedOut> + void fProp(const MatrixBase<DerivedIn> &input, + const MatrixBase<DerivedOut> &output) const + { + int embedding_dimension = W->cols(); + + // W is vocab_size x embedding_dimension + // input is ngram_size*vocab_size x minibatch_size + // output is ngram_size*embedding_dimension x minibatch_size + + /* + // Dense version: + for (int ngram=0; ngram<context_size; ngram++) + output.middleRows(ngram*embedding_dimension, embedding_dimension) = W.transpose() * input.middleRows(ngram*vocab_size, vocab_size); + */ + + UNCONST(DerivedOut, output, my_output); + my_output.setZero(); + for (int ngram=0; ngram<context_size; ngram++) + { + // input might be narrower than expected due to a short minibatch, + // so narrow output to match + uscgemm(1.0, + W->transpose(), + USCMatrix<double>(W->rows(),input.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input.cols())), + my_output.block(ngram*embedding_dimension, 0, embedding_dimension, input.cols())); + } + } + + // When model is premultiplied, this layer doesn't get used, + // but this method is used to get the input into a sparse matrix. 
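The fProp method shown above builds the context representation by multiplying W transposed with an implicit one-hot matrix assembled from the word indices (via USCMatrix and uscgemm), which amounts to gathering one embedding row per context position. A dense Eigen sketch of the equivalent operation follows; the names are illustrative, and it omits the short-minibatch narrowing the real code performs.

#include <Eigen/Dense>

// Dense equivalent of the sparse input-embedding lookup (sketch).
// input(ngram, j) holds a word index; its embedding row is copied into
// the ngram-th block of column j of the pre-sized output matrix.
void embedding_fprop(const Eigen::MatrixXd &W,      // vocab_size x embedding_dim
                     const Eigen::MatrixXi &input,  // context_size x minibatch_size
                     Eigen::MatrixXd &output)       // context_size*embedding_dim x minibatch_size
{
  const int dim = W.cols();
  output.setZero();
  for (int ngram = 0; ngram < input.rows(); ++ngram)
    for (int j = 0; j < input.cols(); ++j)
      output.block(ngram * dim, j, dim, 1) = W.row(input(ngram, j)).transpose();
}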
+ // Hopefully this can get eliminated someday + template <typename DerivedIn, typename ScalarOut> + void munge(const MatrixBase<DerivedIn> &input, USCMatrix<ScalarOut> &output) const + { + output.resize(vocab_size*context_size, context_size, input.cols()); + for (int i=0; i < context_size; i++) + output.indexes.row(i).array() = input.row(i).array() + i*vocab_size; + output.values.fill(1.0); + } template <typename DerivedGOut, typename DerivedIn> void computeGradient(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &input_words, - double learning_rate, double momentum, double L2_reg) + const MatrixBase<DerivedIn> &input_words, + double learning_rate, double momentum, double L2_reg) { - int embedding_dimension = W->cols(); + int embedding_dimension = W->cols(); - // W is vocab_size x embedding_dimension - // input is ngram_size*vocab_size x minibatch_size - // bProp_input is ngram_size*embedding_dimension x minibatch_size + // W is vocab_size x embedding_dimension + // input is ngram_size*vocab_size x minibatch_size + // bProp_input is ngram_size*embedding_dimension x minibatch_size - /* - // Dense version: - for (int ngram=0; ngram<context_size; ngram++) - W += learning_rate * input_words.middleRows(ngram*vocab_size, vocab_size) * bProp_input.middleRows(ngram*embedding_dimension, embedding_dimension).transpose() - */ + /* + // Dense version: + for (int ngram=0; ngram<context_size; ngram++) + W += learning_rate * input_words.middleRows(ngram*vocab_size, vocab_size) * bProp_input.middleRows(ngram*embedding_dimension, embedding_dimension).transpose() + */ - for (int ngram=0; ngram<context_size; ngram++) - { - uscgemm(learning_rate, - USCMatrix<double>(W->rows(), input_words.middleRows(ngram, 1), Matrix<double,1,Dynamic>::Ones(input_words.cols())), - bProp_input.block(ngram*embedding_dimension,0,embedding_dimension,input_words.cols()).transpose(), - *W); - } + for (int ngram=0; ngram<context_size; ngram++) + { + uscgemm(learning_rate, + USCMatrix<double>(W->rows(), input_words.middleRows(ngram, 1), Matrix<double,1,Dynamic>::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension,0,embedding_dimension,input_words.cols()).transpose(), + *W); + } - /* - //IF WE WANT TO DO GRADIENT CLIPPING, THEN WE FIRST COMPUTE THE GRADIENT AND THEN - //PERFORM CLIPPING WHILE UPDATING - - for (int ngram=0; ngram<context_size; ngram++) - { - uscgemm(1.0, - USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), - bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), - W_gradient); - } - int_map update_map; //stores all the parameters that have been updated - for (int ngram=0; ngram<context_size; ngram++) - { - for (int train_id=0; train_id<input_words.cols(); train_id++) - { - update_map[input_words(ngram,train_id)] = 1; - } - } + /* + //IF WE WANT TO DO GRADIENT CLIPPING, THEN WE FIRST COMPUTE THE GRADIENT AND THEN + //PERFORM CLIPPING WHILE UPDATING - // Convert to std::vector for parallelization - std::vector<int> update_items; - for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) - { - update_items.push_back(it->first); - } - int num_items = update_items.size(); - - #pragma omp parallel for - for (int item_id=0; item_id<num_items; item_id++) - { - int update_item = update_items[item_id]; - //UPDATE CLIPPING - W->row(update_item) += (learning_rate* - W_gradient.row(update_item).array().unaryExpr(Clipper())).matrix(); - //GRADIENT CLIPPING 
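The commented-out alternatives throughout these update routines rely on a Clipper functor applied through Eigen's unaryExpr, either to the raw gradient ("GRADIENT CLIPPING") or to the already-scaled step ("UPDATE CLIPPING"). Clipper itself is defined elsewhere in this header; the explicit std::min(0.5, std::max(update, -0.5)) bias clamp in the sparse output-layer update earlier in this file suggests it clamps each value to a fixed interval. A sketch of such a functor under that assumption; the name ClipperSketch and the 0.5 bound are illustrative.

#include <algorithm>

// Element-wise clamping functor usable with Eigen's unaryExpr (sketch).
// The +/-0.5 default mirrors the explicit bias clamp seen in this file;
// the real Clipper may use a different bound.
struct ClipperSketch
{
  double bound;
  explicit ClipperSketch(double b = 0.5) : bound(b) {}
  double operator()(double x) const
  {
    return std::min(bound, std::max(x, -bound));
  }
};

With such a functor, gradient clipping clamps grad before it is scaled by the learning rate, for example grad.array().unaryExpr(ClipperSketch()).matrix(), while update clipping clamps the product learning_rate * grad instead, which is exactly the distinction the commented-out variants here draw.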
- //W->row(update_item) += learning_rate* - // W_gradient.row(update_item).array().unaryExpr(Clipper()).matrix(); - //SETTING THE GRADIENT TO ZERO - W_gradient.row(update_item).setZero(); - } - */ - } + for (int ngram=0; ngram<context_size; ngram++) + { + uscgemm(1.0, + USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), + W_gradient); + } + int_map update_map; //stores all the parameters that have been updated + for (int ngram=0; ngram<context_size; ngram++) + { + for (int train_id=0; train_id<input_words.cols(); train_id++) + { + update_map[input_words(ngram,train_id)] = 1; + } + } - template <typename DerivedGOut, typename DerivedIn> - void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &input_words, - double learning_rate, - double L2_reg) + // Convert to std::vector for parallelization + std::vector<int> update_items; + for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) { - int embedding_dimension = W->cols(); - //W_gradient.setZero(W->rows(), W->cols()); - /* + update_items.push_back(it->first); + } + int num_items = update_items.size(); + + #pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) + { + int update_item = update_items[item_id]; + //UPDATE CLIPPING + W->row(update_item) += (learning_rate* + W_gradient.row(update_item).array().unaryExpr(Clipper())).matrix(); + //GRADIENT CLIPPING + //W->row(update_item) += learning_rate* + // W_gradient.row(update_item).array().unaryExpr(Clipper()).matrix(); + //SETTING THE GRADIENT TO ZERO + W_gradient.row(update_item).setZero(); + } + */ + } + + template <typename DerivedGOut, typename DerivedIn> + void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &input_words, + double learning_rate, + double L2_reg) + { + int embedding_dimension = W->cols(); + //W_gradient.setZero(W->rows(), W->cols()); + /* if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols()) - W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon; - */ - for (int ngram=0; ngram<context_size; ngram++) - { - uscgemm(1.0, - USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), - bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), - W_gradient); - } - int_map update_map; //stores all the parameters that have been updated - for (int ngram=0; ngram<context_size; ngram++) - { - for (int train_id=0; train_id<input_words.cols(); train_id++) - { - update_map[input_words(ngram,train_id)] = 1; - } + W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon; + */ + for (int ngram=0; ngram<context_size; ngram++) + { + uscgemm(1.0, + USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), + W_gradient); + } + int_map update_map; //stores all the parameters that have been updated + for (int ngram=0; ngram<context_size; ngram++) + { + for (int train_id=0; train_id<input_words.cols(); train_id++) + { + update_map[input_words(ngram,train_id)] = 1; } + } - // Convert to std::vector for parallelization - std::vector<int> update_items; - for (int_map::iterator it = 
update_map.begin(); it != update_map.end(); ++it) - { - update_items.push_back(it->first); - } - int num_items = update_items.size(); - - #pragma omp parallel for - for (int item_id=0; item_id<num_items; item_id++) - { - int update_item = update_items[item_id]; - W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix(); - W->row(update_item) += learning_rate * - (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix(); - /* - //UPDATE CLIPPING - W->row(update_item) += (learning_rate * - (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt())) - .unaryExpr(Clipper()).matrix(); - */ - W_gradient.row(update_item).setZero(); - } - } - - template <typename DerivedGOut, typename DerivedIn> - void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &input_words, - double learning_rate, - double L2_reg, - double conditioning_constant, - double decay) + // Convert to std::vector for parallelization + std::vector<int> update_items; + for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) { - int embedding_dimension = W->cols(); + update_items.push_back(it->first); + } + int num_items = update_items.size(); - //W_gradient.setZero(W->rows(), W->cols()); +#pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) + { + int update_item = update_items[item_id]; + W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix(); + W->row(update_item) += learning_rate * + (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix(); /* - if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols()) - W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon; + //UPDATE CLIPPING + W->row(update_item) += (learning_rate * + (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt())) + .unaryExpr(Clipper()).matrix(); */ - for (int ngram=0; ngram<context_size; ngram++) - { - uscgemm(1.0, - USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), - bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), - W_gradient); - } - int_map update_map; //stores all the parameters that have been updated - for (int ngram=0; ngram<context_size; ngram++) - { - for (int train_id=0; train_id<input_words.cols(); train_id++) - { - update_map[input_words(ngram,train_id)] = 1; - } + W_gradient.row(update_item).setZero(); + } + } + + template <typename DerivedGOut, typename DerivedIn> + void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &input_words, + double learning_rate, + double L2_reg, + double conditioning_constant, + double decay) + { + int embedding_dimension = W->cols(); + + //W_gradient.setZero(W->rows(), W->cols()); + /* + if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols()) + W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon; + */ + for (int ngram=0; ngram<context_size; ngram++) + { + uscgemm(1.0, + USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), + W_gradient); + } + int_map update_map; //stores all the parameters that have been 
updated + for (int ngram=0; ngram<context_size; ngram++) + { + for (int train_id=0; train_id<input_words.cols(); train_id++) + { + update_map[input_words(ngram,train_id)] = 1; } + } - // Convert to std::vector for parallelization - std::vector<int> update_items; - for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) - { - update_items.push_back(it->first); - } - int num_items = update_items.size(); - - #pragma omp parallel for - for (int item_id=0; item_id<num_items; item_id++) - { - - Array<double,1,Dynamic> W_current_parameter_update; - int update_item = update_items[item_id]; - W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+ - (1.-decay)*W_gradient.row(update_item).array().square().matrix(); - - W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/ - (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())* - W_gradient.row(update_item).array(); - - //cerr<<"Input: W current parameter update is "<<W_current_parameter_update<<endl; - //getchar(); - W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+ - (1.-decay)*W_current_parameter_update.square().matrix(); - - W->row(update_item) += learning_rate*W_current_parameter_update.matrix(); - //cerr<<"Input: After update, W is "<<W->row(update_item)<<endl; - //getchar(); - W_gradient.row(update_item).setZero(); - } - - } - - template <typename DerivedGOut, typename DerivedIn, typename DerivedGW> - void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &input_words, - int x, int minibatch_size, - const MatrixBase<DerivedGW> &gradient) const //not sure if we want to use momentum here + // Convert to std::vector for parallelization + std::vector<int> update_items; + for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) { - UNCONST(DerivedGW, gradient, my_gradient); - int embedding_dimension = W->cols(); - my_gradient.setZero(); - for (int ngram=0; ngram<context_size; ngram++) - uscgemm(1.0, - USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), - bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), - my_gradient); + update_items.push_back(it->first); } + int num_items = update_items.size(); + +#pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) + { + + Array<double,1,Dynamic> W_current_parameter_update; + int update_item = update_items[item_id]; + W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+ + (1.-decay)*W_gradient.row(update_item).array().square().matrix(); + + W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/ + (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())* + W_gradient.row(update_item).array(); + + //cerr<<"Input: W current parameter update is "<<W_current_parameter_update<<endl; + //getchar(); + W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+ + (1.-decay)*W_current_parameter_update.square().matrix(); + + W->row(update_item) += learning_rate*W_current_parameter_update.matrix(); + //cerr<<"Input: After update, W is "<<W->row(update_item)<<endl; + //getchar(); + W_gradient.row(update_item).setZero(); + } + + } + + template <typename DerivedGOut, typename DerivedIn, typename DerivedGW> + void 
computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &input_words, + int x, int minibatch_size, + const MatrixBase<DerivedGW> &gradient) const //not sure if we want to use momentum here + { + UNCONST(DerivedGW, gradient, my_gradient); + int embedding_dimension = W->cols(); + my_gradient.setZero(); + for (int ngram=0; ngram<context_size; ngram++) + uscgemm(1.0, + USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), + my_gradient); + } }; } // namespace nplm - diff --git a/src/neuralLM.h b/src/neuralLM.h index 2004596..f0eebd8 100644 --- a/src/neuralLM.h +++ b/src/neuralLM.h @@ -6,119 +6,138 @@ #include <cstdlib> #include <boost/shared_ptr.hpp> -//#include <../3rdparty/Eigen/Dense> #include <Eigen/Dense> #include "util.h" #include "vocabulary.h" #include "neuralNetwork.h" +#include "replace_digits.hpp" /* To do: - move digit mapping into vocabulary.h - */ +*/ namespace nplm { -class neuralLM : public neuralNetwork +class neuralLM : public neuralNetwork, graehl::replace_digits { - char map_digits; - boost::shared_ptr<vocabulary> vocab; - int start, null; + boost::shared_ptr<vocabulary> vocab; + int start, null; -public: - neuralLM() + public: + neuralLM() : neuralNetwork(), - vocab(new vocabulary()), - map_digits(0) - { - } + graehl::replace_digits(0), + vocab(new vocabulary()) + { + } - void set_map_digits(char value) { map_digits = value; } + void set_map_digits(char value) { map_digits = value; } - void set_vocabulary(const vocabulary &vocab) - { - *(this->vocab) = vocab; - start = vocab.lookup_word("<s>"); - null = vocab.lookup_word("<null>"); - } + void set_vocabulary(const vocabulary &vocab) + { + *(this->vocab) = vocab; + start = vocab.lookup_word("<s>"); + null = vocab.lookup_word("<null>"); + } - const vocabulary &get_vocabulary() const { return *(this->vocab); } + const vocabulary &get_vocabulary() const { return *(this->vocab); } - int lookup_word(const std::string &word) const - { - if (map_digits) - for (int i=0; i<word.length(); i++) - if (isdigit(word[i])) - { - std::string mapped_word(word); - for (; i<word.length(); i++) - if (isdigit(word[i])) - mapped_word[i] = map_digits; - return vocab->lookup_word(mapped_word); - } - return vocab->lookup_word(word); - } + int lookup_input_word(const std::string &word) const + { + return lookup_word(word); + } - double lookup_ngram(const int *ngram_a, int n) - { - Eigen::Matrix<int,Eigen::Dynamic,1> ngram(m->ngram_size); - for (int i=0; i<m->ngram_size; i++) - { - if (i-m->ngram_size+n < 0) - { - if (ngram_a[0] == start) - ngram(i) = start; - else - ngram(i) = null; - } - else - { - ngram(i) = ngram_a[i-m->ngram_size+n]; - } - } - return neuralNetwork::lookup_ngram(ngram); - } + int lookup_input_word(std::pair<char const*, char const*> word) const + { + return lookup_word(word); + } - double lookup_ngram(const std::vector<int> &ngram_v) - { - return lookup_ngram(ngram_v.data(), ngram_v.size()); - } - template <typename Derived> - double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram) - { - return neuralNetwork::lookup_ngram(ngram); - } - - template <typename DerivedA, typename DerivedB> - void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const) - { - return neuralNetwork::lookup_ngram(ngram, log_probs_const); - } + int lookup_word(const std::string &word) const + { + if 
(map_digits) + for (int i=0, n=word.size(); i<n; ++i) + if (graehl::ascii_digit(word[i])) { + std::string mapped_word(word); + replace(mapped_word, i); + return vocab->lookup_word(mapped_word); + } + return vocab->lookup_word(word); + } + + int lookup_word(std::pair<char const*, char const*> slice) const + { + if (map_digits) + for (char const* i = slice.first; i != slice.second; ++i) + if (graehl::ascii_digit(*i)) { + std::string mapped_word(slice.first, slice.second); + replace(mapped_word, i - slice.first); + return vocab->lookup_word(mapped_word); + } + return vocab->lookup_word(slice); + } - void read(const std::string &filename) + double lookup_ngram(const int *ngram_a, int n) + { + Eigen::Matrix<int,Eigen::Dynamic,1> ngram(m->ngram_size); + for (int i=0; i<m->ngram_size; ++i) { - std::vector<std::string> words; - m->read(filename, words); - set_vocabulary(vocabulary(words)); - resize(); - // this is faster but takes more memory - //m->premultiply(); + if (i-m->ngram_size+n < 0) + { + if (ngram_a[0] == start) + ngram(i) = start; + else + ngram(i) = null; + } + else + { + ngram(i) = ngram_a[i-m->ngram_size+n]; + } } + return neuralNetwork::lookup_ngram(ngram); + } + + double lookup_ngram(const std::vector<int> &ngram_v) + { + return lookup_ngram(ngram_v.data(), ngram_v.size()); + } + + template <typename Derived> + double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram) + { + return neuralNetwork::lookup_ngram(ngram); + } + + template <typename DerivedA, typename DerivedB> + void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const) + { + return neuralNetwork::lookup_ngram(ngram, log_probs_const); + } + + void read(const std::string &filename) + { + std::vector<std::string> words; + m->read(filename, words); + set_vocabulary(vocabulary(words)); + resize(); + // this is faster but takes more memory + //m->premultiply(); + } }; template <typename T> void addStartStop(std::vector<T> &input, std::vector<T> &output, int ngram_size, const T &start, const T &stop) { - output.clear(); - output.resize(input.size()+ngram_size); - for (int i=0; i<ngram_size-1; i++) - output[i] = start; - std::copy(input.begin(), input.end(), output.begin()+ngram_size-1); - output[output.size()-1] = stop; + output.clear(); + output.resize(input.size()+ngram_size); + for (int i=0; i<ngram_size-1; ++i) + output[i] = start; + std::copy(input.begin(), input.end(), output.begin()+ngram_size-1); + output[output.size()-1] = stop; } template <typename T> @@ -127,21 +146,21 @@ void makeNgrams(const std::vector<T> &input, std::vector<std::vector<T> > &outpu output.clear(); for (int j=ngram_size-1; j<input.size(); j++) { - std::vector<T> ngram(input.begin() + (j-ngram_size+1), input.begin() + j+1); - output.push_back(ngram); + std::vector<T> ngram(input.begin() + (j-ngram_size+1), input.begin() + j+1); + output.push_back(ngram); } } -inline void preprocessWords(const std::vector<std::string> &words, - std::vector< std::vector<int> > &ngrams, - int ngram_size, - const vocabulary &vocab, - bool numberize, - bool add_start_stop, - bool ngramize) { +inline void preprocessWords(const std::vector<std::string> &words, + std::vector< std::vector<int> > &ngrams, + int ngram_size, + const vocabulary &vocab, + bool numberize, + bool add_start_stop, + bool ngramize) { int start = vocab.lookup_word("<s>"); int stop = vocab.lookup_word("</s>"); - + // convert words to ints std::vector<int> nums; if (numberize) { @@ -152,9 +171,9 @@ inline void preprocessWords(const 
std::vector<std::string> &words, else { for (int j=0; j<words.size(); j++) { nums.push_back(boost::lexical_cast<int>(words[j])); - } + } } - + // convert sequence to n-grams ngrams.clear(); if (ngramize) { @@ -168,10 +187,10 @@ inline void preprocessWords(const std::vector<std::string> &words, } else { if (nums.size() != ngram_size) - { - std::cerr << "error: wrong number of fields in line" << std::endl; - std::exit(1); - } + { + std::cerr << "error: wrong number of fields in line\n"; + std::exit(1); + } ngrams.push_back(nums); } } diff --git a/src/neuralNetwork.h b/src/neuralNetwork.h index ef96488..6386a0f 100644 --- a/src/neuralNetwork.h +++ b/src/neuralNetwork.h @@ -3,7 +3,6 @@ #include <vector> #include <boost/shared_ptr.hpp> -//#include <../3rdparty/Eigen/Dense> #include <Eigen/Dense> #include "util.h" @@ -16,191 +15,191 @@ namespace nplm class neuralNetwork { -protected: - boost::shared_ptr<model> m; + protected: + boost::shared_ptr<model> m; -private: - bool normalization; - double weight; + private: + bool normalization; + double weight; - propagator prop; + propagator prop; - std::size_t cache_size; - Eigen::Matrix<int,Dynamic,Dynamic> cache_keys; - std::vector<double> cache_values; - int cache_lookups, cache_hits; + std::size_t cache_size; + Eigen::Matrix<int,Dynamic,Dynamic> cache_keys; + std::vector<double> cache_values; + int cache_lookups, cache_hits; -public: - neuralNetwork() + public: + neuralNetwork() : m(new model()), normalization(false), - weight(1.), - prop(*m, 1), + weight(1.), + prop(*m, 1), cache_size(0) - { - } + { + } - void set_normalization(bool value) { normalization = value; } - void set_log_base(double value) { weight = 1./std::log(value); } - - // This must be called if the underlying model is resized. - void resize() { - if (cache_size) - { - cache_keys.resize(m->ngram_size, cache_size); - cache_keys.fill(-1); - } - prop.resize(); - } + void set_normalization(bool value) { normalization = value; } + void set_log_base(double value) { weight = 1./std::log(value); } - void set_width(int width) + // This must be called if the underlying model is resized. + void resize() { + if (cache_size) { - prop.resize(width); + cache_keys.resize(m->ngram_size, cache_size); + cache_keys.fill(-1); } - - template <typename Derived> - double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram) + prop.resize(); + } + + void set_width(int width) + { + prop.resize(width); + } + + template <typename Derived> + double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram) + { + assert (ngram.rows() == m->ngram_size); + assert (ngram.cols() == 1); + + std::size_t hash; + if (cache_size) { - assert (ngram.rows() == m->ngram_size); - assert (ngram.cols() == 1); - - std::size_t hash; - if (cache_size) - { - // First look in cache - hash = Eigen::hash_value(ngram) % cache_size; // defined in util.h - cache_lookups++; - if (cache_keys.col(hash) == ngram) - { - cache_hits++; - return cache_values[hash]; - } - } - - // Make sure that we're single threaded. 
Multithreading doesn't help, - // and in some cases can hurt quite a lot - int save_threads = omp_get_max_threads(); - omp_set_num_threads(1); - int save_eigen_threads = Eigen::nbThreads(); - Eigen::setNbThreads(1); - #ifdef __INTEL_MKL__ - int save_mkl_threads = mkl_get_max_threads(); - mkl_set_num_threads(1); - #endif - - prop.fProp(ngram.col(0)); - - int output = ngram(m->ngram_size-1, 0); - double log_prob; - - start_timer(3); - if (normalization) - { - Eigen::Matrix<double,Eigen::Dynamic,1> scores(m->output_vocab_size); - if (prop.skip_hidden) - prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores); - else - prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); - double logz = logsum(scores.col(0)); - log_prob = weight * (scores(output, 0) - logz); - } - else - { - if (prop.skip_hidden) - log_prob = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, 0); - else - log_prob = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, 0); - } - stop_timer(3); - - if (cache_size) - { - // Update cache - cache_keys.col(hash) = ngram; - cache_values[hash] = log_prob; - } - - #ifdef __INTEL_MKL__ - mkl_set_num_threads(save_mkl_threads); - #endif - Eigen::setNbThreads(save_eigen_threads); - omp_set_num_threads(save_threads); - - return log_prob; + // First look in cache + hash = Eigen::hash_value(ngram) % cache_size; // defined in util.h + cache_lookups++; + if (cache_keys.col(hash) == ngram) + { + cache_hits++; + return cache_values[hash]; + } } - // Look up many n-grams in parallel. - template <typename DerivedA, typename DerivedB> - void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const) - { - UNCONST(DerivedB, log_probs_const, log_probs); - assert (ngram.rows() == m->ngram_size); - //assert (ngram.cols() <= prop.get_minibatch_size()); - - prop.fProp(ngram); - - if (normalization) - { - Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> scores(m->output_vocab_size, ngram.cols()); - if (prop.skip_hidden) - prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores); - else - prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); - - // And softmax and loss - Matrix<double,Dynamic,Dynamic> output_probs(m->output_vocab_size, ngram.cols()); - double minibatch_log_likelihood; - SoftmaxLogLoss().fProp(scores.leftCols(ngram.cols()), ngram.row(m->ngram_size-1), output_probs, minibatch_log_likelihood); - for (int j=0; j<ngram.cols(); j++) - { - int output = ngram(m->ngram_size-1, j); - log_probs(0, j) = weight * output_probs(output, j); - } - } - else - { - for (int j=0; j<ngram.cols(); j++) - { - int output = ngram(m->ngram_size-1, j); - if (prop.skip_hidden) - log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, j); - else - log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, j); - } - } - } + // Make sure that we're single threaded. 
Multithreading doesn't help, + // and in some cases can hurt quite a lot + int save_threads = omp_get_max_threads(); + omp_set_num_threads(1); + int save_eigen_threads = Eigen::nbThreads(); + Eigen::setNbThreads(1); +#ifdef __INTEL_MKL__ + int save_mkl_threads = mkl_get_max_threads(); + mkl_set_num_threads(1); +#endif + + prop.fProp(ngram.col(0)); - int get_order() const { return m->ngram_size; } + int output = ngram(m->ngram_size-1, 0); + double log_prob; - void read(const std::string &filename) + start_timer(3); + if (normalization) { - m->read(filename); - resize(); - // this is faster but takes more memory - //m->premultiply(); + Eigen::Matrix<double,Eigen::Dynamic,1> scores(m->output_vocab_size); + if (prop.skip_hidden) + prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores); + else + prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); + double logz = logsum(scores.col(0)); + log_prob = weight * (scores(output, 0) - logz); } - - void set_cache(std::size_t cache_size) + else { - this->cache_size = cache_size; - cache_keys.resize(m->ngram_size, cache_size); - cache_keys.fill(-1); // clears cache - cache_values.resize(cache_size); - cache_lookups = cache_hits = 0; + if (prop.skip_hidden) + log_prob = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, 0); + else + log_prob = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, 0); } + stop_timer(3); - double cache_hit_rate() + if (cache_size) { - return static_cast<double>(cache_hits)/cache_lookups; + // Update cache + cache_keys.col(hash) = ngram; + cache_values[hash] = log_prob; } - void premultiply() +#ifdef __INTEL_MKL__ + mkl_set_num_threads(save_mkl_threads); +#endif + Eigen::setNbThreads(save_eigen_threads); + omp_set_num_threads(save_threads); + + return log_prob; + } + + // Look up many n-grams in parallel. 
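A minimal caller-side sketch of this batched interface, assuming a trained model file and already-numberized word ids; the file name, batch width, and zero-filled id matrix are illustrative only, not part of this code. Each column of the id matrix holds one n-gram, and one log-probability is written back per column:

    #include <Eigen/Dense>
    #include "neuralLM.h"

    void score_batch_example()
    {
        nplm::neuralLM lm;
        lm.read("model.nplm");               // hypothetical model file
        const int order = lm.get_order();
        const int batch = 64;                // illustrative minibatch width
        lm.set_width(batch);                 // widen the propagator to the batch size

        // One n-gram per column; real ids would come from lm.lookup_word() on the tokens.
        Eigen::MatrixXi ngrams = Eigen::MatrixXi::Zero(order, batch);
        Eigen::RowVectorXd log_probs(batch);

        lm.lookup_ngram(ngrams, log_probs);  // log_probs(0, j) = log P(w_j | context_j)
    }
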
+ template <typename DerivedA, typename DerivedB> + void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const) + { + UNCONST(DerivedB, log_probs_const, log_probs); + assert (ngram.rows() == m->ngram_size); + //assert (ngram.cols() <= prop.get_minibatch_size()); + + prop.fProp(ngram); + + if (normalization) + { + Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> scores(m->output_vocab_size, ngram.cols()); + if (prop.skip_hidden) + prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores); + else + prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); + + // And softmax and loss + Matrix<double,Dynamic,Dynamic> output_probs(m->output_vocab_size, ngram.cols()); + double minibatch_log_likelihood; + SoftmaxLogLoss().fProp(scores.leftCols(ngram.cols()), ngram.row(m->ngram_size-1), output_probs, minibatch_log_likelihood); + for (int j=0; j<ngram.cols(); j++) + { + int output = ngram(m->ngram_size-1, j); + log_probs(0, j) = weight * output_probs(output, j); + } + } + else + { + for (int j=0; j<ngram.cols(); j++) + { + int output = ngram(m->ngram_size-1, j); + if (prop.skip_hidden) + log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, j); + else + log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, j); + } + } + } + + int get_order() const { return m->ngram_size; } + + void read(const std::string &filename) + { + m->read(filename); + resize(); + // this is faster but takes more memory + //m->premultiply(); + } + + void set_cache(std::size_t cache_size) + { + this->cache_size = cache_size; + cache_keys.resize(m->ngram_size, cache_size); + cache_keys.fill(-1); // clears cache + cache_values.resize(cache_size); + cache_lookups = cache_hits = 0; + } + + double cache_hit_rate() + { + return static_cast<double>(cache_hits)/cache_lookups; + } + + void premultiply() + { + if (!m->premultiplied) { - if (!m->premultiplied) - { - m->premultiply(); - } + m->premultiply(); } + } }; diff --git a/src/neuralTM.h b/src/neuralTM.h index 14bc7bf..9bb6d16 100644 --- a/src/neuralTM.h +++ b/src/neuralTM.h @@ -6,125 +6,139 @@ #include <cstdlib> #include <boost/shared_ptr.hpp> -#include <../3rdparty/Eigen/Dense> +#include <Eigen/Dense> #include "util.h" #include "vocabulary.h" #include "neuralNetwork.h" +#include "replace_digits.hpp" namespace nplm { -class neuralTM : public neuralNetwork +class neuralTM : public neuralNetwork, graehl::replace_digits { - char map_digits; - boost::shared_ptr<vocabulary> input_vocab, output_vocab; - int start, null; + boost::shared_ptr<vocabulary> input_vocab, output_vocab; + int start, null; -public: - neuralTM() + public: + neuralTM() : neuralNetwork(), - map_digits(0), + graehl::replace_digits(0), input_vocab(new vocabulary()), output_vocab(new vocabulary()) - { - } - - void set_map_digits(char value) { map_digits = value; } - - void set_input_vocabulary(const vocabulary &vocab) - { - *(this->input_vocab) = vocab; - start = vocab.lookup_word("<s>"); - null = vocab.lookup_word("<null>"); - } - - void set_output_vocabulary(const vocabulary &vocab) - { - *(this->output_vocab) = vocab; - } - - const vocabulary &get_input_vocabulary() const { return *(this->input_vocab); } - const vocabulary &get_output_vocabulary() const { return *(this->output_vocab); } - - int lookup_input_word(const std::string &word) const - { - if (map_digits) 
- for (int i=0; i<word.length(); i++) - if (isdigit(word[i])) - { - std::string mapped_word(word); - for (; i<word.length(); i++) - if (isdigit(word[i])) - mapped_word[i] = map_digits; - return input_vocab->lookup_word(mapped_word); - } - return input_vocab->lookup_word(word); - } - - int lookup_output_word(const std::string &word) const - { - if (map_digits) - for (int i=0; i<word.length(); i++) - if (isdigit(word[i])) - { - std::string mapped_word(word); - for (; i<word.length(); i++) - if (isdigit(word[i])) - mapped_word[i] = map_digits; - return output_vocab->lookup_word(mapped_word); - } - return output_vocab->lookup_word(word); - } - - double lookup_ngram(const int *ngram_a, int n) - { - Eigen::Matrix<int,Eigen::Dynamic,1> ngram(m->ngram_size); - for (int i=0; i<m->ngram_size; i++) - { - if (i-m->ngram_size+n < 0) - { - if (ngram_a[0] == start) - ngram(i) = start; - else - ngram(i) = null; - } - else - { - ngram(i) = ngram_a[i-m->ngram_size+n]; - } - } - return neuralNetwork::lookup_ngram(ngram); - } - - double lookup_ngram(const std::vector<int> &ngram_v) - { - return lookup_ngram(ngram_v.data(), ngram_v.size()); - } - - template <typename Derived> - double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram) - { - return neuralNetwork::lookup_ngram(ngram); - } - - template <typename DerivedA, typename DerivedB> - void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const) - { - return neuralNetwork::lookup_ngram(ngram, log_probs_const); - } - - void read(const std::string &filename) + { + } + + void set_map_digits(char value) { map_digits = value; } + + void set_input_vocabulary(const vocabulary &vocab) + { + *(this->input_vocab) = vocab; + start = vocab.lookup_word("<s>"); + null = vocab.lookup_word("<null>"); + } + + void set_output_vocabulary(const vocabulary &vocab) + { + *(this->output_vocab) = vocab; + } + + const vocabulary &get_input_vocabulary() const { return *(this->input_vocab); } + const vocabulary &get_output_vocabulary() const { return *(this->output_vocab); } + + int lookup_word(const std::string &word, vocabulary const& vocab) const + { + if (map_digits) + for (int i=0, n=word.size(); i<n; ++i) + if (graehl::ascii_digit(word[i])) { + std::string mapped_word(word); + replace(mapped_word, i); + return vocab.lookup_word(mapped_word); + } + return vocab.lookup_word(word); + } + + int lookup_word(std::pair<char const*, char const*> slice, vocabulary const& vocab) const + { + if (map_digits) + for (char const* i = slice.first; i != slice.second; ++i) + if (graehl::ascii_digit(*i)) { + std::string mapped_word(slice.first, slice.second); + replace(mapped_word, i - slice.first); + return vocab.lookup_word(mapped_word); + } + return vocab.lookup_word(slice); + } + + int lookup_input_word(const std::string &word) const + { + return lookup_word(word, *input_vocab); + } + + int lookup_output_word(const std::string &word) const + { + return lookup_word(word, *output_vocab); + } + + int lookup_input_word(std::pair<char const*, char const*> word) const + { + return lookup_word(word, *input_vocab); + } + + int lookup_output_word(std::pair<char const*, char const*> word) const + { + return lookup_word(word, *output_vocab); + } + + double lookup_ngram(const int *ngram_a, int n) + { + Eigen::Matrix<int,Eigen::Dynamic,1> ngram(m->ngram_size); + for (int i=0; i<m->ngram_size; i++) { - std::vector<std::string> input_words; - std::vector<std::string> output_words; - m->read(filename, input_words, output_words); - 
set_input_vocabulary(vocabulary(input_words)); - set_output_vocabulary(vocabulary(output_words)); - resize(); - // this is faster but takes more memory - //m->premultiply(); + if (i-m->ngram_size+n < 0) + { + if (ngram_a[0] == start) + ngram(i) = start; + else + ngram(i) = null; + } + else + { + ngram(i) = ngram_a[i-m->ngram_size+n]; + } } + return neuralNetwork::lookup_ngram(ngram); + } + + double lookup_ngram(const std::vector<int> &ngram_v) + { + return lookup_ngram(ngram_v.data(), ngram_v.size()); + } + + template <typename Derived> + double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram) + { + return neuralNetwork::lookup_ngram(ngram); + } + + template <typename DerivedA, typename DerivedB> + void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const) + { + return neuralNetwork::lookup_ngram(ngram, log_probs_const); + } + + void read(const std::string &filename) + { + std::vector<std::string> input_words; + std::vector<std::string> output_words; + m->read(filename, input_words, output_words); + set_input_vocabulary(vocabulary(input_words)); + set_output_vocabulary(vocabulary(output_words)); + resize(); + // this is faster but takes more memory + //m->premultiply(); + } }; diff --git a/src/prepareNeuralLM.cpp b/src/prepareNeuralLM.cpp index adedc72..d5fc16b 100644 --- a/src/prepareNeuralLM.cpp +++ b/src/prepareNeuralLM.cpp @@ -2,19 +2,19 @@ #include <vector> #include <queue> #include <deque> -# include <fstream> -# include <iterator> - -# include <boost/unordered_map.hpp> -# include <boost/algorithm/string/join.hpp> -# include <boost/interprocess/managed_shared_memory.hpp> -# include <boost/interprocess/allocators/allocator.hpp> -# include <boost/interprocess/managed_mapped_file.hpp> +#include <fstream> +#include <iterator> + +#include <boost/unordered_map.hpp> +#include <boost/algorithm/string/join.hpp> +#include <boost/interprocess/managed_shared_memory.hpp> +#include <boost/interprocess/allocators/allocator.hpp> +#include <boost/interprocess/managed_mapped_file.hpp> #include <boost/interprocess/containers/vector.hpp> #include <boost/random/mersenne_twister.hpp> #include <boost/random/uniform_int_distribution.hpp> -# include <tclap/CmdLine.h> +#include <tclap/CmdLine.h> #include "neuralLM.h" #include "util.h" @@ -36,314 +36,313 @@ typedef std::vector<vec,vecAllocator> vecvec; typedef long long int data_size_t; // training data can easily exceed 2G instances template<typename T> -void writeNgrams(const T &data, - int ngram_size, - const vocabulary &vocab, - bool numberize, - bool add_start_stop, - bool ngramize, - const string &filename) - { - ofstream file(filename.c_str()); - if (!file) +void writeNgrams(const T &data, + int ngram_size, + const vocabulary &vocab, + bool numberize, + bool add_start_stop, + bool ngramize, + const string &filename) +{ + ofstream file(filename.c_str()); + if (!file) + { + cerr << "error: could not open " << filename << endl; + exit(1); + } + + vector<vector<int> > ngrams; + + for (int i=0; i<data.size(); i++) { + preprocessWords(data[i], ngrams, ngram_size, vocab, numberize, add_start_stop, ngramize); + // write out n-grams + for (int j=0; j<ngrams.size(); j++) { - cerr << "error: could not open " << filename << endl; - exit(1); - } - - vector<vector<int> > ngrams; - - for (int i=0; i<data.size(); i++) { - preprocessWords(data[i], ngrams, ngram_size, vocab, numberize, add_start_stop, ngramize); - // write out n-grams - for (int j=0; j<ngrams.size(); j++) - { - for (int k=0; k<ngram_size; k++) - { 
- file << ngrams[j][k] << " "; - } - file << endl; - } + for (int k=0; k<ngram_size; k++) + { + file << ngrams[j][k] << " "; + } + file << endl; } - file.close(); + } + file.close(); } // Space efficient version for writing the n-grams. // They are not read into memory. -void writeNgrams(const string &input_filename, - int ngram_size, - const vocabulary &vocab, - bool numberize, - bool add_start_stop, - bool ngramize, - const string &filename, - int train_data_size, - vector<float> &sent_weights, - const string &sent_weights_filename) +void writeNgrams(const string &input_filename, + int ngram_size, + const vocabulary &vocab, + bool numberize, + bool add_start_stop, + bool ngramize, + const string &filename, + int train_data_size, + vector<float> &sent_weights, + const string &sent_weights_filename) { - ofstream file(filename.c_str()); - ofstream output_sent_weights_file(sent_weights_filename.c_str()); - if (!file) - { - cerr << "error: could not open " << filename << endl; - exit(1); + ofstream file(filename.c_str()); + ofstream output_sent_weights_file(sent_weights_filename.c_str()); + if (!file) + { + cerr << "error: could not open " << filename << endl; + exit(1); + } + + ifstream input_file(input_filename.c_str()); + vector<vector<int> > ngrams; + //for (int i=0; i<train_data.size(); i++) { + string line; + int counter = 0; + cerr<<"Processed ... "; + while (getline(input_file,line) && train_data_size-- > 0) { + counter++; + if ((counter % 100000) == 0) { + cerr<<counter<<" training lines ... "; } - - ifstream input_file(input_filename.c_str()); - vector<vector<int> > ngrams; - //for (int i=0; i<train_data.size(); i++) { - string line; - int counter = 0; - cerr<<"Processed ... "; - while (getline(input_file,line) && train_data_size-- > 0) { - counter++; - if ((counter % 100000) == 0) { - cerr<<counter<<" training lines ... "; - } - //stringstream lstr(line); - vector<string> lstr_items; - splitBySpace(line,lstr_items); + //stringstream lstr(line); + vector<string> lstr_items; + splitBySpace(line,lstr_items); //for (int i=0; i<data.size(); i++) { - preprocessWords(lstr_items, - ngrams, - ngram_size, - vocab, - numberize, - add_start_stop, - ngramize); - - // write out n-grams - for (int j=0; j<ngrams.size(); j++) - { - if (sent_weights.size() != 0) { - output_sent_weights_file <<sent_weights[counter-1]<<endl; - } - for (int k=0; k<ngram_size; k++) - { - file << ngrams[j][k] << " "; - } - file << endl; - } + preprocessWords(lstr_items, + ngrams, + ngram_size, + vocab, + numberize, + add_start_stop, + ngramize); + + // write out n-grams + for (int j=0; j<ngrams.size(); j++) + { + if (sent_weights.size() != 0) { + output_sent_weights_file <<sent_weights[counter-1]<<endl; + } + for (int k=0; k<ngram_size; k++) + { + file << ngrams[j][k] << " "; + } + file << endl; } - cerr<<endl; - input_file.close(); - file.close(); - output_sent_weights_file.close(); + } + cerr<<endl; + input_file.close(); + file.close(); + output_sent_weights_file.close(); } // Space efficient version for writing the n-grams. // They are not read into memory. 
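The memory-mapped writer below lays the numberized n-grams out as a flat vector of ints, ngram_size entries per n-gram, inside a Boost.Interprocess managed mapped file and, when --randomize is set, shuffles them in blocks of roughly five million n-grams (the final block absorbs the remainder). A self-contained sketch of that row shuffle on an ordinary std::vector, assuming the same fixed seed; as in the loop below, the swap partner j is drawn from [0, i-1] rather than [0, i]:

    #include <utility>
    #include <vector>
    #include <boost/random/mersenne_twister.hpp>
    #include <boost/random/uniform_int_distribution.hpp>

    // Shuffle fixed-width rows stored back to back in a flat vector of word ids.
    void shuffle_rows(std::vector<int> &flat, int ngram_size)
    {
        boost::random::mt19937 rng(1234);    // fixed seed, mirroring the writer below
        long long num_rows = flat.size() / ngram_size;
        for (long long i = num_rows - 1; i > 0; i--)
        {
            long long j = boost::random::uniform_int_distribution<long long>(0, i - 1)(rng);
            for (int k = 0; k < ngram_size; k++)
                std::swap(flat[i * ngram_size + k], flat[j * ngram_size + k]);
        }
    }
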
-void writeMmapNgrams(const string &input_filename, - int ngram_size, - const vocabulary &vocab, - bool numberize, - bool add_start_stop, - bool ngramize, - const string &filename, - unsigned long train_data_size, - data_size_t num_tokens, - bool randomize) +void writeMmapNgrams(const string &input_filename, + int ngram_size, + const vocabulary &vocab, + bool numberize, + bool add_start_stop, + bool ngramize, + const string &filename, + unsigned long train_data_size, + data_size_t num_tokens, + bool randomize) { - cerr<<"Num tokens is "<<num_tokens<<endl; - cerr<<"Training data size is "<<train_data_size<<endl; - // Open the memory mapped file and create the allocators - ip::managed_mapped_file mfile(ip::create_only, - filename.c_str(), - num_tokens*ngram_size*sizeof(int)+1024UL*1024UL); - intAllocator ialloc(mfile.get_segment_manager()); - vecAllocator valloc (mfile.get_segment_manager()); - //vecvec *mMapVecVec= mfile.construct<vecvec>("data")(num_tokens,vec(ialloc),valloc); - - vec *mMapVec= mfile.construct<vec>("vector")(num_tokens*ngram_size,0,ialloc); - - cerr<<"The size of mmaped vec is "<<mMapVec->size()<<endl; - // Going over every line in the input file and - // printing the memory mapped ngrams into the - // output file - ifstream input_file(input_filename.c_str()); - //for (int i=0; i<train_data.size(); i++) { - string line; - int counter = 0; - cerr<<"Processed ... "; - long int train_ngram_counter = 0; - vector<vector<int> > ngrams; - while (getline(input_file,line) && train_data_size-- > 0) { - counter++; - if ((counter % 100000) ==0) { - //cerr<<"counter is "<<counter<<endl; - cerr<<counter<<" training lines ... "; - } - //stringstream lstr(line); - vector<string> lstr_items; - splitBySpace(line,lstr_items); + cerr<<"Num tokens is "<<num_tokens<<endl; + cerr<<"Training data size is "<<train_data_size<<endl; + // Open the memory mapped file and create the allocators + ip::managed_mapped_file mfile(ip::create_only, + filename.c_str(), + num_tokens*ngram_size*sizeof(int)+1024UL*1024UL); + intAllocator ialloc(mfile.get_segment_manager()); + vecAllocator valloc (mfile.get_segment_manager()); + //vecvec *mMapVecVec= mfile.construct<vecvec>("data")(num_tokens,vec(ialloc),valloc); + + vec *mMapVec= mfile.construct<vec>("vector")(num_tokens*ngram_size,0,ialloc); + + cerr<<"The size of mmaped vec is "<<mMapVec->size()<<endl; + // Going over every line in the input file and + // printing the memory mapped ngrams into the + // output file + ifstream input_file(input_filename.c_str()); + //for (int i=0; i<train_data.size(); i++) { + string line; + int counter = 0; + cerr<<"Processed ... "; + long int train_ngram_counter = 0; + vector<vector<int> > ngrams; + while (getline(input_file,line) && train_data_size-- > 0) { + counter++; + if ((counter % 100000) ==0) { + //cerr<<"counter is "<<counter<<endl; + cerr<<counter<<" training lines ... 
"; + } + //stringstream lstr(line); + vector<string> lstr_items; + splitBySpace(line,lstr_items); //for (int i=0; i<data.size(); i++) { - preprocessWords(lstr_items, ngrams, - ngram_size, - vocab, - numberize, - add_start_stop, - ngramize); - /* + preprocessWords(lstr_items, ngrams, + ngram_size, + vocab, + numberize, + add_start_stop, + ngramize); + /* cerr<<"line is "<<endl; cerr<<line<<endl; cerr<<"Number of ngrams is "<<ngrams.size()<<endl; - if (ngrams.size() ==1 ){ - cerr<<"The line number was "<<counter<<endl; - cerr<<line<<endl; + if (ngrams.size() ==1 ){ + cerr<<"The line number was "<<counter<<endl; + cerr<<line<<endl; + } + */ + // write out n-grams in mmapped file + for (int j=0; j<ngrams.size(); j++) + { + /* + for (int k=0; k<ngram_size; k++) + { + cerr << ngrams[j][k] << " "; } + cerr<< endl; */ - // write out n-grams in mmapped file - for (int j=0; j<ngrams.size(); j++) - { - /* - for (int k=0; k<ngram_size; k++) - { - cerr << ngrams[j][k] << " "; - } - cerr<< endl; - */ - for (int k=0; k<ngram_size; k++) { - mMapVec->at(train_ngram_counter*ngram_size+k) = ngrams[j][k]; - } - train_ngram_counter++; - //cerr<<"Train ngram counter is "<<train_ngram_counter<<endl; - } + for (int k=0; k<ngram_size; k++) { + mMapVec->at(train_ngram_counter*ngram_size+k) = ngrams[j][k]; + } + train_ngram_counter++; + //cerr<<"Train ngram counter is "<<train_ngram_counter<<endl; } - cerr<<endl; - input_file.close(); - - // Shrink the file if it was overused - ip::managed_mapped_file::shrink_to_fit(filename.c_str()); - //now to randomize the items if the randomize flag was set - if (randomize == true) { - unsigned seed = 1234; //for testing only - mt19937 rng(seed); - cerr<<"Randomly shuffling data..."; - data_size_t counter =0; - while (counter < num_tokens) { - data_size_t upper_limit = counter+5000000; - long int vector_size = 5000000; - if (counter + 10000000 >= num_tokens) { - upper_limit = num_tokens; - vector_size = num_tokens - counter; - } - vector<int> temp(vector_size*ngram_size,0); - for (int i=0;i<vector_size;i++){ - for (int k=0;k<ngram_size;k++) { - temp[i*ngram_size+k] = mMapVec->at((i+counter)*ngram_size+k); - } - } - for (data_size_t i=vector_size-1; i>0; i--) - { - if (i %500000 == 0) { - cerr<<"Shuffled "<<num_tokens-1<<" instances..."; - } - data_size_t j = boost::random::uniform_int_distribution<data_size_t>(0, i-1)(rng); - for (int k=0;k<ngram_size;k++) { - int temp_val = temp.at(i*ngram_size+k); - temp.at(i*ngram_size+k) = - temp.at(j*ngram_size+k); - temp.at(j*ngram_size+k) = temp_val; - } - } - //Putting it back - for (int i=0;i<vector_size;i++){ - for (int k=0;k<ngram_size;k++) { - mMapVec->at((i+counter)*ngram_size+k) = temp[i*ngram_size+k]; - } - } - counter = upper_limit; + } + cerr<<endl; + input_file.close(); + + // Shrink the file if it was overused + ip::managed_mapped_file::shrink_to_fit(filename.c_str()); + //now to randomize the items if the randomize flag was set + if (randomize == true) { + unsigned seed = 1234; //for testing only + boost::random::mt19937 rng(seed); + cerr<<"Randomly shuffling data..."; + data_size_t counter =0; + while (counter < num_tokens) { + data_size_t upper_limit = counter+5000000; + long int vector_size = 5000000; + if (counter + 10000000 >= num_tokens) { + upper_limit = num_tokens; + vector_size = num_tokens - counter; + } + vector<int> temp(vector_size*ngram_size,0); + for (int i=0;i<vector_size;i++){ + for (int k=0;k<ngram_size;k++) { + temp[i*ngram_size+k] = mMapVec->at((i+counter)*ngram_size+k); } - - /* - for (data_size_t 
i=num_tokens-1; i>0; i--) + } + for (data_size_t i=vector_size-1; i>0; i--) { if (i %500000 == 0) { cerr<<"Shuffled "<<num_tokens-1<<" instances..."; } data_size_t j = boost::random::uniform_int_distribution<data_size_t>(0, i-1)(rng); for (int k=0;k<ngram_size;k++) { - int temp_val = mMapVec->at(i*ngram_size+k); - mMapVec->at(i*ngram_size+k) = - mMapVec->at(j*ngram_size+k); - mMapVec->at(j*ngram_size+k) = temp_val; + int temp_val = temp.at(i*ngram_size+k); + temp.at(i*ngram_size+k) = + temp.at(j*ngram_size+k); + temp.at(j*ngram_size+k) = temp_val; } } - */ - cerr<<endl; + //Putting it back + for (int i=0;i<vector_size;i++){ + for (int k=0;k<ngram_size;k++) { + mMapVec->at((i+counter)*ngram_size+k) = temp[i*ngram_size+k]; + } + } + counter = upper_limit; } + + /* + for (data_size_t i=num_tokens-1; i>0; i--) + { + if (i %500000 == 0) { + cerr<<"Shuffled "<<num_tokens-1<<" instances..."; + } + data_size_t j = boost::random::uniform_int_distribution<data_size_t>(0, i-1)(rng); + for (int k=0;k<ngram_size;k++) { + int temp_val = mMapVec->at(i*ngram_size+k); + mMapVec->at(i*ngram_size+k) = + mMapVec->at(j*ngram_size+k); + mMapVec->at(j*ngram_size+k) = temp_val; + } + } + */ + cerr<<endl; + } } int main(int argc, char *argv[]) { - ios::sync_with_stdio(false); - int ngram_size, vocab_size, validation_size; - bool numberize, - ngramize, - add_start_stop, - mmap_file, - randomize; - - string train_text, - train_file, - validation_text, - validation_file, - words_file, - write_words_file, - sent_weights_text, - output_sent_weights_text; - - try - { - CmdLine cmd("Prepares training data for training a language model.", ' ', "0.1"); - - // The options are printed in reverse order + ios::sync_with_stdio(false); + int ngram_size, vocab_size, validation_size; + bool numberize, + ngramize, + add_start_stop, + mmap_file, + randomize; + + string train_text, + train_file, + validation_text, + validation_file, + words_file, + write_words_file, + sent_weights_text, + output_sent_weights_text; + + try + { + CmdLine cmd("Prepares training data for training a language model.", ' ', "0.1"); + + // The options are printed in reverse order ValueArg<bool> arg_ngramize("", "ngramize", "If true, convert lines to ngrams. Default: true.", false, true, "bool", cmd); ValueArg<bool> arg_numberize("", "numberize", "If true, convert words to numbers. Default: true.", false, true, "bool", cmd); ValueArg<bool> arg_add_start_stop("", "add_start_stop", "If true, prepend <s> and append </s>. Default: true.", false, true, "bool", cmd); ValueArg<bool> arg_mmap_file("", "mmap_file", "If true, the training file will be a memory mapped file. \n This is " - "needed if the entire training data cannot fit in memory. Default: false.", false, false, "bool", cmd); + "needed if the entire training data cannot fit in memory. Default: false.", false, false, "bool", cmd); ValueArg<bool> arg_randomize("", "randomize", "If true, Randomly shuffle the training ngrams. It can only be used with mmap_file =1 . 
Default: false.", false, false, "bool", cmd); ValueArg<int> arg_vocab_size("", "vocab_size", "Vocabulary size.", false, -1, "int", cmd); ValueArg<string> arg_words_file("", "words_file", "File specifying words that should be included in vocabulary; all other words will be replaced by <unk>.", false, "", "string", cmd); ValueArg<int> arg_ngram_size("", "ngram_size", "Size of n-grams.", true, -1, "int", cmd); - ValueArg<string> arg_write_words_file("", "write_words_file", "Output vocabulary.", false, "", "string", cmd); + ValueArg<string> arg_write_words_file("", "write_words_file", "Output vocabulary.", false, "", "string", cmd); ValueArg<int> arg_validation_size("", "validation_size", "How many lines from training data to hold out for validation. Default: 0.", false, 0, "int", cmd); - ValueArg<string> arg_validation_file("", "validation_file", "Output validation data (numberized n-grams).", false, "", "string", cmd); - ValueArg<string> arg_validation_text("", "validation_text", "Input validation data (tokenized). Overrides --validation_size. Default: none.", false, "", "string", cmd); - ValueArg<string> arg_train_file("", "train_file", "Output training data (numberized n-grams).", false, "", "string", cmd); - ValueArg<string> arg_train_text("", "train_text", "Input training data (tokenized).", true, "", "string", cmd); - //ValueArg<string> arg_sent_weights_text("", "sent_weights_text", "The sentence weights text", false, "", "string", cmd); - //ValueArg<string> arg_sent_weights_file("", "sent_weights_file", "The file to write the per ngram weights", false, "", "string", cmd); - - - - cmd.parse(argc, argv); - - train_text = arg_train_text.getValue(); - train_file = arg_train_file.getValue(); - validation_text = arg_validation_text.getValue(); - validation_file = arg_validation_file.getValue(); - validation_size = arg_validation_size.getValue(); - write_words_file = arg_write_words_file.getValue(); - ngram_size = arg_ngram_size.getValue(); - vocab_size = arg_vocab_size.getValue(); - words_file = arg_words_file.getValue(); - numberize = arg_numberize.getValue(); - ngramize = arg_ngramize.getValue(); - add_start_stop = arg_add_start_stop.getValue(); - mmap_file = arg_mmap_file.getValue(); - randomize = arg_randomize.getValue(); - //sent_weights_text = arg_sent_weights_text.getValue(); - //output_sent_weights_text = arg_sent_weights_file.getValue(); - sent_weights_text = ""; - output_sent_weights_text = ""; + ValueArg<string> arg_validation_file("", "validation_file", "Output validation data (numberized n-grams).", false, "", "string", cmd); + ValueArg<string> arg_validation_text("", "validation_text", "Input validation data (tokenized). Overrides --validation_size. 
Default: none.", false, "", "string", cmd); + ValueArg<string> arg_train_file("", "train_file", "Output training data (numberized n-grams).", false, "", "string", cmd); + ValueArg<string> arg_train_text("", "train_text", "Input training data (tokenized).", true, "", "string", cmd); + //ValueArg<string> arg_sent_weights_text("", "sent_weights_text", "The sentence weights text", false, "", "string", cmd); + //ValueArg<string> arg_sent_weights_file("", "sent_weights_file", "The file to write the per ngram weights", false, "", "string", cmd); + + + cmd.parse(argc, argv); + + train_text = arg_train_text.getValue(); + train_file = arg_train_file.getValue(); + validation_text = arg_validation_text.getValue(); + validation_file = arg_validation_file.getValue(); + validation_size = arg_validation_size.getValue(); + write_words_file = arg_write_words_file.getValue(); + ngram_size = arg_ngram_size.getValue(); + vocab_size = arg_vocab_size.getValue(); + words_file = arg_words_file.getValue(); + numberize = arg_numberize.getValue(); + ngramize = arg_ngramize.getValue(); + add_start_stop = arg_add_start_stop.getValue(); + mmap_file = arg_mmap_file.getValue(); + randomize = arg_randomize.getValue(); + //sent_weights_text = arg_sent_weights_text.getValue(); + //output_sent_weights_text = arg_sent_weights_file.getValue(); + sent_weights_text = ""; + output_sent_weights_text = ""; // check command line arguments @@ -364,292 +363,292 @@ int main(int argc, char *argv[]) cerr << "Command line: " << endl; cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl; - - const string sep(" Value: "); - cerr << arg_train_text.getDescription() << sep << arg_train_text.getValue() << endl; - cerr << arg_train_file.getDescription() << sep << arg_train_file.getValue() << endl; - cerr << arg_validation_text.getDescription() << sep << arg_validation_text.getValue() << endl; - cerr << arg_validation_file.getDescription() << sep << arg_validation_file.getValue() << endl; - cerr << arg_validation_size.getDescription() << sep << arg_validation_size.getValue() << endl; - cerr << arg_write_words_file.getDescription() << sep << arg_write_words_file.getValue() << endl; - cerr << arg_ngram_size.getDescription() << sep << arg_ngram_size.getValue() << endl; - cerr << arg_vocab_size.getDescription() << sep << arg_vocab_size.getValue() << endl; - cerr << arg_words_file.getDescription() << sep << arg_words_file.getValue() << endl; - cerr << arg_numberize.getDescription() << sep << arg_numberize.getValue() << endl; - cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl; - cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl; - cerr << arg_mmap_file.getDescription() << sep << arg_mmap_file.getValue() << endl; - //cerr << arg_sent_weights_text.getDescription() << sep << arg_sent_weights_text.getValue() << endl; - } - catch (TCLAP::ArgException &e) - { - cerr << "error: " << e.error() << " for arg " << e.argId() << endl; - exit(1); - } - // VLF: why is this true? - // DC: it's because the vocabulary has to be constructed from the training data only. - // If the vocabulary is preset, we can't create the validation data. - // - if --numberize 0 is set, then --validation_size cannot be used. - // if (!numberize && (validation_size > 0)) { - // cerr << "Warning: without setting --numberize to 1, --validation_size cannot be used." 
<< endl; - // } - - // Read in training data and validation data - // vector<vector<string> > train_data; - // readSentFile(train_text, train_data); - // @vaswani: No more reading the entire training file into memory - // Reading it per line with file io - - //for (int i=0; i<train_data.size(); i++) { - // Go over every line in the file and - // 1. if the !ngramize then you should check if - // we have the correct number of items per line - // 2. build the vocabulary if the words file has not - // been specified. - // Construct vocabulary - vocabulary vocab; - int start, stop; - // Add start stop if the vocabulary has not been supplied - if (words_file == "") { - vocab.insert_word("<s>"); - vocab.insert_word("</s>"); - vocab.insert_word("<null>"); - // warn user that if --numberize is not set, there will be no vocabulary! - if (!numberize) { - cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" << endl; - } - } - if (mmap_file == false && randomize == true) { - cerr<<"Randomize option can only be used with mmap_file = 1"<<endl; - exit(1); + const string sep(" Value: "); + cerr << arg_train_text.getDescription() << sep << arg_train_text.getValue() << endl; + cerr << arg_train_file.getDescription() << sep << arg_train_file.getValue() << endl; + cerr << arg_validation_text.getDescription() << sep << arg_validation_text.getValue() << endl; + cerr << arg_validation_file.getDescription() << sep << arg_validation_file.getValue() << endl; + cerr << arg_validation_size.getDescription() << sep << arg_validation_size.getValue() << endl; + cerr << arg_write_words_file.getDescription() << sep << arg_write_words_file.getValue() << endl; + cerr << arg_ngram_size.getDescription() << sep << arg_ngram_size.getValue() << endl; + cerr << arg_vocab_size.getDescription() << sep << arg_vocab_size.getValue() << endl; + cerr << arg_words_file.getDescription() << sep << arg_words_file.getValue() << endl; + cerr << arg_numberize.getDescription() << sep << arg_numberize.getValue() << endl; + cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl; + cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl; + cerr << arg_mmap_file.getDescription() << sep << arg_mmap_file.getValue() << endl; + //cerr << arg_sent_weights_text.getDescription() << sep << arg_sent_weights_text.getValue() << endl; + } + catch (TCLAP::ArgException &e) + { + cerr << "error: " << e.error() << " for arg " << e.argId() << endl; + exit(1); + } + + // VLF: why is this true? + // DC: it's because the vocabulary has to be constructed from the training data only. + // If the vocabulary is preset, we can't create the validation data. + // - if --numberize 0 is set, then --validation_size cannot be used. + // if (!numberize && (validation_size > 0)) { + // cerr << "Warning: without setting --numberize to 1, --validation_size cannot be used." << endl; + // } + + // Read in training data and validation data + // vector<vector<string> > train_data; + // readSentFile(train_text, train_data); + // @vaswani: No more reading the entire training file into memory + // Reading it per line with file io + + //for (int i=0; i<train_data.size(); i++) { + // Go over every line in the file and + // 1. if the !ngramize then you should check if + // we have the correct number of items per line + // 2. build the vocabulary if the words file has not + // been specified. 
+ // Construct vocabulary + vocabulary vocab; + int start, stop; + // Add start stop if the vocabulary has not been supplied + if (words_file == "") { + vocab.insert_word("<s>"); + vocab.insert_word("</s>"); + vocab.insert_word("<null>"); + // warn user that if --numberize is not set, there will be no vocabulary! + if (!numberize) { + cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" << endl; } - unordered_map<string,int> count; // For keeping word counts if no supplied vocab - - deque<vector<string> > validation_data; - int train_data_size=0; - cerr<<"Processed ... "; - data_size_t num_tokens=0; - - ifstream training(train_text.c_str()); - - string line; - while (getline(training,line)) { - train_data_size++; - //stringstream lstr(line); - vector<string> lstr_items; - splitBySpace(line,lstr_items); - // if data is already ngramized, set/check ngram_size - if (!ngramize) { - if (ngram_size > 0) { - if (ngram_size != lstr_items.size()) { - cerr << "Error: size of training ngrams does not match specified value of --ngram_size!" << endl; - } - } - // else if --ngram_size has not been specified, set it now - else { - ngram_size=lstr_items.size(); - } + } + if (mmap_file == false && randomize == true) { + cerr<<"Randomize option can only be used with mmap_file = 1"<<endl; + exit(1); + } + unordered_map<string,int> count; // For keeping word counts if no supplied vocab + + deque<vector<string> > validation_data; + int train_data_size=0; + cerr<<"Processed ... "; + data_size_t num_tokens=0; + + ifstream training(train_text.c_str()); + + string line; + while (getline(training,line)) { + train_data_size++; + //stringstream lstr(line); + vector<string> lstr_items; + splitBySpace(line,lstr_items); + // if data is already ngramized, set/check ngram_size + if (!ngramize) { + if (ngram_size > 0) { + if (ngram_size != lstr_items.size()) { + cerr << "Error: size of training ngrams does not match specified value of --ngram_size!" << endl; + } } - if ((train_data_size%100000)==0){ - cerr<<train_data_size<<" lines ... "; + // else if --ngram_size has not been specified, set it now + else { + ngram_size=lstr_items.size(); } - //break; - /* + } + if ((train_data_size%100000)==0){ + cerr<<train_data_size<<" lines ... 
"; + } + //break; + /* if (lstr_items.size() ==1) { - cerr<<"line :"<<endl; - cerr<<line<<endl; - cerr<<"The number of items was 1"<<endl; - getchar(); - } - */ - num_tokens += lstr_items.size()+1; - if (words_file == "") { - for (int j=0; j<lstr_items.size(); j++) { - count[lstr_items[j]] += 1; - } + cerr<<"line :"<<endl; + cerr<<line<<endl; + cerr<<"The number of items was 1"<<endl; + getchar(); } - // Add to validation set if the validation size - // has not been specified - if (validation_text == "" && validation_size > 0) { - //cerr<<"validation size is "<<validation_data.size()<<endl; - if (validation_data.size() == validation_size) { - //validation_data.erase(validation_data.begin()); - validation_data.pop_front(); - } - validation_data.push_back(lstr_items); + */ + num_tokens += lstr_items.size()+1; + if (words_file == "") { + for (int j=0; j<lstr_items.size(); j++) { + count[lstr_items[j]] += 1; } } - cerr<<endl; - training.close(); - //cerr<<"validation size is "<<validation_data.size()<<endl; - //getchar(); - if (validation_data.size() < validation_size) { - cerr<<"validation size is "<<validation_data.size()<<endl; - cerr << "error: requested validation size is greater than training data size" << endl; - exit(1); + // Add to validation set if the validation size + // has not been specified + if (validation_text == "" && validation_size > 0) { + //cerr<<"validation size is "<<validation_data.size()<<endl; + if (validation_data.size() == validation_size) { + //validation_data.erase(validation_data.begin()); + validation_data.pop_front(); + } + validation_data.push_back(lstr_items); } - - train_data_size -= validation_size; - cerr<<"Training data size is "<<train_data_size<<endl; - - // The items in the validation data have already been counted - // Decrementing the counts of those words before building the vocabulary - for(int i=0; i<validation_data.size(); i++){ - num_tokens -= (validation_data[i].size() +1); - for (int j=0; j<validation_data[i].size();j++){ - count[validation_data[i][j]] -= 1; - if (count[validation_data[i][j]] == 0) { - count.erase(validation_data[i][j]); - } + } + cerr<<endl; + training.close(); + //cerr<<"validation size is "<<validation_data.size()<<endl; + //getchar(); + if (validation_data.size() < validation_size) { + cerr<<"validation size is "<<validation_data.size()<<endl; + cerr << "error: requested validation size is greater than training data size" << endl; + exit(1); + } + + train_data_size -= validation_size; + cerr<<"Training data size is "<<train_data_size<<endl; + + // The items in the validation data have already been counted + // Decrementing the counts of those words before building the vocabulary + for(int i=0; i<validation_data.size(); i++){ + num_tokens -= (validation_data[i].size() +1); + for (int j=0; j<validation_data[i].size();j++){ + count[validation_data[i][j]] -= 1; + if (count[validation_data[i][j]] == 0) { + count.erase(validation_data[i][j]); } } + } - // Getting the top n frequent words for the vocabulary - if (words_file == "") { - vocab.insert_most_frequent(count, vocab_size); - if (vocab.size() < vocab_size) { - cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl; - } + // Getting the top n frequent words for the vocabulary + if (words_file == "") { + vocab.insert_most_frequent(count, vocab_size); + if (vocab.size() < vocab_size) { + cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl; } - 
//vector<vector<string> > validation_data; - if (validation_text != "") { - readSentFile(validation_text, validation_data); - for (int i=0; i<validation_data.size(); i++) { - // if data is already ngramized, set/check ngram_size - if (!ngramize) { - // if --ngram_size has been specified, check that it does not conflict with --ngram_size - if (ngram_size > 0) { - if (ngram_size != validation_data[i].size()) { - cerr << "Error: size of validation ngrams does not match specified value of --ngram_size!" << endl; - } - } - // else if --ngram_size has not been specified, set it now - else { - ngram_size=validation_data[i].size(); - } - } + } + //vector<vector<string> > validation_data; + if (validation_text != "") { + readSentFile(validation_text, validation_data); + for (int i=0; i<validation_data.size(); i++) { + // if data is already ngramized, set/check ngram_size + if (!ngramize) { + // if --ngram_size has been specified, check that it does not conflict with --ngram_size + if (ngram_size > 0) { + if (ngram_size != validation_data[i].size()) { + cerr << "Error: size of validation ngrams does not match specified value of --ngram_size!" << endl; + } } + // else if --ngram_size has not been specified, set it now + else { + ngram_size=validation_data[i].size(); + } + } } - //READING SENTENCE WEIGHTS IF THERE ARE ANY - vector<float> sent_weights; - if (sent_weights_text != "") { - cerr<<"Reading sentence weights from "<<sent_weights_text<<endl; - ifstream sent_weights_file(sent_weights_text.c_str()); - string line; - readWeightsFile(sent_weights_file,sent_weights); - sent_weights_file.close(); - if (sent_weights_text.size() != train_data_size) { - cerr<<"The number of sentence weights does not match the number of training sentences"<<endl; - } + } + //READING SENTENCE WEIGHTS IF THERE ARE ANY + vector<float> sent_weights; + if (sent_weights_text != "") { + cerr<<"Reading sentence weights from "<<sent_weights_text<<endl; + ifstream sent_weights_file(sent_weights_text.c_str()); + string line; + readWeightsFile(sent_weights_file,sent_weights); + sent_weights_file.close(); + if (sent_weights_text.size() != train_data_size) { + cerr<<"The number of sentence weights does not match the number of training sentences"<<endl; } - - /* + } + + /* else if (validation_size > 0) { - // Create validation data - if (validation_size > train_data.size()) - { - cerr << "error: requested validation size is greater than training data size" << endl; - exit(1); - } - validation_data.insert(validation_data.end(), train_data.end()-validation_size, train_data.end()); - train_data.resize(train_data.size() - validation_size); + // Create validation data + if (validation_size > train_data.size()) + { + cerr << "error: requested validation size is greater than training data size" << endl; + exit(1); } - */ - - // Construct vocabulary - //vocabulary vocab; - //int start, stop; - - // read vocabulary from file - if (words_file != "") { - vector<string> words; - readWordsFile(words_file,words); - for(vector<string>::iterator it = words.begin(); it != words.end(); ++it) { - vocab.insert_word(*it); - } - - // was vocab_size set? 
if so, verify that it does not conflict with size of vocabulary read from file - if (vocab_size > 0) { - if (vocab.size() != vocab_size) { - cerr << "Error: size of vocabulary file " << vocab.size() << " != --vocab_size " << vocab_size << endl; - } - } - // else, set it to the size of vocabulary read from file - else { - vocab_size = vocab.size(); - } - + validation_data.insert(validation_data.end(), train_data.end()-validation_size, train_data.end()); + train_data.resize(train_data.size() - validation_size); } - /* - // construct vocabulary to contain top <vocab_size> most frequent words; all other words replaced by <unk> - else { - vocab.insert_word("<s>"); - vocab.insert_word("</s>"); - vocab.insert_word("<null>"); - - // warn user that if --numberize is not set, there will be no vocabulary! - if (!numberize) { - cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" << endl; - } - unordered_map<string,int> count; - for (int i=0; i<train_data.size(); i++) { - for (int j=0; j<train_data[i].size(); j++) { - count[train_data[i][j]] += 1; - } - } - - vocab.insert_most_frequent(count, vocab_size); - if (vocab.size() < vocab_size) { - cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl; - } + */ + + // Construct vocabulary + //vocabulary vocab; + //int start, stop; + + // read vocabulary from file + if (words_file != "") { + vector<string> words; + readWordsFile(words_file,words); + for(vector<string>::iterator it = words.begin(); it != words.end(); ++it) { + vocab.insert_word(*it); } - */ - // write vocabulary to file - if (write_words_file != "") { - cerr << "Writing vocabulary to " << write_words_file << endl; - writeWordsFile(vocab.words(), write_words_file); + // was vocab_size set? if so, verify that it does not conflict with size of vocabulary read from file + if (vocab_size > 0) { + if (vocab.size() != vocab_size) { + cerr << "Error: size of vocabulary file " << vocab.size() << " != --vocab_size " << vocab_size << endl; + } } - - // Write out numberized n-grams - if (train_file != "") - { - cerr << "Writing training data to " << train_file << endl; - if (mmap_file == true) { - writeMmapNgrams(train_text, - ngram_size, - vocab, - numberize, - add_start_stop, - ngramize, - train_file, - train_data_size, - num_tokens, - randomize); - } else { - writeNgrams(train_text, - ngram_size, - vocab, - numberize, - add_start_stop, - ngramize, - train_file, - train_data_size, - sent_weights, - output_sent_weights_text); - } + // else, set it to the size of vocabulary read from file + else { + vocab_size = vocab.size(); } - if (validation_file != "") - { - cerr << "Writing validation data to " << validation_file << endl; - writeNgrams(validation_data, - ngram_size, - vocab, - numberize, - add_start_stop, - ngramize, - validation_file); + + } + /* + // construct vocabulary to contain top <vocab_size> most frequent words; all other words replaced by <unk> + else { + vocab.insert_word("<s>"); + vocab.insert_word("</s>"); + vocab.insert_word("<null>"); + + // warn user that if --numberize is not set, there will be no vocabulary! + if (!numberize) { + cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" 
<< endl; + } + unordered_map<string,int> count; + for (int i=0; i<train_data.size(); i++) { + for (int j=0; j<train_data[i].size(); j++) { + count[train_data[i][j]] += 1; + } + } + + vocab.insert_most_frequent(count, vocab_size); + if (vocab.size() < vocab_size) { + cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl; + } + } + */ + + // write vocabulary to file + if (write_words_file != "") { + cerr << "Writing vocabulary to " << write_words_file << endl; + writeWordsFile(vocab.words(), write_words_file); + } + + // Write out numberized n-grams + if (train_file != "") + { + cerr << "Writing training data to " << train_file << endl; + if (mmap_file == true) { + writeMmapNgrams(train_text, + ngram_size, + vocab, + numberize, + add_start_stop, + ngramize, + train_file, + train_data_size, + num_tokens, + randomize); + } else { + writeNgrams(train_text, + ngram_size, + vocab, + numberize, + add_start_stop, + ngramize, + train_file, + train_data_size, + sent_weights, + output_sent_weights_text); } + } + if (validation_file != "") + { + cerr << "Writing validation data to " << validation_file << endl; + writeNgrams(validation_data, + ngram_size, + vocab, + numberize, + add_start_stop, + ngramize, + validation_file); + } } diff --git a/src/propagator.h b/src/propagator.h index 9f214de..6344f2f 100644 --- a/src/propagator.h +++ b/src/propagator.h @@ -13,360 +13,359 @@ using Eigen::MatrixBase; using Eigen::Dynamic; class propagator { - int minibatch_size; - model *pnn; - -public: - Node<Input_word_embeddings> input_layer_node; - Node<Linear_layer> first_hidden_linear_node; - Node<Activation_function> first_hidden_activation_node; - Node<Linear_layer> second_hidden_linear_node; - Node<Activation_function> second_hidden_activation_node; - Node<Output_word_embeddings> output_layer_node; - bool skip_hidden; - -public: - propagator () : minibatch_size(0), pnn(0) { } - - propagator (model &nn, int minibatch_size) + int minibatch_size; + model *pnn; + + public: + Node<Input_word_embeddings> input_layer_node; + Node<Linear_layer> first_hidden_linear_node; + Node<Activation_function> first_hidden_activation_node; + Node<Linear_layer> second_hidden_linear_node; + Node<Activation_function> second_hidden_activation_node; + Node<Output_word_embeddings> output_layer_node; + bool skip_hidden; + + public: + propagator () : minibatch_size(0), pnn(0) { } + + propagator (model &nn, int minibatch_size) : - pnn(&nn), - input_layer_node(&nn.input_layer, minibatch_size), - first_hidden_linear_node(&nn.first_hidden_linear, minibatch_size), - first_hidden_activation_node(&nn.first_hidden_activation, minibatch_size), - second_hidden_linear_node(&nn.second_hidden_linear, minibatch_size), - second_hidden_activation_node(&nn.second_hidden_activation, minibatch_size), - output_layer_node(&nn.output_layer, minibatch_size), - minibatch_size(minibatch_size) - { - skip_hidden = (nn.num_hidden == 0); - } + pnn(&nn), + input_layer_node(&nn.input_layer, minibatch_size), + first_hidden_linear_node(&nn.first_hidden_linear, minibatch_size), + first_hidden_activation_node(&nn.first_hidden_activation, minibatch_size), + second_hidden_linear_node(&nn.second_hidden_linear, minibatch_size), + second_hidden_activation_node(&nn.second_hidden_activation, minibatch_size), + output_layer_node(&nn.output_layer, minibatch_size), + minibatch_size(minibatch_size) + { + skip_hidden = (nn.num_hidden == 0); + } - // This must be called if the underlying model is resized. 
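// For orientation, a rough sketch of how this class is driven, mirroring calls that appear
// later in trainNeuralNetwork.cpp; the setup and the literal hyperparameter values here are
// assumptions, not part of this patch:
void example_training_step(nplm::model &nn,
                           const Eigen::Matrix<int,Eigen::Dynamic,Eigen::Dynamic> &minibatch,
                           const Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> &minibatch_weights,
                           int ngram_size, int minibatch_size, double learning_rate)
{
    nplm::propagator prop(nn, minibatch_size);       // one Node per layer of nn
    prop.fProp(minibatch.topRows(ngram_size-1));     // forward pass up to the last hidden layer
    // ... score the output layer and run the loss to fill minibatch_weights ...
    std::string update("SGD");                       // bProp takes the update type by reference
    prop.bProp(minibatch.topRows(ngram_size-1),
               minibatch_weights,
               learning_rate, /*momentum=*/0.0, /*L2_reg=*/0.0,
               update, /*conditioning_constant=*/0.0, /*decay=*/0.0);
}
// As the comment above says, propagator::resize() must be called again whenever the
// underlying model is resized.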
- void resize(int minibatch_size) { - this->minibatch_size = minibatch_size; - input_layer_node.resize(minibatch_size); - first_hidden_linear_node.resize(minibatch_size); - first_hidden_activation_node.resize(minibatch_size); - second_hidden_linear_node.resize(minibatch_size); - second_hidden_activation_node.resize(minibatch_size); - output_layer_node.resize(minibatch_size); - } + // This must be called if the underlying model is resized. + void resize(int minibatch_size) { + this->minibatch_size = minibatch_size; + input_layer_node.resize(minibatch_size); + first_hidden_linear_node.resize(minibatch_size); + first_hidden_activation_node.resize(minibatch_size); + second_hidden_linear_node.resize(minibatch_size); + second_hidden_activation_node.resize(minibatch_size); + output_layer_node.resize(minibatch_size); + } - void resize() { resize(minibatch_size); } + void resize() { resize(minibatch_size); } - template <typename Derived> - void fProp(const MatrixBase<Derived> &data) + template <typename Derived> + void fProp(const MatrixBase<Derived> &data) + { + if (!pnn->premultiplied) { - if (!pnn->premultiplied) - { - start_timer(0); - input_layer_node.param->fProp(data, input_layer_node.fProp_matrix); - stop_timer(0); - - start_timer(1); - first_hidden_linear_node.param->fProp(input_layer_node.fProp_matrix, - first_hidden_linear_node.fProp_matrix); - } - else - { - int n_inputs = first_hidden_linear_node.param->n_inputs(); - USCMatrix<double> sparse_data; - input_layer_node.param->munge(data, sparse_data); - - start_timer(1); - first_hidden_linear_node.param->fProp(sparse_data, - first_hidden_linear_node.fProp_matrix); - } - first_hidden_activation_node.param->fProp(first_hidden_linear_node.fProp_matrix, - first_hidden_activation_node.fProp_matrix); - //std::cerr<<"in fprop first hidden activation node fprop is "<<first_hidden_activation_node.fProp_matrix<<std::endl; - //std::getchar(); - stop_timer(1); - - - if (!skip_hidden) { - start_timer(2); - second_hidden_linear_node.param->fProp(first_hidden_activation_node.fProp_matrix, - second_hidden_linear_node.fProp_matrix); - second_hidden_activation_node.param->fProp(second_hidden_linear_node.fProp_matrix, - second_hidden_activation_node.fProp_matrix); - stop_timer(2); - } - - // The propagation stops here because the last layer is very expensive. - } + start_timer(0); + input_layer_node.param->fProp(data, input_layer_node.fProp_matrix); + stop_timer(0); - // Dense version (for standard log-likelihood) - template <typename DerivedIn, typename DerivedOut> - void bProp(const MatrixBase<DerivedIn> &data, - const MatrixBase<DerivedOut> &output, - double learning_rate, - double momentum, - double L2_reg, - std::string ¶meter_update, - double conditioning_constant, - double decay) + start_timer(1); + first_hidden_linear_node.param->fProp(input_layer_node.fProp_matrix, + first_hidden_linear_node.fProp_matrix); + } + else { - // Output embedding layer - - start_timer(7); - output_layer_node.param->bProp(output, - output_layer_node.bProp_matrix); - stop_timer(7); - - start_timer(8); - Node<Activation_function> & final_hidden_activation_node = skip_hidden ? 
first_hidden_activation_node : second_hidden_activation_node; - if (parameter_update == "SGD") { - output_layer_node.param->computeGradient(final_hidden_activation_node.fProp_matrix, - output, - learning_rate, - momentum); - } else if (parameter_update == "ADA") { - output_layer_node.param->computeGradientAdagrad(final_hidden_activation_node.fProp_matrix, - output, - learning_rate); - } else if (parameter_update == "ADAD") { - //std::cerr<<"Adadelta gradient"<<endl; - int current_minibatch_size = final_hidden_activation_node.fProp_matrix.cols(); - output_layer_node.param->computeGradientAdadelta(final_hidden_activation_node.fProp_matrix, - output, - 1.0/current_minibatch_size, - conditioning_constant, - decay); - } else { - std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl; - } - stop_timer(8); - - bPropRest(data, - learning_rate, - momentum, - L2_reg, - parameter_update, - conditioning_constant, - decay); + int n_inputs = first_hidden_linear_node.param->n_inputs(); + USCMatrix<double> sparse_data; + input_layer_node.param->munge(data, sparse_data); + + start_timer(1); + first_hidden_linear_node.param->fProp(sparse_data, + first_hidden_linear_node.fProp_matrix); + } + first_hidden_activation_node.param->fProp(first_hidden_linear_node.fProp_matrix, + first_hidden_activation_node.fProp_matrix); + //std::cerr<<"in fprop first hidden activation node fprop is "<<first_hidden_activation_node.fProp_matrix<<std::endl; + //std::getchar(); + stop_timer(1); + + + if (!skip_hidden) { + start_timer(2); + second_hidden_linear_node.param->fProp(first_hidden_activation_node.fProp_matrix, + second_hidden_linear_node.fProp_matrix); + second_hidden_activation_node.param->fProp(second_hidden_linear_node.fProp_matrix, + second_hidden_activation_node.fProp_matrix); + stop_timer(2); } - // Sparse version (for NCE log-likelihood) - template <typename DerivedIn, typename DerivedOutI, typename DerivedOutV> - void bProp(const MatrixBase<DerivedIn> &data, - const MatrixBase<DerivedOutI> &samples, - const MatrixBase<DerivedOutV> &weights, - double learning_rate, - double momentum, - double L2_reg, - std::string ¶meter_update, - double conditioning_constant, - double decay) - { + // The propagation stops here because the last layer is very expensive. + } + + // Dense version (for standard log-likelihood) + template <typename DerivedIn, typename DerivedOut> + void bProp(const MatrixBase<DerivedIn> &data, + const MatrixBase<DerivedOut> &output, + double learning_rate, + double momentum, + double L2_reg, + std::string ¶meter_update, + double conditioning_constant, + double decay) + { + // Output embedding layer + + start_timer(7); + output_layer_node.param->bProp(output, + output_layer_node.bProp_matrix); + stop_timer(7); + + start_timer(8); + Node<Activation_function> & final_hidden_activation_node = skip_hidden ? 
first_hidden_activation_node : second_hidden_activation_node; + if (parameter_update == "SGD") { + output_layer_node.param->computeGradient(final_hidden_activation_node.fProp_matrix, + output, + learning_rate, + momentum); + } else if (parameter_update == "ADA") { + output_layer_node.param->computeGradientAdagrad(final_hidden_activation_node.fProp_matrix, + output, + learning_rate); + } else if (parameter_update == "ADAD") { + //std::cerr<<"Adadelta gradient"<<endl; + int current_minibatch_size = final_hidden_activation_node.fProp_matrix.cols(); + output_layer_node.param->computeGradientAdadelta(final_hidden_activation_node.fProp_matrix, + output, + 1.0/current_minibatch_size, + conditioning_constant, + decay); + } else { + std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl; + } + stop_timer(8); - // Output embedding layer - - start_timer(7); - output_layer_node.param->bProp(samples, - weights, - output_layer_node.bProp_matrix); - stop_timer(7); - - - start_timer(8); - Node<Activation_function> & final_hidden_activation_node = skip_hidden ? first_hidden_activation_node : second_hidden_activation_node; - if (parameter_update == "SGD") { - output_layer_node.param->computeGradient(final_hidden_activation_node.fProp_matrix, - samples, - weights, - learning_rate, - momentum); - } else if (parameter_update == "ADA") { - output_layer_node.param->computeGradientAdagrad(final_hidden_activation_node.fProp_matrix, - samples, - weights, - learning_rate); - } else if (parameter_update == "ADAD") { - int current_minibatch_size = final_hidden_activation_node.fProp_matrix.cols(); - //std::cerr<<"Adadelta gradient"<<endl; - output_layer_node.param->computeGradientAdadelta(final_hidden_activation_node.fProp_matrix, - samples, - weights, - 1.0/current_minibatch_size, - conditioning_constant, - decay); - } else { - std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl; + bPropRest(data, + learning_rate, + momentum, + L2_reg, + parameter_update, + conditioning_constant, + decay); } - stop_timer(8); + // Sparse version (for NCE log-likelihood) + template <typename DerivedIn, typename DerivedOutI, typename DerivedOutV> + void bProp(const MatrixBase<DerivedIn> &data, + const MatrixBase<DerivedOutI> &samples, + const MatrixBase<DerivedOutV> &weights, + double learning_rate, + double momentum, + double L2_reg, + std::string ¶meter_update, + double conditioning_constant, + double decay) + { - bPropRest(data, - learning_rate, - momentum, - L2_reg, - parameter_update, - conditioning_constant, - decay); + // Output embedding layer + + start_timer(7); + output_layer_node.param->bProp(samples, + weights, + output_layer_node.bProp_matrix); + stop_timer(7); + + + start_timer(8); + Node<Activation_function> & final_hidden_activation_node = skip_hidden ? 
first_hidden_activation_node : second_hidden_activation_node; + if (parameter_update == "SGD") { + output_layer_node.param->computeGradient(final_hidden_activation_node.fProp_matrix, + samples, + weights, + learning_rate, + momentum); + } else if (parameter_update == "ADA") { + output_layer_node.param->computeGradientAdagrad(final_hidden_activation_node.fProp_matrix, + samples, + weights, + learning_rate); + } else if (parameter_update == "ADAD") { + int current_minibatch_size = final_hidden_activation_node.fProp_matrix.cols(); + //std::cerr<<"Adadelta gradient"<<endl; + output_layer_node.param->computeGradientAdadelta(final_hidden_activation_node.fProp_matrix, + samples, + weights, + 1.0/current_minibatch_size, + conditioning_constant, + decay); + } else { + std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl; } -private: - template <typename DerivedIn> - void bPropRest(const MatrixBase<DerivedIn> &data, - double learning_rate, double momentum, double L2_reg, - std::string ¶meter_update, - double conditioning_constant, - double decay) - { - // Second hidden layer + stop_timer(8); + bPropRest(data, + learning_rate, + momentum, + L2_reg, + parameter_update, + conditioning_constant, + decay); + } - - // All the compute gradient functions are together and the backprop - // functions are together - ////////BACKPROP//////////// - start_timer(9); - if (skip_hidden) + private: + template <typename DerivedIn> + void bPropRest(const MatrixBase<DerivedIn> &data, + double learning_rate, double momentum, double L2_reg, + std::string ¶meter_update, + double conditioning_constant, + double decay) { - start_timer(9); - first_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix, + // Second hidden layer + + + + // All the compute gradient functions are together and the backprop + // functions are together + ////////BACKPROP//////////// + start_timer(9); + if (skip_hidden) + { + start_timer(9); + first_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix, first_hidden_activation_node.bProp_matrix, first_hidden_linear_node.fProp_matrix, first_hidden_activation_node.fProp_matrix); - first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix, - first_hidden_linear_node.bProp_matrix); - stop_timer(9); + first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix, + first_hidden_linear_node.bProp_matrix); + stop_timer(9); - } - else - { - second_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix, - second_hidden_activation_node.bProp_matrix, - second_hidden_linear_node.fProp_matrix, - second_hidden_activation_node.fProp_matrix); + } + else + { + second_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix, + second_hidden_activation_node.bProp_matrix, + second_hidden_linear_node.fProp_matrix, + second_hidden_activation_node.fProp_matrix); - second_hidden_linear_node.param->bProp(second_hidden_activation_node.bProp_matrix, - second_hidden_linear_node.bProp_matrix); - stop_timer(9); + second_hidden_linear_node.param->bProp(second_hidden_activation_node.bProp_matrix, + second_hidden_linear_node.bProp_matrix); + stop_timer(9); - start_timer(11); - first_hidden_activation_node.param->bProp(second_hidden_linear_node.bProp_matrix, - first_hidden_activation_node.bProp_matrix, - first_hidden_linear_node.fProp_matrix, - first_hidden_activation_node.fProp_matrix); + start_timer(11); + first_hidden_activation_node.param->bProp(second_hidden_linear_node.bProp_matrix, + 
first_hidden_activation_node.bProp_matrix, + first_hidden_linear_node.fProp_matrix, + first_hidden_activation_node.fProp_matrix); - first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix, - first_hidden_linear_node.bProp_matrix); - stop_timer(11); - } - //std::cerr<<"First hidden layer node backprop matrix is"<<first_hidden_linear_node.bProp_matrix<<std::endl; - //std::getchar(); - ////COMPUTE GRADIENT///////// - if (parameter_update == "SGD") { - if (!skip_hidden) - { - start_timer(10); - second_hidden_linear_node.param->computeGradient(second_hidden_activation_node.bProp_matrix, - first_hidden_activation_node.fProp_matrix, - learning_rate, - momentum, - L2_reg); - stop_timer(10); + first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix, + first_hidden_linear_node.bProp_matrix); + stop_timer(11); } - - // First hidden layer - - - start_timer(12); - first_hidden_linear_node.param->computeGradient(first_hidden_activation_node.bProp_matrix, - input_layer_node.fProp_matrix, - learning_rate, momentum, L2_reg); - stop_timer(12); - - // Input word embeddings - - start_timer(13); - input_layer_node.param->computeGradient(first_hidden_linear_node.bProp_matrix, - data, - learning_rate, momentum, L2_reg); - stop_timer(13); - } else if (parameter_update == "ADA") { - if (!skip_hidden) - { - start_timer(10); - second_hidden_linear_node.param->computeGradientAdagrad(second_hidden_activation_node.bProp_matrix, - first_hidden_activation_node.fProp_matrix, - learning_rate, - L2_reg); - stop_timer(10); + //std::cerr<<"First hidden layer node backprop matrix is"<<first_hidden_linear_node.bProp_matrix<<std::endl; + //std::getchar(); + ////COMPUTE GRADIENT///////// + if (parameter_update == "SGD") { + if (!skip_hidden) + { + start_timer(10); + second_hidden_linear_node.param->computeGradient(second_hidden_activation_node.bProp_matrix, + first_hidden_activation_node.fProp_matrix, + learning_rate, + momentum, + L2_reg); + stop_timer(10); + } + + // First hidden layer + + + start_timer(12); + first_hidden_linear_node.param->computeGradient(first_hidden_activation_node.bProp_matrix, + input_layer_node.fProp_matrix, + learning_rate, momentum, L2_reg); + stop_timer(12); + + // Input word embeddings + + start_timer(13); + input_layer_node.param->computeGradient(first_hidden_linear_node.bProp_matrix, + data, + learning_rate, momentum, L2_reg); + stop_timer(13); + } else if (parameter_update == "ADA") { + if (!skip_hidden) + { + start_timer(10); + second_hidden_linear_node.param->computeGradientAdagrad(second_hidden_activation_node.bProp_matrix, + first_hidden_activation_node.fProp_matrix, + learning_rate, + L2_reg); + stop_timer(10); + } + + // First hidden layer + + + start_timer(12); + first_hidden_linear_node.param->computeGradientAdagrad(first_hidden_activation_node.bProp_matrix, + input_layer_node.fProp_matrix, + learning_rate, + L2_reg); + stop_timer(12); + + // Input word embeddings + + start_timer(13); + input_layer_node.param->computeGradientAdagrad(first_hidden_linear_node.bProp_matrix, + data, + learning_rate, + L2_reg); + stop_timer(13); + } else if (parameter_update == "ADAD") { + int current_minibatch_size = first_hidden_activation_node.fProp_matrix.cols(); + //std::cerr<<"Adadelta gradient"<<endl; + if (!skip_hidden) + { + start_timer(10); + second_hidden_linear_node.param->computeGradientAdadelta(second_hidden_activation_node.bProp_matrix, + first_hidden_activation_node.fProp_matrix, + 1.0/current_minibatch_size, + L2_reg, + conditioning_constant, + 
decay); + stop_timer(10); + } + //std::cerr<<"Finished gradient for second hidden linear layer"<<std::endl; + + // First hidden layer + + + start_timer(12); + first_hidden_linear_node.param->computeGradientAdadelta(first_hidden_activation_node.bProp_matrix, + input_layer_node.fProp_matrix, + 1.0/current_minibatch_size, + L2_reg, + conditioning_constant, + decay); + stop_timer(12); + + //std::cerr<<"Finished gradient for first hidden linear layer"<<std::endl; + // Input word embeddings + + start_timer(13); + input_layer_node.param->computeGradientAdadelta(first_hidden_linear_node.bProp_matrix, + data, + 1.0/current_minibatch_size, + L2_reg, + conditioning_constant, + decay); + stop_timer(13); + + //std::cerr<<"Finished gradient for first input layer"<<std::endl; + } else { + std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl; } - // First hidden layer - - - start_timer(12); - first_hidden_linear_node.param->computeGradientAdagrad(first_hidden_activation_node.bProp_matrix, - input_layer_node.fProp_matrix, - learning_rate, - L2_reg); - stop_timer(12); - - // Input word embeddings - - start_timer(13); - input_layer_node.param->computeGradientAdagrad(first_hidden_linear_node.bProp_matrix, - data, - learning_rate, - L2_reg); - stop_timer(13); - } else if (parameter_update == "ADAD") { - int current_minibatch_size = first_hidden_activation_node.fProp_matrix.cols(); - //std::cerr<<"Adadelta gradient"<<endl; - if (!skip_hidden) - { - start_timer(10); - second_hidden_linear_node.param->computeGradientAdadelta(second_hidden_activation_node.bProp_matrix, - first_hidden_activation_node.fProp_matrix, - 1.0/current_minibatch_size, - L2_reg, - conditioning_constant, - decay); - stop_timer(10); - } - //std::cerr<<"Finished gradient for second hidden linear layer"<<std::endl; - - // First hidden layer - - - start_timer(12); - first_hidden_linear_node.param->computeGradientAdadelta(first_hidden_activation_node.bProp_matrix, - input_layer_node.fProp_matrix, - 1.0/current_minibatch_size, - L2_reg, - conditioning_constant, - decay); - stop_timer(12); - - //std::cerr<<"Finished gradient for first hidden linear layer"<<std::endl; - // Input word embeddings - - start_timer(13); - input_layer_node.param->computeGradientAdadelta(first_hidden_linear_node.bProp_matrix, - data, - 1.0/current_minibatch_size, - L2_reg, - conditioning_constant, - decay); - stop_timer(13); - - //std::cerr<<"Finished gradient for first input layer"<<std::endl; - } else { - std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl; } - - } }; } // namespace nplm #endif - diff --git a/src/replace_digits.hpp b/src/replace_digits.hpp new file mode 100644 index 0000000..e8ac957 --- /dev/null +++ b/src/replace_digits.hpp @@ -0,0 +1,62 @@ +/** \file \author Jonathan Graehl <graehl@gmail.com> + + replace 0-9 ascii chars with another ascii replacement + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. 
+*/ + +#ifndef REPLACEDIGITS_GRAEHL_2015_06_25_H +#define REPLACEDIGITS_GRAEHL_2015_06_25_H +#pragma once + +#include <string> +#include <utility> + +namespace graehl { + +inline bool ascii_digit(char c) { + return c >= '0' && c <= '9'; +} + +struct replace_digits { + char map_digits; + replace_digits(char map_digits = '@') : map_digits(map_digits) {} + + /// \return whether anything was replaced + bool replaced(char* i, char* end) const { + for (; i != end; ++i) + if (ascii_digit(*i)) { + *i = map_digits; + while (++i != end) + if (ascii_digit(*i)) *i = map_digits; + return true; + } + return false; + } + /// maybe: only if non-0 map_digits, do the thing + bool maybe_replaced(char* i, char* end) const { return map_digits && replaced(i, end); } + + void replace(char* i, char* end) const { + for (; i != end; ++i) + if (ascii_digit(*i)) *i = map_digits; + } + void maybe_replace(char* i, char* end) const { + if (map_digits) replace(i, end); + } + + void replace(std::string& str, std::string::size_type i = 0) const { + std::string::size_type n = str.size(); + char* d = (char *)str.data(); // although only C++11 officially allows this, in reality everyone does + replace(d + i, d + n); + } + void maybe_replace(std::string& str, std::string::size_type i = 0) const { + if (map_digits) replace(str, i); + } +}; + + +} + +#endif diff --git a/src/testNeuralLM.cpp b/src/testNeuralLM.cpp index 4f3713d..abaab34 100644 --- a/src/testNeuralLM.cpp +++ b/src/testNeuralLM.cpp @@ -6,7 +6,6 @@ #include <tclap/CmdLine.h> #include <Eigen/Core> -//#include <../3rdparty/Eigen/Dense> #include <Eigen/Dense> #include "param.h" @@ -21,174 +20,174 @@ using namespace Eigen; using namespace nplm; void score(neuralLM &lm, int minibatch_size, vector<int>& start, vector< vector<int> > &ngrams, - vector<double> &out) { - if (ngrams.size() == 0) return; - int ngram_size = ngrams[0].size(); - - if (minibatch_size == 0) + vector<double> &out) { + if (ngrams.size() == 0) return; + int ngram_size = ngrams[0].size(); + + if (minibatch_size == 0) + { + // Score one n-gram at a time. This is how the LM would be queried from a decoder. + for (int sent_id=0; sent_id<start.size()-1; sent_id++) { - // Score one n-gram at a time. This is how the LM would be queried from a decoder. - for (int sent_id=0; sent_id<start.size()-1; sent_id++) - { - double sent_log_prob = 0.0; - for (int j=start[sent_id]; j<start[sent_id+1]; j++) - sent_log_prob += lm.lookup_ngram(ngrams[j]); - out.push_back(sent_log_prob); - } + double sent_log_prob = 0.0; + for (int j=start[sent_id]; j<start[sent_id+1]; j++) + sent_log_prob += lm.lookup_ngram(ngrams[j]); + out.push_back(sent_log_prob); } - else + } + else + { + // Score a whole minibatch at a time. + Matrix<double,1,Dynamic> log_probs(ngrams.size()); + + Matrix<int,Dynamic,Dynamic> minibatch(ngram_size, minibatch_size); + minibatch.setZero(); + for (int test_id = 0; test_id < ngrams.size(); test_id += minibatch_size) { - // Score a whole minibatch at a time. - Matrix<double,1,Dynamic> log_probs(ngrams.size()); - - Matrix<int,Dynamic,Dynamic> minibatch(ngram_size, minibatch_size); - minibatch.setZero(); - for (int test_id = 0; test_id < ngrams.size(); test_id += minibatch_size) - { - int current_minibatch_size = minibatch_size<ngrams.size()-test_id ? 
minibatch_size : ngrams.size()-test_id; - for (int j=0; j<current_minibatch_size; j++) - minibatch.col(j) = Map< Matrix<int,Dynamic,1> > (ngrams[test_id+j].data(), ngram_size); - lm.lookup_ngram(minibatch.leftCols(current_minibatch_size), log_probs.middleCols(test_id, current_minibatch_size)); - } - - for (int sent_id=0; sent_id<start.size()-1; sent_id++) - { - double sent_log_prob = 0.0; - for (int j=start[sent_id]; j<start[sent_id+1]; j++) - sent_log_prob += log_probs[j]; - out.push_back(sent_log_prob); - } + int current_minibatch_size = minibatch_size<ngrams.size()-test_id ? minibatch_size : ngrams.size()-test_id; + for (int j=0; j<current_minibatch_size; j++) + minibatch.col(j) = Map< Matrix<int,Dynamic,1> > (ngrams[test_id+j].data(), ngram_size); + lm.lookup_ngram(minibatch.leftCols(current_minibatch_size), log_probs.middleCols(test_id, current_minibatch_size)); } + + for (int sent_id=0; sent_id<start.size()-1; sent_id++) + { + double sent_log_prob = 0.0; + for (int j=start[sent_id]; j<start[sent_id+1]; j++) + sent_log_prob += log_probs[j]; + out.push_back(sent_log_prob); + } + } } -int main (int argc, char *argv[]) +int main (int argc, char *argv[]) { - param myParam; - bool normalization; - bool numberize, ngramize, add_start_stop; + param myParam; + bool normalization; + bool numberize, ngramize, add_start_stop; - try { - // program options // - CmdLine cmd("Tests a two-layer neural probabilistic language model.", ' ' , "0.1"); + try { + // program options // + CmdLine cmd("Tests a two-layer neural probabilistic language model.", ' ' , "0.1"); - ValueArg<int> num_threads("", "num_threads", "Number of threads. Default: maximum.", false, 0, "int", cmd); - ValueArg<int> minibatch_size("", "minibatch_size", "Minibatch size. Default: none.", false, 0, "int", cmd); + ValueArg<int> num_threads("", "num_threads", "Number of threads. Default: maximum.", false, 0, "int", cmd); + ValueArg<int> minibatch_size("", "minibatch_size", "Minibatch size. Default: none.", false, 0, "int", cmd); - ValueArg<bool> arg_ngramize("", "ngramize", "If true, convert lines to ngrams. Default: true.", false, true, "bool", cmd); - ValueArg<bool> arg_numberize("", "numberize", "If true, convert words to numbers. Default: true.", false, true, "bool", cmd); - ValueArg<bool> arg_add_start_stop("", "add_start_stop", "If true, prepend <s> and append </s>. Default: true.", false, true, "bool", cmd); + ValueArg<bool> arg_ngramize("", "ngramize", "If true, convert lines to ngrams. Default: true.", false, true, "bool", cmd); + ValueArg<bool> arg_numberize("", "numberize", "If true, convert words to numbers. Default: true.", false, true, "bool", cmd); + ValueArg<bool> arg_add_start_stop("", "add_start_stop", "If true, prepend <s> and append </s>. Default: true.", false, true, "bool", cmd); - ValueArg<bool> arg_normalization("", "normalization", "Normalize probabilities. 1 = yes, 0 = no. Default: 0.", false, 0, "bool", cmd); + ValueArg<bool> arg_normalization("", "normalization", "Normalize probabilities. 1 = yes, 0 = no. 
Default: 0.", false, 0, "bool", cmd); - ValueArg<string> arg_test_file("", "test_file", "Test file (one tokenized sentence per line).", true, "", "string", cmd); + ValueArg<string> arg_test_file("", "test_file", "Test file (one tokenized sentence per line).", true, "", "string", cmd); - ValueArg<string> arg_model_file("", "model_file", "Language model file.", true, "", "string", cmd); + ValueArg<string> arg_model_file("", "model_file", "Language model file.", true, "", "string", cmd); - cmd.parse(argc, argv); + cmd.parse(argc, argv); - myParam.model_file = arg_model_file.getValue(); - myParam.test_file = arg_test_file.getValue(); + myParam.model_file = arg_model_file.getValue(); + myParam.test_file = arg_test_file.getValue(); - normalization = arg_normalization.getValue(); - numberize = arg_numberize.getValue(); - ngramize = arg_ngramize.getValue(); - add_start_stop = arg_add_start_stop.getValue(); + normalization = arg_normalization.getValue(); + numberize = arg_numberize.getValue(); + ngramize = arg_ngramize.getValue(); + add_start_stop = arg_add_start_stop.getValue(); - myParam.minibatch_size = minibatch_size.getValue(); - myParam.num_threads = num_threads.getValue(); + myParam.minibatch_size = minibatch_size.getValue(); + myParam.num_threads = num_threads.getValue(); - cerr << "Command line: " << endl; - cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl; - - const string sep(" Value: "); - cerr << arg_test_file.getDescription() << sep << arg_test_file.getValue() << endl; - cerr << arg_model_file.getDescription() << sep << arg_model_file.getValue() << endl; + cerr << "Command line: " << endl; + cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl; - cerr << arg_normalization.getDescription() << sep << arg_normalization.getValue() << endl; - cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl; - cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl; + const string sep(" Value: "); + cerr << arg_test_file.getDescription() << sep << arg_test_file.getValue() << endl; + cerr << arg_model_file.getDescription() << sep << arg_model_file.getValue() << endl; - cerr << minibatch_size.getDescription() << sep << minibatch_size.getValue() << endl; - cerr << num_threads.getDescription() << sep << num_threads.getValue() << endl; - } - catch (TCLAP::ArgException &e) - { - cerr << "error: " << e.error() << " for arg " << e.argId() << endl; - exit(1); - } + cerr << arg_normalization.getDescription() << sep << arg_normalization.getValue() << endl; + cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl; + cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl; - myParam.num_threads = setup_threads(myParam.num_threads); + cerr << minibatch_size.getDescription() << sep << minibatch_size.getValue() << endl; + cerr << num_threads.getDescription() << sep << num_threads.getValue() << endl; + } + catch (TCLAP::ArgException &e) + { + cerr << "error: " << e.error() << " for arg " << e.argId() << endl; + exit(1); + } - ///// Create language model + myParam.num_threads = setup_threads(myParam.num_threads); - neuralLM lm; - lm.read(myParam.model_file); - lm.set_normalization(normalization); - lm.set_log_base(10); - lm.set_cache(1048576); - int ngram_size = lm.get_order(); - int minibatch_size = myParam.minibatch_size; - if (minibatch_size) - lm.set_width(minibatch_size); + ///// Create language model - ///// Read test data - - ifstream 
test_file(myParam.test_file.c_str()); - if (!test_file) - { - cerr << "error: could not open " << myParam.test_file << endl; - exit(1); - } - string line; + neuralLM lm; + lm.read(myParam.model_file); + lm.set_normalization(normalization); + lm.set_log_base(10); + lm.set_cache(1048576); + int ngram_size = lm.get_order(); + int minibatch_size = myParam.minibatch_size; + if (minibatch_size) + lm.set_width(minibatch_size); - vector<int> start; - vector<vector<int> > ngrams; + ///// Read test data - while (getline(test_file, line)) - { - vector<string> words; - splitBySpace(line, words); + ifstream test_file(myParam.test_file.c_str()); + if (!test_file) + { + cerr << "error: could not open " << myParam.test_file << endl; + exit(1); + } + string line; - vector<vector<int> > sent_ngrams; - preprocessWords(words, sent_ngrams, ngram_size, lm.get_vocabulary(), numberize, add_start_stop, ngramize); + vector<int> start; + vector<vector<int> > ngrams; - start.push_back(ngrams.size()); - copy(sent_ngrams.begin(), sent_ngrams.end(), back_inserter(ngrams)); - } - start.push_back(ngrams.size()); + while (getline(test_file, line)) + { + vector<string> words; + splitBySpace(line, words); - int num_threads = 1; - vector< vector<double> > sent_log_probs(num_threads); + vector<vector<int> > sent_ngrams; + preprocessWords(words, sent_ngrams, ngram_size, lm.get_vocabulary(), numberize, add_start_stop, ngramize); - /* - // Test thread safety - boost::thread_group tg; - for (int t=0; t < num_threads; t++) { - tg.create_thread(boost::bind(score, lm, minibatch_size, boost::ref(start), boost::ref(ngrams), boost::ref(sent_log_probs[t]))); // copy lm - } - tg.join_all(); - */ - score(lm, minibatch_size, start, ngrams, sent_log_probs[0]); - - vector<double> log_likelihood(num_threads); - std::fill(log_likelihood.begin(), log_likelihood.end(), 0.0); - for (int i=0; i<sent_log_probs[0].size(); i++) { - for (int t=0; t<num_threads; t++) - cout << sent_log_probs[t][i] << "\t"; - cout << endl; - for (int t=0; t<num_threads; t++) - log_likelihood[t] += sent_log_probs[t][i]; - } - - cerr << "Test log10-likelihood: "; + start.push_back(ngrams.size()); + copy(sent_ngrams.begin(), sent_ngrams.end(), back_inserter(ngrams)); + } + start.push_back(ngrams.size()); + + int num_threads = 1; + vector< vector<double> > sent_log_probs(num_threads); + + /* + // Test thread safety + boost::thread_group tg; + for (int t=0; t < num_threads; t++) { + tg.create_thread(boost::bind(score, lm, minibatch_size, boost::ref(start), boost::ref(ngrams), boost::ref(sent_log_probs[t]))); // copy lm + } + tg.join_all(); + */ + score(lm, minibatch_size, start, ngrams, sent_log_probs[0]); + + vector<double> log_likelihood(num_threads); + std::fill(log_likelihood.begin(), log_likelihood.end(), 0.0); + for (int i=0; i<sent_log_probs[0].size(); i++) { for (int t=0; t<num_threads; t++) - cerr << log_likelihood[t] << " "; - cerr << endl; - #ifdef USE_CHRONO - cerr << "Propagation times:"; - for (int i=0; i<timer.size(); i++) - cerr << " " << timer.get(i); - cerr << endl; - #endif - + cout << sent_log_probs[t][i] << "\t"; + cout << endl; + for (int t=0; t<num_threads; t++) + log_likelihood[t] += sent_log_probs[t][i]; + } + + cerr << "Test log10-likelihood: "; + for (int t=0; t<num_threads; t++) + cerr << log_likelihood[t] << " "; + cerr << endl; +#ifdef USE_CHRONO + cerr << "Propagation times:"; + for (int i=0; i<timer.size(); i++) + cerr << " " << timer.get(i); + cerr << endl; +#endif + } diff --git a/src/trainNeuralNetwork.cpp b/src/trainNeuralNetwork.cpp index 
97af03b..d4720ef 100644 --- a/src/trainNeuralNetwork.cpp +++ b/src/trainNeuralNetwork.cpp @@ -6,17 +6,16 @@ #include <vector> #include <algorithm> -#include <boost/unordered_map.hpp> +#include <boost/unordered_map.hpp> #include <boost/functional.hpp> #include <boost/lexical_cast.hpp> #include <boost/random/mersenne_twister.hpp> #include <boost/algorithm/string/join.hpp> -# include <boost/interprocess/managed_shared_memory.hpp> -# include <boost/interprocess/allocators/allocator.hpp> -# include <boost/interprocess/managed_mapped_file.hpp> +#include <boost/interprocess/managed_shared_memory.hpp> +#include <boost/interprocess/allocators/allocator.hpp> +#include <boost/interprocess/managed_mapped_file.hpp> #include <boost/interprocess/containers/vector.hpp> -//#include <../3rdparty/Eigen/Dense> #include <Eigen/Dense> #include <Eigen/Sparse> #include "maybe_omp.h" @@ -29,7 +28,6 @@ #include "graphClasses.h" #include "util.h" #include "multinomial.h" -//#include "gradientCheck.h" //#define EIGEN_DONT_PARALLELIZE @@ -65,7 +63,7 @@ void compute_validation_perplexity(int ngram_size, int output_vocab_size, int va int validation_minibatch_start_index = validation_minibatch_size * validation_batch; int current_minibatch_size = min(validation_minibatch_size, validation_data_size - validation_minibatch_start_index); - minibatch.leftCols(current_minibatch_size) = validation_data.middleCols(validation_minibatch_start_index, + minibatch.leftCols(current_minibatch_size) = validation_data.middleCols(validation_minibatch_start_index, current_minibatch_size); prop_validation.fProp(minibatch.topRows(ngram_size-1)); @@ -80,7 +78,7 @@ void compute_validation_perplexity(int ngram_size, int output_vocab_size, int va // And softmax and loss. Be careful of short minibatch double minibatch_log_likelihood; start_timer(5); - SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size), + SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size), minibatch.row(ngram_size-1), output_probs, minibatch_log_likelihood); @@ -93,7 +91,7 @@ void compute_validation_perplexity(int ngram_size, int output_vocab_size, int va // If the validation perplexity decreases, halve the learning rate. 
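// For reference, a compact sketch of the schedule implemented around here (hypothetical helper
// names; needs <cmath>): perplexity is recovered from the summed log-likelihood as exp(-LL/N),
// and the learning rate is halved whenever the validation likelihood stops improving. Note that
// the guard below fires when the new log-likelihood is lower than the previous one, i.e. when
// perplexity goes up, and it is skipped for the "ADA" update, which adapts its own step sizes.
inline double perplexity_from_ll(double log_likelihood, double num_predictions)
{
    return std::exp(-log_likelihood / num_predictions);
}
inline void decay_on_no_improvement(double new_ll, double &previous_ll, double &learning_rate)
{
    if (previous_ll != 0.0 && new_ll < previous_ll)
        learning_rate /= 2;     // validation got worse: take smaller steps from now on
    previous_ll = new_ll;       // always remember the most recent validation score
}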
if (current_validation_ll != 0.0 && log_likelihood < current_validation_ll && myParam.parameter_update != "ADA") - { + { current_learning_rate /= 2; } current_validation_ll = log_likelihood; @@ -101,7 +99,7 @@ void compute_validation_perplexity(int ngram_size, int output_vocab_size, int va int main(int argc, char** argv) -{ +{ ios::sync_with_stdio(false); bool use_mmap_file, randomize; param myParam; @@ -183,7 +181,7 @@ int main(int argc, char** argv) myParam.input_words_file = input_words_file.getValue(); myParam.output_words_file = output_words_file.getValue(); if (words_file.getValue() != "") - myParam.input_words_file = myParam.output_words_file = words_file.getValue(); + myParam.input_words_file = myParam.output_words_file = words_file.getValue(); myParam.model_prefix = model_prefix.getValue(); @@ -192,7 +190,7 @@ int main(int argc, char** argv) myParam.input_vocab_size = input_vocab_size.getValue(); myParam.output_vocab_size = output_vocab_size.getValue(); if (vocab_size.getValue() > 0) { - myParam.input_vocab_size = myParam.output_vocab_size = vocab_size.getValue(); + myParam.input_vocab_size = myParam.output_vocab_size = vocab_size.getValue(); } myParam.num_hidden = num_hidden.getValue(); myParam.activation_function = activation_function.getValue(); @@ -205,7 +203,7 @@ int main(int argc, char** argv) myParam.input_embedding_dimension = input_embedding_dimension.getValue(); myParam.output_embedding_dimension = output_embedding_dimension.getValue(); if (embedding_dimension.getValue() >= 0) { - myParam.input_embedding_dimension = myParam.output_embedding_dimension = embedding_dimension.getValue(); + myParam.input_embedding_dimension = myParam.output_embedding_dimension = embedding_dimension.getValue(); } myParam.minibatch_size = minibatch_size.getValue(); @@ -243,33 +241,33 @@ int main(int argc, char** argv) if (embedding_dimension.getValue() >= 0) { - cerr << embedding_dimension.getDescription() << sep << embedding_dimension.getValue() << endl; + cerr << embedding_dimension.getDescription() << sep << embedding_dimension.getValue() << endl; } else { - cerr << input_embedding_dimension.getDescription() << sep << input_embedding_dimension.getValue() << endl; - cerr << output_embedding_dimension.getDescription() << sep << output_embedding_dimension.getValue() << endl; + cerr << input_embedding_dimension.getDescription() << sep << input_embedding_dimension.getValue() << endl; + cerr << output_embedding_dimension.getDescription() << sep << output_embedding_dimension.getValue() << endl; } cerr << share_embeddings.getDescription() << sep << share_embeddings.getValue() << endl; if (share_embeddings.getValue() && input_embedding_dimension.getValue() != output_embedding_dimension.getValue()) { - cerr << "error: sharing input and output embeddings requires that input and output embeddings have same dimension" << endl; - exit(1); + cerr << "error: sharing input and output embeddings requires that input and output embeddings have same dimension" << endl; + exit(1); } cerr << num_hidden.getDescription() << sep << num_hidden.getValue() << endl; if (string_to_activation_function(activation_function.getValue()) == InvalidFunction) { - cerr << "error: invalid activation function: " << activation_function.getValue() << endl; - exit(1); + cerr << "error: invalid activation function: " << activation_function.getValue() << endl; + exit(1); } cerr << activation_function.getDescription() << sep << activation_function.getValue() << endl; if (string_to_loss_function(loss_function.getValue()) == InvalidLoss) { 
- cerr << "error: invalid loss function: " << loss_function.getValue() << endl; - exit(1); + cerr << "error: invalid loss function: " << loss_function.getValue() << endl; + exit(1); } cerr << loss_function.getDescription() << sep << loss_function.getValue() << endl; @@ -279,7 +277,7 @@ int main(int argc, char** argv) cerr << num_epochs.getDescription() << sep << num_epochs.getValue() << endl; cerr << minibatch_size.getDescription() << sep << minibatch_size.getValue() << endl; if (myParam.validation_file != "") { - cerr << validation_minibatch_size.getDescription() << sep << validation_minibatch_size.getValue() << endl; + cerr << validation_minibatch_size.getDescription() << sep << validation_minibatch_size.getValue() << endl; } cerr << learning_rate.getDescription() << sep << learning_rate.getValue() << endl; cerr << L2_reg.getDescription() << sep << L2_reg.getValue() << endl; @@ -288,7 +286,7 @@ int main(int argc, char** argv) cerr << normalization.getDescription() << sep << normalization.getValue() << endl; if (myParam.normalization){ - cerr << normalization_init.getDescription() << sep << normalization_init.getValue() << endl; + cerr << normalization_init.getDescription() << sep << normalization_init.getValue() << endl; } cerr << use_momentum.getDescription() << sep << use_momentum.getValue() << endl; @@ -302,7 +300,7 @@ int main(int argc, char** argv) if (unigram_probs_file.getValue() != "") { - cerr << "Note: --unigram_probs_file is deprecated and ignored." << endl; + cerr << "Note: --unigram_probs_file is deprecated and ignored." << endl; } } catch (TCLAP::ArgException &e) @@ -316,7 +314,7 @@ int main(int argc, char** argv) //unsigned seed = std::time(0); unsigned seed = 1234; //for testing only - mt19937 rng(seed); + boost::random::mt19937 rng(seed); /////////////////////////READING IN THE TRAINING AND VALIDATION DATA/////////////////// ///////////////////////////////////////////////////////////////////////////////////// @@ -337,7 +335,7 @@ int main(int argc, char** argv) training_data_flat_mmap = mmap_file.find<vec>("vector").first; cerr<<"Size of mmaped vector is "<<training_data_flat_mmap->size()<<endl; training_data_size = training_data_flat_mmap->size()/myParam.ngram_size; - //randomly shuffle the data for better learning. The shuffling will + //randomly shuffle the data for better learning. 
The shuffling will //be different for a standard stl vector // Randomly shuffle training data to improve learning if (randomize == true) { @@ -413,10 +411,10 @@ int main(int argc, char** argv) //cerr<<"Num tokens "<<num_tokens<<endl; //data_size_t training_data_size = num_tokens / myParam.ngram_size; cerr << "Number of training instances: "<< training_data_size << endl; - + Matrix<int,Dynamic,Dynamic> training_data; //(training_data_flat.data(), myParam.ngram_size, training_data_size); - + #ifdef MAP cerr<<"Setting up eigen map"<<endl; if (use_mmap_file == false) { @@ -425,11 +423,11 @@ int main(int argc, char** argv) training_data = Map< Matrix<int,Dynamic,Dynamic> >(training_data_flat_mmap->data().get(), myParam.ngram_size, training_data_size); } cerr<<"Created eigen map"<<endl; - #else + #else if (use_mmap_file == false) { training_data = Map< Matrix<int,Dynamic,Dynamic> >(training_data_flat.data(), myParam.ngram_size, training_data_size); } - #endif + #endif // If neither --input_vocab_size nor --input_words_file is given, set input_vocab_size to the maximum word index if (myParam.input_vocab_size == 0 and myParam.input_words_file == "") { @@ -454,7 +452,7 @@ int main(int argc, char** argv) // Read validation data vector<int> validation_data_flat; int validation_data_size = 0; - + if (myParam.validation_file != "") { readDataFile(myParam.validation_file, myParam.ngram_size, validation_data_flat); @@ -470,16 +468,16 @@ int main(int argc, char** argv) if (myParam.input_words_file != "") { readWordsFile(myParam.input_words_file, input_words); - if (myParam.input_vocab_size == 0) - myParam.input_vocab_size = input_words.size(); + if (myParam.input_vocab_size == 0) + myParam.input_vocab_size = input_words.size(); } vector<string> output_words; if (myParam.output_words_file != "") { readWordsFile(myParam.output_words_file, output_words); - if (myParam.output_vocab_size == 0) - myParam.output_vocab_size = output_words.size(); + if (myParam.output_vocab_size == 0) + myParam.output_vocab_size = output_words.size(); } ///// Construct unigram model and sampler that will be used for NCE @@ -491,17 +489,17 @@ int main(int argc, char** argv) if (use_mmap_file == false) { output_word = training_data(myParam.ngram_size-1, train_id); } else { - //cerr<<"mmap word is "<<training_data_flat_mmap->at((train_id+1)*myParam.ngram_size - 1)<<endl; + //cerr<<"mmap word is "<<training_data_flat_mmap->at((train_id+1)*myParam.ngram_size - 1)<<endl; output_word = training_data_flat_mmap->at((train_id+1)*myParam.ngram_size - 1); } - //cerr<<"output word is "<<output_word<<endl; - unigram_counts[output_word] += 1; + //cerr<<"output word is "<<output_word<<endl; + unigram_counts[output_word] += 1; } multinomial<data_size_t> unigram (unigram_counts); ///// Create and initialize the neural network and associated propagators. 
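// The unigram_counts accumulated just above define the NCE noise distribution: noise words are
// drawn with probability proportional to their frequency among the training targets. A rough
// sketch of such a sampler (the real one is multinomial<data_size_t> from multinomial.h and may
// be implemented quite differently; needs <vector>, <algorithm> and the boost.random headers):
struct simple_unigram_sampler
{
    std::vector<double> cdf;   // running totals of the raw counts
    double total;
    explicit simple_unigram_sampler(const std::vector<double> &counts) : total(0.0)
    {
        for (std::size_t i = 0; i < counts.size(); ++i) {
            total += counts[i];
            cdf.push_back(total);
        }
    }
    int sample(boost::random::mt19937 &rng) const
    {
        boost::random::uniform_real_distribution<> unif(0.0, total);
        std::size_t i = std::lower_bound(cdf.begin(), cdf.end(), unif(rng)) - cdf.begin();
        if (i >= cdf.size()) i = cdf.size() - 1;   // guard against floating-point edge cases
        return static_cast<int>(i);
    }
};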
model nn; - // IF THE MODEL FILE HAS BEEN DEFINED, THEN + // IF THE MODEL FILE HAS BEEN DEFINED, THEN // LOAD THE NEURAL NETWORK MODEL if (myParam.model_file != ""){ nn.read(myParam.model_file); @@ -529,7 +527,7 @@ int main(int argc, char** argv) SoftmaxNCELoss<multinomial<data_size_t> > softmax_loss(unigram); // normalization parameters vector_map c_h, c_h_running_gradient; - + ///////////////////////TRAINING THE NEURAL NETWORK//////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////// @@ -540,8 +538,8 @@ int main(int argc, char** argv) if (validation_data_size > 0) { num_validation_batches = (validation_data_size-1)/myParam.validation_minibatch_size+1; - cerr<<"Number of validation minibatches: "<<num_validation_batches<<endl; - } + cerr<<"Number of validation minibatches: "<<num_validation_batches<<endl; + } double current_momentum = myParam.initial_momentum; double momentum_delta = (myParam.final_momentum - myParam.initial_momentum)/(myParam.num_epochs-1); @@ -568,36 +566,36 @@ int main(int argc, char** argv) } for (int epoch=0; epoch<myParam.num_epochs; epoch++) - { + { cerr << "Epoch " << epoch+1 << endl; cerr << "Current learning rate: " << current_learning_rate << endl; - if (myParam.use_momentum) - cerr << "Current momentum: " << current_momentum << endl; - else + if (myParam.use_momentum) + cerr << "Current momentum: " << current_momentum << endl; + else current_momentum = -1; - cerr << "Training minibatches: "; + cerr << "Training minibatches: "; - double log_likelihood = 0.0; + double log_likelihood = 0.0; - int num_samples = 0; - if (loss_function == LogLoss) - num_samples = output_vocab_size; - else if (loss_function == NCELoss) - num_samples = 1+num_noise_samples; + int num_samples = 0; + if (loss_function == LogLoss) + num_samples = output_vocab_size; + else if (loss_function == NCELoss) + num_samples = 1+num_noise_samples; - Matrix<double,Dynamic,Dynamic> minibatch_weights(num_samples, minibatch_size); - Matrix<int,Dynamic,Dynamic> minibatch_samples(num_samples, minibatch_size); - Matrix<double,Dynamic,Dynamic> scores(num_samples, minibatch_size); - Matrix<double,Dynamic,Dynamic> probs(num_samples, minibatch_size); + Matrix<double,Dynamic,Dynamic> minibatch_weights(num_samples, minibatch_size); + Matrix<int,Dynamic,Dynamic> minibatch_samples(num_samples, minibatch_size); + Matrix<double,Dynamic,Dynamic> scores(num_samples, minibatch_size); + Matrix<double,Dynamic,Dynamic> probs(num_samples, minibatch_size); for(data_size_t batch=0;batch<num_batches;batch++) { if (batch > 0 && batch % 10000 == 0) { - cerr << batch <<"..."; - } + cerr << batch <<"..."; + } if (batch > 0 && batch % 500000 == 0) { @@ -605,31 +603,31 @@ int main(int argc, char** argv) compute_validation_perplexity(ngram_size, output_vocab_size, validation_minibatch_size, validation_data_size, num_validation_batches, myParam, prop_validation, validation_data, current_learning_rate, current_validation_ll); cerr << "Current learning rate: " << current_learning_rate << endl; } - + data_size_t minibatch_start_index = minibatch_size * batch; int current_minibatch_size = min(static_cast<data_size_t>(minibatch_size), training_data_size - minibatch_start_index); #ifdef MAP - Matrix<int,Dynamic,Dynamic> minibatch = training_data.middleCols(minibatch_start_index, current_minibatch_size); - #else + Matrix<int,Dynamic,Dynamic> minibatch = training_data.middleCols(minibatch_start_index, current_minibatch_size); + #else //ALTERNATIVE OPTION IF YOU'RE NOT USING 
eigen map interface on the mmapped file - Matrix<int,Dynamic,Dynamic> minibatch;// = training_data.middleCols(minibatch_start_index, current_minibatch_size); - //cerr<<"Minibatch start index "<<minibatch_start_index<<endl; - //cerr<<"Minibatch size "<<current_minibatch_size<<endl; + Matrix<int,Dynamic,Dynamic> minibatch;// = training_data.middleCols(minibatch_start_index, current_minibatch_size); + //cerr<<"Minibatch start index "<<minibatch_start_index<<endl; + //cerr<<"Minibatch size "<<current_minibatch_size<<endl; if (use_mmap_file == true) { minibatch.setZero(ngram_size,current_minibatch_size); //now reading the ngrams from the mmaped file for (int k=0; k<ngram_size; k++){ for (data_size_t index = 0 ; index<current_minibatch_size; index++) { - data_size_t current_index = index + minibatch_start_index; - //cerr<<"the value in the mmap file "<<index<<" "<<k<<" is "<<training_data_flat_mmap->at(current_index*ngram_size+k)<<endl; + data_size_t current_index = index + minibatch_start_index; + //cerr<<"the value in the mmap file "<<index<<" "<<k<<" is "<<training_data_flat_mmap->at(current_index*ngram_size+k)<<endl; minibatch(k,index) = training_data_flat_mmap->at(current_index*ngram_size+k); } } } else { minibatch = training_data.middleCols(minibatch_start_index, current_minibatch_size); } - #endif + #endif double adjusted_learning_rate = current_learning_rate/minibatch_size; //cerr<<"Adjusted learning rate: "<<adjusted_learning_rate<<endl; @@ -648,20 +646,20 @@ int main(int argc, char** argv) prop.fProp(minibatch.topRows(ngram_size-1)); - if (loss_function == NCELoss) - { - ///// Noise-contrastive estimation + if (loss_function == NCELoss) + { + ///// Noise-contrastive estimation - // Generate noise samples. Gather positive and negative samples into matrix. + // Generate noise samples. Gather positive and negative samples into matrix. 
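// For orientation: the samples gathered below feed SoftmaxNCELoss, which presumably optimises
// the standard noise-contrastive estimation objective. Treating the model score s as an
// unnormalised log-probability and q as the unigram noise distribution with k samples, each
// candidate word is classified as data vs. noise. A scalar sketch (hypothetical helper names;
// needs <cmath>):
inline double nce_log_prob_data(double score, double k, double noise_prob)
{
    double p_model = std::exp(score);
    return std::log(p_model / (p_model + k * noise_prob));        // log P(data | word, history)
}
inline double nce_log_prob_noise(double score, double k, double noise_prob)
{
    double p_model = std::exp(score);
    return std::log(k * noise_prob / (p_model + k * noise_prob)); // log P(noise | word, history)
}
// The minibatch objective then sums nce_log_prob_data over the true targets (row 0 of
// minibatch_samples, filled in just below) and nce_log_prob_noise over the sampled rows
// 1..num_noise_samples.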
- start_timer(3); + start_timer(3); minibatch_samples.block(0, 0, 1, current_minibatch_size) = minibatch.bottomRows(1); - + for (int sample_id = 1; sample_id < num_noise_samples+1; sample_id++) for (int train_id = 0; train_id < current_minibatch_size; train_id++) minibatch_samples(sample_id, train_id) = unigram.sample(rng); - + stop_timer(3); // Final forward propagation step (sparse) @@ -686,7 +684,7 @@ int main(int argc, char** argv) double minibatch_log_likelihood; start_timer(5); - softmax_loss.fProp(scores.leftCols(current_minibatch_size), + softmax_loss.fProp(scores.leftCols(current_minibatch_size), minibatch_samples, probs, minibatch_log_likelihood); stop_timer(5); @@ -697,9 +695,9 @@ int main(int argc, char** argv) start_timer(6); softmax_loss.bProp(probs, minibatch_weights); stop_timer(6); - + // Update the normalization parameters - + if (myParam.normalization) { for (int train_id = 0;train_id < current_minibatch_size;train_id++) @@ -711,19 +709,19 @@ int main(int argc, char** argv) // Be careful of short minibatch prop.bProp(minibatch.topRows(ngram_size-1), - minibatch_samples.leftCols(current_minibatch_size), + minibatch_samples.leftCols(current_minibatch_size), minibatch_weights.leftCols(current_minibatch_size), - adjusted_learning_rate, + adjusted_learning_rate, current_momentum, myParam.L2_reg, myParam.parameter_update, myParam.conditioning_constant, myParam.decay); - } - else if (loss_function == LogLoss) - { - ///// Standard log-likelihood - start_timer(4); + } + else if (loss_function == LogLoss) + { + ///// Standard log-likelihood + start_timer(4); if (prop.skip_hidden) prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores); else @@ -732,21 +730,21 @@ int main(int argc, char** argv) double minibatch_log_likelihood; start_timer(5); - SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size), - minibatch.row(ngram_size-1), - probs, + SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size), + minibatch.row(ngram_size-1), + probs, minibatch_log_likelihood); stop_timer(5); log_likelihood += minibatch_log_likelihood; ///// Backward propagation - + start_timer(6); - SoftmaxLogLoss().bProp(minibatch.row(ngram_size-1).leftCols(current_minibatch_size), - probs.leftCols(current_minibatch_size), + SoftmaxLogLoss().bProp(minibatch.row(ngram_size-1).leftCols(current_minibatch_size), + probs.leftCols(current_minibatch_size), minibatch_weights); stop_timer(6); - + prop.bProp(minibatch.topRows(ngram_size-1).leftCols(current_minibatch_size), minibatch_weights, adjusted_learning_rate, @@ -757,33 +755,33 @@ int main(int argc, char** argv) myParam.decay); } } - cerr << "done." << endl; + cerr << "done." << endl; - if (loss_function == LogLoss) - { - cerr << "Training log-likelihood: " << log_likelihood << endl; + if (loss_function == LogLoss) + { + cerr << "Training log-likelihood: " << log_likelihood << endl; cerr << " perplexity: "<< exp(-log_likelihood/training_data_size) << endl; - } - else if (loss_function == NCELoss) - cerr << "Training NCE log-likelihood: " << log_likelihood << endl; + } + else if (loss_function == NCELoss) + cerr << "Training NCE log-likelihood: " << log_likelihood << endl; current_momentum += momentum_delta; - #ifdef USE_CHRONO - cerr << "Propagation times:"; - for (int i=0; i<timer.size(); i++) - cerr << " " << timer.get(i); - cerr << endl; - #endif - - if (myParam.model_prefix != "") - { - cerr << "Writing model" << endl; - if (myParam.input_words_file != "") - nn.write(myParam.model_prefix + "." 
+ lexical_cast<string>(epoch+1), input_words, output_words); - else - nn.write(myParam.model_prefix + "." + lexical_cast<string>(epoch+1)); - } + #ifdef USE_CHRONO + cerr << "Propagation times:"; + for (int i=0; i<timer.size(); i++) + cerr << " " << timer.get(i); + cerr << endl; + #endif + + if (myParam.model_prefix != "") + { + cerr << "Writing model" << endl; + if (myParam.input_words_file != "") + nn.write(myParam.model_prefix + "." + lexical_cast<string>(epoch+1), input_words, output_words); + else + nn.write(myParam.model_prefix + "." + lexical_cast<string>(epoch+1)); + } if (epoch % 1 == 0 && validation_data_size > 0) { @@ -793,4 +791,3 @@ int main(int argc, char** argv) } return 0; } - diff --git a/src/types.hpp b/src/types.hpp deleted file mode 100644 index 08b010f..0000000 --- a/src/types.hpp +++ /dev/null @@ -1,41 +0,0 @@ -#ifndef TYPES_HPP -#define TYPES_HPP - -#include <cmath> -#include <string> -#include <vector> -#include <boost/cstdint.hpp> -#include <limits> - -namespace biglm{ - -typedef double weight_type; -const weight_type IMPOSSIBLE = -HUGE_VAL; - -typedef unsigned long block_type; -const size_t bits_per_block = (std::numeric_limits<block_type>::digits); - //typedef std::size_t size_type; -typedef boost::uint64_t size_type; -typedef unsigned char byte_type; - -template<typename T> -struct bytes { - static const byte_type *data(const T& key) { return reinterpret_cast<const byte_type *>(&key); } - static size_type size(const T& key) { return sizeof(T); } -}; - -template<> -struct bytes<std::string> { - static const byte_type *data(const std::string& key) { return reinterpret_cast<const byte_type *>(key.data()); } - static size_type size(const std::string& key) { return key.size(); } -}; - -template<typename U> -struct bytes<std::vector<U> > { - static const byte_type *data(const std::vector<U>& key) { return reinterpret_cast<const byte_type *>(&key[0]); } - static size_type size(const std::vector<U>& key) { return key.size() * sizeof(U); } -}; - -} //namespace nplm - -#endif @@ -15,7 +15,6 @@ #include <boost/chrono.hpp> #endif -//#include <../3rdparty/Eigen/Dense> #include <Eigen/Dense> #include "maybe_omp.h" @@ -23,15 +22,15 @@ // Make matrices hashable namespace Eigen { - template <typename Derived> - size_t hash_value(const DenseBase<Derived> &m) - { - size_t h=0; - for (int i=0; i<m.rows(); i++) - for (int j=0; j<m.cols(); j++) - boost::hash_combine(h, m(i,j)); - return h; - } +template <typename Derived> +size_t hash_value(const DenseBase<Derived> &m) +{ + size_t h=0; + for (int i=0; i<m.rows(); i++) + for (int j=0; j<m.cols(); j++) + boost::hash_combine(h, m(i,j)); + return h; +} } namespace nplm @@ -73,9 +72,9 @@ void readSentFile(const std::string &file, T &sentences) } inline void intgerize(std::vector<std::string> &ngram,std::vector<int> &int_ngram){ - int ngram_size = ngram.size(); - for (int i=0;i<ngram_size;i++) - int_ngram.push_back(boost::lexical_cast<int>(ngram[i])); + int ngram_size = ngram.size(); + for (int i=0;i<ngram_size;i++) + int_ngram.push_back(boost::lexical_cast<int>(ngram[i])); } // Functions that take non-const matrices as arguments @@ -85,194 +84,194 @@ inline void intgerize(std::vector<std::string> &ngram,std::vector<int> &int_ngra template <typename Derived> void initMatrix(boost::random::mt19937 &engine, - const Eigen::MatrixBase<Derived> &p_const, - bool init_normal, double range) + const Eigen::MatrixBase<Derived> &p_const, + bool init_normal, double range) { - UNCONST(Derived, p_const, p); - if (init_normal == 0) - // initialize with 
uniform distribution in [-range, range]
+  UNCONST(Derived, p_const, p);
+  if (init_normal == 0)
+  // initialize with uniform distribution in [-range, range]
+  {
+    boost::random::uniform_real_distribution<> unif_real(-range, range);
+    for (int i = 0; i < p.rows(); i++)
     {
-        boost::random::uniform_real_distribution<> unif_real(-range, range);
-        for (int i = 0; i < p.rows(); i++)
-        {
-            for (int j = 0; j< p.cols(); j++)
-            {
-                p(i,j) = unif_real(engine);
-            }
-        }
-
+      for (int j = 0; j< p.cols(); j++)
+      {
+        p(i,j) = unif_real(engine);
+      }
     }
-    else
-    // initialize with gaussian distribution with mean 0 and stdev range
+
+  }
+  else
+  // initialize with gaussian distribution with mean 0 and stdev range
+  {
+    boost::random::normal_distribution<double> unif_normal(0., range);
+    for (int i = 0; i < p.rows(); i++)
     {
-        boost::random::normal_distribution<double> unif_normal(0., range);
-        for (int i = 0; i < p.rows(); i++)
-        {
-            for (int j = 0; j < p.cols(); j++)
-            {
-                p(i,j) = unif_normal(engine);
-            }
-        }
+      for (int j = 0; j < p.cols(); j++)
+      {
+        p(i,j) = unif_normal(engine);
+      }
     }
+  }
 }
 
 template <typename Derived>
 void initBias(boost::random::mt19937 &engine,
-        const Eigen::MatrixBase<Derived> &p_const,
-        bool init_normal, double range)
+              const Eigen::MatrixBase<Derived> &p_const,
+              bool init_normal, double range)
 {
-    UNCONST(Derived, p_const, p);
-    if (init_normal == 0)
-    // initialize with uniform distribution in [-range, range]
+  UNCONST(Derived, p_const, p);
+  if (init_normal == 0)
+  // initialize with uniform distribution in [-range, range]
+  {
+    boost::random::uniform_real_distribution<> unif_real(-range, range);
+    for (int i = 0; i < p.size(); i++)
     {
-        boost::random::uniform_real_distribution<> unif_real(-range, range);
-        for (int i = 0; i < p.size(); i++)
-        {
-            p(i) = unif_real(engine);
-        }
-
+      p(i) = unif_real(engine);
     }
-    else
-    // initialize with gaussian distribution with mean 0 and stdev range
+
+  }
+  else
+  // initialize with gaussian distribution with mean 0 and stdev range
+  {
+    boost::random::normal_distribution<double> unif_normal(0., range);
+    for (int i = 0; i < p.size(); i++)
     {
-        boost::random::normal_distribution<double> unif_normal(0., range);
-        for (int i = 0; i < p.size(); i++)
-        {
-            p(i) = unif_normal(engine);
-        }
+      p(i) = unif_normal(engine);
     }
+  }
 }
 
 template <typename Derived>
 void readMatrix(std::ifstream &TRAININ, Eigen::MatrixBase<Derived> &param_const)
 {
-    UNCONST(Derived, param_const, param);
+  UNCONST(Derived, param_const, param);
+
+  int i = 0;
+  std::string line;
+  std::vector<std::string> fields;
+
+  while (std::getline(TRAININ, line) && line != "")
+  {
+    splitBySpace(line, fields);
+    if (fields.size() != param.cols())
+    {
+      std::ostringstream err;
+      err << "error: wrong number of columns (expected " << param.cols() << ", found " << fields.size() << ")";
+      throw std::runtime_error(err.str());
+    }
 
-    int i = 0;
-    std::string line;
-    std::vector<std::string> fields;
-
-    while (std::getline(TRAININ, line) && line != "")
+    if (i >= param.rows())
     {
-        splitBySpace(line, fields);
-        if (fields.size() != param.cols())
-        {
-            std::ostringstream err;
-            err << "error: wrong number of columns (expected " << param.cols() << ", found " << fields.size() << ")";
-            throw std::runtime_error(err.str());
-        }
-
-        if (i >= param.rows())
-        {
-            std::ostringstream err;
-            err << "error: wrong number of rows (expected " << param.rows() << ", found " << i << ")";
-            throw std::runtime_error(err.str());
-        }
-
-        for (int j=0; j<fields.size(); j++)
-        {
-            param(i,j) = boost::lexical_cast<typename Derived::Scalar>(fields[j]);
-        }
-        i++;
+      std::ostringstream err;
+      err << "error: wrong number of rows (expected " << param.rows() << ", found " << i << ")";
+      throw std::runtime_error(err.str());
     }
-
-    if (i != param.rows())
+
+    for (int j=0; j<fields.size(); j++)
     {
-        std::ostringstream err;
-        err << "error: wrong number of rows (expected " << param.rows() << ", found more)";
-        throw std::runtime_error(err.str());
+      param(i,j) = boost::lexical_cast<typename Derived::Scalar>(fields[j]);
     }
+    i++;
+  }
+
+  if (i != param.rows())
+  {
+    std::ostringstream err;
+    err << "error: wrong number of rows (expected " << param.rows() << ", found more)";
+    throw std::runtime_error(err.str());
+  }
 }
 
 template <typename Derived>
 void readMatrix(const std::string &param_file, const Eigen::MatrixBase<Derived> &param_const)
 {
-    UNCONST(Derived, param_const, param);
-    std::cerr << "Reading data from file: " << param_file << std::endl;
-
-    std::ifstream TRAININ(param_file.c_str());
-    if (!TRAININ)
-    {
-        std::cerr << "Error: can't read training data from file " << param_file << std::endl;
-        exit(-1);
-    }
-    readMatrix(TRAININ, param);
-    TRAININ.close();
+  UNCONST(Derived, param_const, param);
+  std::cerr << "Reading data from file: " << param_file << std::endl;
+
+  std::ifstream TRAININ(param_file.c_str());
+  if (!TRAININ)
+  {
+    std::cerr << "Error: can't read training data from file " << param_file << std::endl;
+    exit(-1);
+  }
+  readMatrix(TRAININ, param);
+  TRAININ.close();
 }
 
 template <typename Derived>
 void writeMatrix(const Eigen::MatrixBase<Derived> &param, const std::string &filename)
 {
-    std::cerr << "Writing parameters to " << filename << std::endl;
+  std::cerr << "Writing parameters to " << filename << std::endl;
 
-    std::ofstream OUT;
-    OUT.precision(16);
-    OUT.open(filename.c_str());
-    if (! OUT)
-    {
-        std::cerr << "Error: can't write to file " << filename<< std::endl;
-        exit(-1);
-    }
-    writeMatrix(param, OUT);
-    OUT.close();
+  std::ofstream OUT;
+  OUT.precision(16);
+  OUT.open(filename.c_str());
+  if (! OUT)
+  {
+    std::cerr << "Error: can't write to file " << filename<< std::endl;
+    exit(-1);
+  }
+  writeMatrix(param, OUT);
+  OUT.close();
 }
 
 template <typename Derived>
 void writeMatrix(const Eigen::MatrixBase<Derived> &param, std::ofstream &OUT)
 {
-    for (int row = 0;row < param.rows();row++)
+  for (int row = 0;row < param.rows();row++)
+  {
+    int col;
+    for (col = 0;col < param.cols()-1;col++)
     {
-        int col;
-        for (col = 0;col < param.cols()-1;col++)
-        {
-            OUT<<param(row,col)<<"\t";
-        }
-        //dont want an extra tab at the end
-        OUT<<param(row,col)<<std::endl;
+      OUT<<param(row,col)<<"\t";
     }
+    //dont want an extra tab at the end
+    OUT<<param(row,col)<<std::endl;
+  }
 }
 
 template <typename Derived>
 double logsum(const Eigen::MatrixBase<Derived> &v)
 {
-    int mi;
-    double m = v.maxCoeff(&mi);
-    double logz = 0.0;
-    for (int i=0; i<v.rows(); i++)
-        if (i != mi)
-            logz += std::exp(v(i) - m);
-    logz = log1p(logz) + m;
-    return logz;
+  int mi;
+  double m = v.maxCoeff(&mi);
+  double logz = 0.0;
+  for (int i=0; i<v.rows(); i++)
+    if (i != mi)
+      logz += std::exp(v(i) - m);
+  logz = log1p(logz) + m;
+  return logz;
 }
 
 double logadd(double x, double y);
 
 #ifdef USE_CHRONO
-class Timer
+class Timer
 {
-    typedef boost::chrono::high_resolution_clock clock_type;
-    typedef clock_type::time_point time_type;
-    typedef clock_type::duration duration_type;
-    std::vector<time_type> m_start;
-    std::vector<duration_type> m_total;
-public:
-    Timer() { }
-    Timer(int n) { resize(n); }
-    void resize(int n) { m_start.resize(n); m_total.resize(n); }
-    int size() const { return m_start.size(); }
-    void start(int i);
-    void stop(int i);
-    void reset(int i);
-    double get(int i) const;
+  typedef boost::chrono::high_resolution_clock clock_type;
+  typedef clock_type::time_point time_type;
+  typedef clock_type::duration duration_type;
+  std::vector<time_type> m_start;
+  std::vector<duration_type> m_total;
+ public:
+  Timer() { }
+  Timer(int n) { resize(n); }
+  void resize(int n) { m_start.resize(n); m_total.resize(n); }
+  int size() const { return m_start.size(); }
+  void start(int i);
+  void stop(int i);
+  void reset(int i);
+  double get(int i) const;
 };
 
 extern Timer timer;
 
 #define start_timer(x) timer.start(x)
 #define stop_timer(x) timer.stop(x)
 #else
-#define start_timer(x) 0
-#define stop_timer(x) 0
+#define start_timer(x) (void)0
+#define stop_timer(x) (void)0
 #endif
 
 int setup_threads(int n_threads);
 
diff --git a/src/vocabulary.h b/src/vocabulary.h
index a987522..c8cd518 100644
--- a/src/vocabulary.h
+++ b/src/vocabulary.h
@@ -5,6 +5,9 @@
 #include <string>
 #include <queue>
 #include <boost/unordered_map.hpp>
+#include "find_string.hpp"
+
+#define NPLM_HAVE_FIND_STRING_PIECE 1
 
 namespace nplm
 {
@@ -16,80 +19,83 @@ struct compare_second
 };
 
 class vocabulary {
-    std::vector<std::string> m_words;
-    boost::unordered_map<std::string, int> m_index;
-    int unk;
-
-public:
-    vocabulary()
-    {
-        unk = insert_word("<unk>");
-    }
-
-    vocabulary(const std::vector<std::string> &words)
+  std::vector<std::string> m_words;
+  typedef boost::unordered_map<std::string, int> WordId;
+  WordId m_index;
+  int unk;
+
+ public:
+  vocabulary()
+  {
+    unk = insert_word("<unk>");
+  }
+
+  vocabulary(const std::vector<std::string> &words)
     : m_words(words)
+  {
+    for (int i=0; i<words.size(); i++)
+      m_index[words[i]] = i;
+    unk = m_index["<unk>"];
+  }
+
+  int lookup_word(const std::string &word) const
+  {
+    return lookup_word(word, unk);
+  }
+
+  // lookup word using custom unknown-word id
+  int lookup_word(const std::string &word, int unkid) const
+  {
+    WordId::const_iterator pos
= m_index.find(word); + return pos == m_index.end() ? unkid : pos->second; + } + + int lookup_word(std::pair<char const*, char const*> slice) const { + return lookup_word(slice, unk); + } + + int lookup_word(std::pair<char const*, char const*> slice, int unkid) const + { + WordId::const_iterator pos = find_string(m_index, slice); + return pos == m_index.end() ? unkid : pos->second; + } + + int insert_word(const std::string &word) + { + int i = size(); + bool inserted = m_index.insert(make_pair(word, i)).second; + if (inserted) { - for (int i=0; i<words.size(); i++) - m_index[words[i]] = i; - unk = m_index["<unk>"]; - } - - int lookup_word(const std::string &word) const - { - boost::unordered_map<std::string, int>::const_iterator pos = m_index.find(word); - if (pos != m_index.end()) - return pos->second; - else - return unk; + m_words.push_back(word); } + return i; + } - // lookup word using custom unknown-word id - int lookup_word(const std::string &word, int unk) const - { - boost::unordered_map<std::string, int>::const_iterator pos = m_index.find(word); - if (pos != m_index.end()) - return pos->second; - else - return unk; - } + int size() const { return m_words.size(); } - int insert_word(const std::string &word) - { - int i = size(); - bool inserted = m_index.insert(make_pair(word, i)).second; - if (inserted) - { - m_words.push_back(word); - } - return i; - } + // Inserts the most-frequent words from counts until vocab_size words are reached. + // counts is a collection of pair<string,int> + template <typename Map> + int insert_most_frequent(const Map &counts, int vocab_size) + { + typedef std::pair<std::string,int> stringint; - int size() const { return m_words.size(); } + std::priority_queue<stringint,std::vector<stringint>,compare_second<stringint> > + q(compare_second<stringint>(), std::vector<stringint>(counts.begin(), counts.end())); - // Inserts the most-frequent words from counts until vocab_size words are reached. - // counts is a collection of pair<string,int> - template <typename Map> - int insert_most_frequent(const Map &counts, int vocab_size) + int inserted = 0; + while (size() < vocab_size && !q.empty()) { - typedef std::pair<std::string,int> stringint; - - std::priority_queue<stringint,std::vector<stringint>,compare_second<stringint> > - q(compare_second<stringint>(), std::vector<stringint>(counts.begin(), counts.end())); - - int inserted = 0; - while (size() < vocab_size && !q.empty()) - { - insert_word(q.top().first); - q.pop(); - inserted++; - } - return inserted; + insert_word(q.top().first); + q.pop(); + inserted++; } + return inserted; + } - const std::vector<std::string> &words() const { return m_words; } + const std::vector<std::string> &words() const { return m_words; } - const boost::unordered_map<std::string, int>& get_idmap() const { return m_index; } }; } // namespace nplm |
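
Note on the new vocabulary lookup API (editorial sketch, not part of the patch): besides the whitespace cleanup, the vocabulary.h hunk above adds lookup_word overloads that take a std::pair<char const*, char const*> character range and resolve it through find_string from the newly added find_string.hpp (advertised by the NPLM_HAVE_FIND_STRING_PIECE macro), so a caller can look up tokens without building a temporary std::string per token. A minimal usage sketch follows; the tokenizing loop and the sample words are hypothetical and only illustrate the slice-based overload:

// sketch.cpp -- hypothetical caller; only vocabulary, insert_word, and the
// slice-based lookup_word overload come from the patched vocabulary.h.
#include <cstdio>
#include <cctype>
#include <utility>
#include "vocabulary.h"

int main()
{
  nplm::vocabulary vocab;        // the default constructor inserts "<unk>"
  vocab.insert_word("the");
  vocab.insert_word("cat");

  const char *line = "the cat sat";
  for (const char *p = line; *p; )
  {
    const char *begin = p;
    while (*p && !std::isspace(static_cast<unsigned char>(*p))) ++p;   // find end of token
    // Look the token up from a (begin, end) slice -- no temporary std::string:
    int id = vocab.lookup_word(std::make_pair(begin, p));
    std::printf("%.*s -> %d\n", static_cast<int>(p - begin), begin, id);
    while (*p && std::isspace(static_cast<unsigned char>(*p))) ++p;    // skip whitespace
  }
  return 0;
}

Here "sat" is not in the vocabulary, so the lookup falls back to the id of <unk>, exactly as the string overload does. Separately, the util.h hunk switches the no-op start_timer/stop_timer macros from 0 to (void)0, presumably so that a bare start_timer(x); statement compiles without unused-value warnings when USE_CHRONO is not defined.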