author      Rico Sennrich <rico.sennrich@gmx.ch>   2015-08-27 16:12:52 +0300
committer   Rico Sennrich <rico.sennrich@gmx.ch>   2015-08-27 16:12:52 +0300
commit      9dea3fe1329ce392b9e420c89663f7bddabb068a (patch)
tree        cec3017bade98c13fee73edd817d228cf4214abe
parent      55bf63ddd73978b46e19f3d8c5606d5677cf560f (diff)
Makefile option (-DNPLM_DOUBLE_PRECISION) to switch between double and float.
-rw-r--r--   src/Activation_function.h    |  14
-rw-r--r--   src/Makefile                 |   3
-rw-r--r--   src/SoftmaxLoss.h            |  22
-rw-r--r--   src/USCMatrix.h              |   8
-rw-r--r--   src/clipper.h                |   2
-rw-r--r--   src/graphClasses.h           |   8
-rw-r--r--   src/model.cpp                |   8
-rw-r--r--   src/model.h                  |  18
-rw-r--r--   src/neuralClasses.h          | 190
-rw-r--r--   src/neuralLM.h               |   6
-rw-r--r--   src/neuralNetwork.h          |  20
-rw-r--r--   src/neuralTM.h               |   6
-rw-r--r--   src/param.h                  |  24
-rw-r--r--   src/propagator.h             |  28
-rw-r--r--   src/testNeuralLM.cpp         |  12
-rw-r--r--   src/testNeuralNetwork.cpp    |   8
-rw-r--r--   src/trainNeuralNetwork.cpp   |  56
-rw-r--r--   src/util.cpp                 |   2
-rw-r--r--   src/util.h                   |  22
19 files changed, 236 insertions, 221 deletions
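
The switch works by routing every former double through a typedef, user_data_t, that is resolved at compile time (added to src/param.h and src/util.h below). A minimal self-contained sketch of that mechanism follows, assuming a standalone translation unit; the demo main() and its output are illustrative and not part of the commit.

// Minimal sketch of the precision switch introduced by this commit
// (cf. the typedef added to src/param.h and src/util.h below).
// The demo main() is illustrative only and not part of the repository.
#include <iostream>

#ifdef NPLM_DOUBLE_PRECISION
typedef double user_data_t;   // macro defined   -> 8-byte double precision
#else
typedef float user_data_t;    // macro undefined -> 4-byte single precision
#endif

// The same signature rewrite is applied across the code base, e.g. the
// hardtanh functor in src/Activation_function.h:
struct hardtanh_functor {
    user_data_t operator() (user_data_t x) const {
        if (x < -1.) return -1.;
        else if (x > 1.) return 1.;
        else return x;
    }
};

int main() {
    std::cout << "sizeof(user_data_t) = " << sizeof(user_data_t) << " bytes\n";
    std::cout << "hardtanh(1.7) = " << hardtanh_functor()(1.7) << "\n";
    return 0;
}

Compiling the same sources with the macro defined gives 8-byte arithmetic everywhere; leaving it undefined gives 4-byte floats, which the new Makefile comment recommends on GPUs for speed.
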
diff --git a/src/Activation_function.h b/src/Activation_function.h index 742c2fc..2eec74d 100644 --- a/src/Activation_function.h +++ b/src/Activation_function.h @@ -43,27 +43,27 @@ inline std::string activation_function_to_string (activation_function_type f) } struct hardtanh_functor { - double operator() (double x) const { if (x < -1.) return -1.; else if (x > 1.) return 1.; else return x; } + user_data_t operator() (user_data_t x) const { if (x < -1.) return -1.; else if (x > 1.) return 1.; else return x; } }; struct dhardtanh_functor { - double operator() (double x) const { return x > -1. && x < 1. ? 1. : 0.; } + user_data_t operator() (user_data_t x) const { return x > -1. && x < 1. ? 1. : 0.; } }; struct tanh_functor { - double operator() (double x) const { return std::tanh(x); } + user_data_t operator() (user_data_t x) const { return std::tanh(x); } }; struct dtanh_functor { - double operator() (double x) const { return 1-x*x; } + user_data_t operator() (user_data_t x) const { return 1-x*x; } }; struct rectifier_functor { - double operator() (double x) const { return std::max(x, 0.); } + user_data_t operator() (user_data_t x) const { return std::max(x, 0.); } }; struct drectifier_functor { - double operator() (double x) const { return x > 0. ? 1. : 0.; } + user_data_t operator() (user_data_t x) const { return x > 0. ? 1. : 0.; } }; class Activation_function @@ -78,7 +78,7 @@ class Activation_function void set_activation_function(activation_function_type f) { this->f = f; } template <typename Engine> - void initialize(Engine &engine, bool init_normal, double init_range) { } + void initialize(Engine &engine, bool init_normal, user_data_t init_range) { } int n_inputs () const { return size; } int n_outputs () const { return size; } diff --git a/src/Makefile b/src/Makefile index 2a27405..fd2a665 100644 --- a/src/Makefile +++ b/src/Makefile @@ -8,6 +8,9 @@ CXX=g++ #CFLAGS=-g CFLAGS=-O3 -DEIGEN_NO_DEBUG -DNDEBUG $(CXXFLAGS) +# switch between single precision and double precision (single precision recommended on GPU for speed) +CFLAGS+= -DNPLM_DOUBLE_PRECISION=0 + # Architecture. Set to x86_64 or i686 to override. ARCH:=$(shell uname -m) # Operating system. Set to override (the only option that makes any difference is Darwin). 
diff --git a/src/SoftmaxLoss.h b/src/SoftmaxLoss.h index d89cde6..cdd66c8 100644 --- a/src/SoftmaxLoss.h +++ b/src/SoftmaxLoss.h @@ -40,16 +40,16 @@ inline std::string loss_function_to_string (loss_function_type f) struct SoftmaxLogLoss { template <typename DerivedI, typename DerivedW, typename DerivedO> - void fProp(const MatrixBase<DerivedI> &input, const MatrixBase<DerivedW> &output_words, const MatrixBase<DerivedO> &output_const, double &loss) + void fProp(const MatrixBase<DerivedI> &input, const MatrixBase<DerivedW> &output_words, const MatrixBase<DerivedO> &output_const, user_data_t &loss) { UNCONST(DerivedO, output_const, output); - double log_likelihood = 0.0; + user_data_t log_likelihood = 0.0; #pragma omp parallel for reduction(+:log_likelihood) for (int train_id = 0; train_id < input.cols(); train_id++) { - double normalization = logsum(input.col(train_id)); + user_data_t normalization = logsum(input.col(train_id)); output.col(train_id).array() = input.col(train_id).array() - normalization; log_likelihood += output(output_words(train_id), train_id); } @@ -91,12 +91,12 @@ class SoftmaxNCELoss template <typename DerivedI, typename DerivedW, typename DerivedO> void fProp(const MatrixBase<DerivedI> &scores, const MatrixBase<DerivedW> &minibatch_samples, - const MatrixBase<DerivedO> &output_const, double &loss) + const MatrixBase<DerivedO> &output_const, user_data_t &loss) { UNCONST(DerivedO, output_const, output); - double log_likelihood = 0.0; + user_data_t log_likelihood = 0.0; int num_noise_samples = minibatch_samples.rows()-1; - double log_num_noise_samples = std::log(num_noise_samples); + user_data_t log_num_noise_samples = std::log(num_noise_samples); #pragma omp parallel for reduction(+:log_likelihood) schedule(static) for (int train_id = 0; train_id < scores.cols(); train_id++) { @@ -106,11 +106,11 @@ class SoftmaxNCELoss // To avoid zero or infinite probabilities, // never take exp of score without normalizing first, // even if it's a little slower... - double score = scores(sample_id, train_id); - double score_noise = log_num_noise_samples + unigram.logprob(sample); - double z = logadd(score, score_noise); - double logprob = score - z; - double logprob_noise = score_noise - z; + user_data_t score = scores(sample_id, train_id); + user_data_t score_noise = log_num_noise_samples + unigram.logprob(sample); + user_data_t z = logadd(score, score_noise); + user_data_t logprob = score - z; + user_data_t logprob_noise = score_noise - z; output(sample_id, train_id) = std::exp(logprob); log_likelihood += sample_id == 0 ? logprob : logprob_noise; } diff --git a/src/USCMatrix.h b/src/USCMatrix.h index 784fa1b..821703a 100644 --- a/src/USCMatrix.h +++ b/src/USCMatrix.h @@ -70,7 +70,7 @@ class USCMatrix // Dense matrix - sparse matrix product // a is presumably very wide template <typename DerivedA, typename ScalarB, typename Index, typename DerivedC> -void uscgemm(double alpha, const MatrixBase<DerivedA> &a, +void uscgemm(user_data_t alpha, const MatrixBase<DerivedA> &a, const USCMatrix<ScalarB,Index> &b, const MatrixBase<DerivedC> &c_const) { @@ -92,7 +92,7 @@ void uscgemm(double alpha, const MatrixBase<DerivedA> &a, // sparse matrix - dense matrix product template <typename ScalarA, typename Index, typename DerivedB, typename DerivedC> -void uscgemm(double alpha, +void uscgemm(user_data_t alpha, const USCMatrix<ScalarA,Index> &a, const MatrixBase<DerivedB> &b, const MatrixBase<DerivedC> &c_const) @@ -145,7 +145,7 @@ void uscgemm(double alpha, // For b, column-major is preferred. 
template <typename DerivedA, typename DerivedB, typename ScalarC, typename Index> -void uscgemm_masked(double alpha, +void uscgemm_masked(user_data_t alpha, const MatrixBase<DerivedA> &a, const MatrixBase<DerivedB> &b, USCMatrix<ScalarC,Index> &c) @@ -167,7 +167,7 @@ void uscgemm_masked(double alpha, // sparse matrix - dense vector product template <typename ScalarA, typename Index, typename DerivedB, typename DerivedC> -void uscgemv(double alpha, +void uscgemv(user_data_t alpha, const USCMatrix<ScalarA,Index> &a, const MatrixBase<DerivedB> &b, const MatrixBase<DerivedC> &c_const) diff --git a/src/clipper.h b/src/clipper.h index dda5c4d..e8c96a6 100644 --- a/src/clipper.h +++ b/src/clipper.h @@ -3,7 +3,7 @@ namespace nplm { struct Clipper{ - double operator() (double x) const { + user_data_t operator() (user_data_t x) const { return std::min(0.5, std::max(x,-0.5)); //return(x); } diff --git a/src/graphClasses.h b/src/graphClasses.h index cd80a4c..1282048 100644 --- a/src/graphClasses.h +++ b/src/graphClasses.h @@ -14,8 +14,8 @@ class Node { X * param; //what parameter is this //vector <void *> children; //vector <void *> parents; - Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> fProp_matrix; - Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> bProp_matrix; + Eigen::Matrix<user_data_t,Eigen::Dynamic,Eigen::Dynamic> fProp_matrix; + Eigen::Matrix<user_data_t,Eigen::Dynamic,Eigen::Dynamic> bProp_matrix; int minibatch_size; public: @@ -44,11 +44,11 @@ class Node { void resize() { resize(minibatch_size); } /* - void Fprop(Matrix<double,Dynamic,Dynamic> & input,int n_cols) + void Fprop(Matrix<user_data_t,Dynamic,Dynamic> & input,int n_cols) { param->fProp(input,fProp_matrix,0,0,n_cols); } - void Fprop(Matrix<double,1,Dynamic> & input,int n_cols) + void Fprop(Matrix<user_data_t,1,Dynamic> & input,int n_cols) { param->fProp(input,fProp_matrix,0,0,n_cols); } diff --git a/src/model.cpp b/src/model.cpp index db7f006..9a82faf 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -46,10 +46,10 @@ void model::resize(int ngram_size, void model::initialize(boost::random::mt19937 &init_engine, bool init_normal, - double init_range, - double init_bias, + user_data_t init_range, + user_data_t init_bias, string ¶meter_update, - double adagrad_epsilon) + user_data_t adagrad_epsilon) { input_layer.initialize(init_engine, init_normal, @@ -79,7 +79,7 @@ void model::premultiply() // Since input and first_hidden_linear are both linear, // we can multiply them into a single linear layer *if* we are not training int context_size = ngram_size-1; - Matrix<double,Dynamic,Dynamic> U = first_hidden_linear.U; + Matrix<user_data_t,Dynamic,Dynamic> U = first_hidden_linear.U; if (num_hidden == 0) { first_hidden_linear.U.resize(output_embedding_dimension, input_vocab_size * context_size); diff --git a/src/model.h b/src/model.h index 3cce06a..4eaf5f5 100644 --- a/src/model.h +++ b/src/model.h @@ -20,7 +20,7 @@ public: Linear_layer second_hidden_linear; Activation_function second_hidden_activation; Output_word_embeddings output_layer; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> output_embedding_matrix, + Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor> output_embedding_matrix, input_embedding_matrix, input_and_output_embedding_matrix; @@ -37,13 +37,13 @@ public: bool share_embeddings) { if (share_embeddings){ - input_and_output_embedding_matrix = Matrix<double,Dynamic,Dynamic,Eigen::RowMajor>(); + input_and_output_embedding_matrix = Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor>(); 
input_layer.set_W(&input_and_output_embedding_matrix); output_layer.set_W(&input_and_output_embedding_matrix); } else { - input_embedding_matrix = Matrix<double,Dynamic,Dynamic,Eigen::RowMajor>(); - output_embedding_matrix = Matrix<double,Dynamic,Dynamic,Eigen::RowMajor>(); + input_embedding_matrix = Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor>(); + output_embedding_matrix = Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor>(); input_layer.set_W(&input_embedding_matrix); output_layer.set_W(&output_embedding_matrix); } @@ -57,8 +57,8 @@ public: model() : ngram_size(1), premultiplied(false), activation_function(Rectifier), - output_embedding_matrix(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor>()), - input_embedding_matrix(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor>()) + output_embedding_matrix(Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor>()), + input_embedding_matrix(Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor>()) { output_layer.set_W(&output_embedding_matrix); input_layer.set_W(&input_embedding_matrix); @@ -73,10 +73,10 @@ public: void initialize(boost::random::mt19937 &init_engine, bool init_normal, - double init_range, - double init_bias, + user_data_t init_range, + user_data_t init_bias, string ¶meter_udpate, - double adagrad_epsilon); + user_data_t adagrad_epsilon); void set_activation_function(activation_function_type f) { diff --git a/src/neuralClasses.h b/src/neuralClasses.h index 9ead99f..182db0d 100644 --- a/src/neuralClasses.h +++ b/src/neuralClasses.h @@ -34,7 +34,7 @@ using Eigen::Dynamic; typedef boost::unordered_map<int,bool> int_map; struct Clipper{ - double operator() (double x) const { + user_data_t operator() (user_data_t x) const { return std::min(0.5, std::max(x,-0.5)); //return(x); } @@ -44,17 +44,17 @@ struct Clipper{ class Linear_layer { private: - Matrix<double,Dynamic,Dynamic> U; - Matrix<double,Dynamic,Dynamic> U_gradient; - Matrix<double,Dynamic,Dynamic> U_velocity; - Matrix<double,Dynamic,Dynamic> U_running_gradient; - Matrix<double,Dynamic,Dynamic> U_running_parameter_update; + Matrix<user_data_t,Dynamic,Dynamic> U; + Matrix<user_data_t,Dynamic,Dynamic> U_gradient; + Matrix<user_data_t,Dynamic,Dynamic> U_velocity; + Matrix<user_data_t,Dynamic,Dynamic> U_running_gradient; + Matrix<user_data_t,Dynamic,Dynamic> U_running_parameter_update; // Biases - Matrix<double,Dynamic,1> b; - Matrix<double,Dynamic,1> b_velocity; - Matrix<double,Dynamic,1> b_running_gradient; - Matrix<double,Dynamic,1> b_running_parameter_update; - Matrix<double,Dynamic,1> b_gradient; + Matrix<user_data_t,Dynamic,1> b; + Matrix<user_data_t,Dynamic,1> b_velocity; + Matrix<user_data_t,Dynamic,1> b_running_gradient; + Matrix<user_data_t,Dynamic,1> b_running_parameter_update; + Matrix<user_data_t,Dynamic,1> b_gradient; friend class model; @@ -84,13 +84,13 @@ class Linear_layer template <typename Engine> void initialize(Engine &engine, bool init_normal, - double init_range, + user_data_t init_range, string ¶meter_update, - double adagrad_epsilon) + user_data_t adagrad_epsilon) { if (parameter_update == "ADA") { - U_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(U.rows(),U.cols())*adagrad_epsilon; - b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon; + U_running_gradient = Matrix<user_data_t,Dynamic,Dynamic>::Ones(U.rows(),U.cols())*adagrad_epsilon; + b_running_gradient = Matrix<user_data_t,Dynamic,1>::Ones(b.size())*adagrad_epsilon; } if (parameter_update == "ADAD") { U_running_gradient.setZero(U.rows(),U.cols()); @@ -147,7 +147,7 @@ class 
Linear_layer template <typename DerivedGOut, typename DerivedIn> void computeGradient( const MatrixBase<DerivedGOut> &bProp_input, const MatrixBase<DerivedIn> &fProp_input, - double learning_rate, double momentum, double L2_reg) + user_data_t learning_rate, user_data_t momentum, user_data_t L2_reg) { U_gradient.noalias() = bProp_input*fProp_input.transpose(); @@ -185,8 +185,8 @@ class Linear_layer template <typename DerivedGOut, typename DerivedIn> void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input, const MatrixBase<DerivedIn> &fProp_input, - double learning_rate, - double L2_reg) + user_data_t learning_rate, + user_data_t L2_reg) { U_gradient.noalias() = bProp_input*fProp_input.transpose(); @@ -224,15 +224,15 @@ class Linear_layer template <typename DerivedGOut, typename DerivedIn> void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input, const MatrixBase<DerivedIn> &fProp_input, - double learning_rate, - double L2_reg, - double conditioning_constant, - double decay) + user_data_t learning_rate, + user_data_t L2_reg, + user_data_t conditioning_constant, + user_data_t decay) { //cerr<<"decay is "<<decay<<" and conditioning constant is "<<conditioning_constant<<endl; U_gradient.noalias() = bProp_input*fProp_input.transpose(); - Array<double,Dynamic,1> b_current_parameter_update; + Array<user_data_t,Dynamic,1> b_current_parameter_update; // get the bias gradient for all dimensions in parallel int size = b.size(); @@ -248,7 +248,7 @@ class Linear_layer #pragma omp parallel for //cerr<<"U gradient is "<<U_gradient<<endl; for (int col=0; col<U.cols(); col++) { - Array<double,Dynamic,1> U_current_parameter_update; + Array<user_data_t,Dynamic,1> U_current_parameter_update; U_running_gradient.col(col) = decay*U_running_gradient.col(col) + (1-decay)*U_gradient.col(col).array().square().matrix(); //cerr<<"U running gradient is "<<U_running_gradient.col(col)<<endl; @@ -288,18 +288,18 @@ class Output_word_embeddings { private: // row-major is better for uscgemm - //Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W; + //Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor> W; // Having W be a pointer to a matrix allows ease of sharing // input and output word embeddings - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W; - std::vector<double> W_data; - Matrix<double,Dynamic,1> b; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update; - Matrix<double,Dynamic,1> b_running_gradient; - Matrix<double,Dynamic,1> b_gradient; - Matrix<double,Dynamic,1> b_running_parameter_update; + Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor> *W; + std::vector<user_data_t> W_data; + Matrix<user_data_t,Dynamic,1> b; + Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient; + Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor> W_gradient; + Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update; + Matrix<user_data_t,Dynamic,1> b_running_gradient; + Matrix<user_data_t,Dynamic,1> b_gradient; + Matrix<user_data_t,Dynamic,1> b_running_parameter_update; public: Output_word_embeddings() { } @@ -310,7 +310,7 @@ class Output_word_embeddings W->setZero(rows, cols); b.setZero(rows); } - void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) { + void set_W(Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor> *input_W) { W = input_W; } void read_weights(std::ifstream &W_file) { readMatrix(W_file, *W); 
} @@ -321,17 +321,17 @@ class Output_word_embeddings template <typename Engine> void initialize(Engine &engine, bool init_normal, - double init_range, - double init_bias, + user_data_t init_range, + user_data_t init_bias, string ¶meter_update, - double adagrad_epsilon) + user_data_t adagrad_epsilon) { W_gradient.setZero(W->rows(),W->cols()); b_gradient.setZero(b.size()); if (parameter_update == "ADA") { - W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon; - b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon; + W_running_gradient = Matrix<user_data_t,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon; + b_running_gradient = Matrix<user_data_t,Dynamic,1>::Ones(b.size())*adagrad_epsilon; //W_gradient.setZero(W->rows(),W->cols()); //b_gradient.setZero(b.size()); } @@ -359,9 +359,9 @@ class Output_word_embeddings my_output = ((*W) * input).colwise() + b; /* TODO: without EIGEN_NO_DEBUG - is this a bug? ProductBase.h:102: Eigen::ProductBase<Derived, Lhs, Rhs>::ProductBase(const Lhs& - , const Rhs&) [with Derived = Eigen::GeneralProduct<Eigen::Matrix<double, -1, -1 - , 1>, Eigen::Matrix<double, -1, -1>, 5>; Lhs = Eigen::Matrix<double, -1, -1, 1>; - Rhs = Eigen::Matrix<double, -1, -1>]: Assertion `a_lhs.cols() == a_rhs.rows() & + , const Rhs&) [with Derived = Eigen::GeneralProduct<Eigen::Matrix<user_data_t, -1, -1 + , 1>, Eigen::Matrix<user_data_t, -1, -1>, 5>; Lhs = Eigen::Matrix<user_data_t, -1, -1, 1>; + Rhs = Eigen::Matrix<user_data_t, -1, -1>]: Assertion `a_lhs.cols() == a_rhs.rows() & & "invalid matrix product" && "if you wanted a coeff-wise or a dot product use t he respective explicit functions"' failed. @@ -394,14 +394,14 @@ class Output_word_embeddings my_output(sample_id, instance_id) = b(samples(sample_id, instance_id)); } } - USCMatrix<double> sparse_output(W->rows(), samples, my_output); + USCMatrix<user_data_t> sparse_output(W->rows(), samples, my_output); uscgemm_masked(1.0, *W, input, sparse_output); my_output = sparse_output.values; // too bad, so much copying } // Return single element of output matrix template <typename DerivedIn> - double fProp(const MatrixBase<DerivedIn> &input, + user_data_t fProp(const MatrixBase<DerivedIn> &input, int word, int instance) const { @@ -425,8 +425,8 @@ class Output_word_embeddings template <typename DerivedIn, typename DerivedGOut> void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings, const MatrixBase<DerivedGOut> &bProp_input, - double learning_rate, - double momentum) //not sure if we want to use momentum here + user_data_t learning_rate, + user_data_t momentum) //not sure if we want to use momentum here { // W is vocab_size x output_embedding_dimension // b is vocab_size x 1 @@ -451,7 +451,7 @@ class Output_word_embeddings void computeGradientAdagrad( const MatrixBase<DerivedIn> &predicted_embeddings, const MatrixBase<DerivedGOut> &bProp_input, - double learning_rate) //not sure if we want to use momentum here + user_data_t learning_rate) //not sure if we want to use momentum here { // W is vocab_size x output_embedding_dimension // b is vocab_size x 1 @@ -475,16 +475,16 @@ class Output_word_embeddings template <typename DerivedIn, typename DerivedGOut> void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings, const MatrixBase<DerivedGOut> &bProp_input, - double learning_rate, - double conditioning_constant, - double decay) //not sure if we want to use momentum here + user_data_t learning_rate, + user_data_t 
conditioning_constant, + user_data_t decay) //not sure if we want to use momentum here { // W is vocab_size x output_embedding_dimension // b is vocab_size x 1 // predicted_embeddings is output_embedding_dimension x minibatch_size // bProp_input is vocab_size x minibatch_size - Array<double,Dynamic,Dynamic> W_current_parameter_update; - Array<double,Dynamic,1> b_current_parameter_update; + Array<user_data_t,Dynamic,Dynamic> W_current_parameter_update; + Array<user_data_t,Dynamic,1> b_current_parameter_update; W_gradient.setZero(W->rows(), W->cols()); b_gradient.setZero(b.size()); W_gradient.noalias() = bProp_input * predicted_embeddings.transpose(); @@ -519,7 +519,7 @@ class Output_word_embeddings my_bProp_matrix.setZero(); uscgemm(1.0, W->transpose(), - USCMatrix<double>(W->rows(), samples, weights), + USCMatrix<user_data_t>(W->rows(), samples, weights), my_bProp_matrix.leftCols(samples.cols())); // narrow bProp_matrix for possible short minibatch } @@ -527,29 +527,29 @@ class Output_word_embeddings void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings, const MatrixBase<DerivedGOutI> &samples, const MatrixBase<DerivedGOutV> &weights, - double learning_rate, double momentum) //not sure if we want to use momentum here + user_data_t learning_rate, user_data_t momentum) //not sure if we want to use momentum here { //cerr<<"in gradient"<<endl; - USCMatrix<double> gradient_output(W->rows(), samples, weights); + USCMatrix<user_data_t> gradient_output(W->rows(), samples, weights); uscgemm(learning_rate, gradient_output, predicted_embeddings.leftCols(gradient_output.cols()).transpose(), *W); // narrow predicted_embeddings for possible short minibatch uscgemv(learning_rate, gradient_output, - Matrix<double,Dynamic,1>::Ones(gradient_output.cols()), + Matrix<user_data_t,Dynamic,1>::Ones(gradient_output.cols()), b); /* //IN ORDER TO IMPLEMENT CLIPPING, WE HAVE TO COMPUTE THE GRADIENT //FIRST - USCMatrix<double> gradient_output(W->rows(), samples, weights); + USCMatrix<user_data_t> gradient_output(W->rows(), samples, weights); uscgemm(1.0, gradient_output, predicted_embeddings.leftCols(samples.cols()).transpose(), W_gradient); uscgemv(1.0, gradient_output, - Matrix<double,Dynamic,1>::Ones(weights.cols()), + Matrix<user_data_t,Dynamic,1>::Ones(weights.cols()), b_gradient); int_map update_map; //stores all the parameters that have been updated @@ -571,7 +571,7 @@ class Output_word_embeddings //b(update_item) += learning_rate * b_gradient(update_item); //UPDATE CLIPPING W->row(update_item) += (learning_rate * W_gradient.row(update_item)).array().unaryExpr(Clipper()).matrix(); - double update = learning_rate * b_gradient(update_item); + user_data_t update = learning_rate * b_gradient(update_item); b(update_item) += std::min(0.5, std::max(update,-0.5)); //GRADIENT CLIPPING W_gradient.row(update_item).setZero(); @@ -585,19 +585,19 @@ class Output_word_embeddings void computeGradientAdagrad(const MatrixBase<DerivedIn> &predicted_embeddings, const MatrixBase<DerivedGOutI> &samples, const MatrixBase<DerivedGOutV> &weights, - double learning_rate) //not sure if we want to use momentum here + user_data_t learning_rate) //not sure if we want to use momentum here { //W_gradient.setZero(W->rows(), W->cols()); //b_gradient.setZero(b.size()); //FOR CLIPPING, WE DO NOT MULTIPLY THE GRADIENT WITH THE LEARNING RATE - USCMatrix<double> gradient_output(W->rows(), samples, weights); + USCMatrix<user_data_t> gradient_output(W->rows(), samples, weights); uscgemm(1.0, gradient_output, 
predicted_embeddings.leftCols(samples.cols()).transpose(), W_gradient); uscgemv(1.0, gradient_output, - Matrix<double,Dynamic,1>::Ones(weights.cols()), + Matrix<user_data_t,Dynamic,1>::Ones(weights.cols()), b_gradient); int_map update_map; //stores all the parameters that have been updated @@ -622,7 +622,7 @@ class Output_word_embeddings /* //UPDATE CLIPPING W->row(update_item) += (learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt())).unaryExpr(Clipper()).matrix(); - double update = learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item)); + user_data_t update = learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item)); b(update_item) += Clipper(update);//std::min(0.5, std::max(update,-0.5)); */ W_gradient.row(update_item).setZero(); @@ -634,22 +634,22 @@ class Output_word_embeddings void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings, const MatrixBase<DerivedGOutI> &samples, const MatrixBase<DerivedGOutV> &weights, - double learning_rate, - double conditioning_constant, - double decay) //not sure if we want to use momentum here + user_data_t learning_rate, + user_data_t conditioning_constant, + user_data_t decay) //not sure if we want to use momentum here { //cerr<<"decay is "<<decay<<" and constant is "<<conditioning_constant<<endl; //W_gradient.setZero(W->rows(), W->cols()); //b_gradient.setZero(b.size()); - USCMatrix<double> gradient_output(W->rows(), samples, weights); + USCMatrix<user_data_t> gradient_output(W->rows(), samples, weights); uscgemm(1.0, gradient_output, predicted_embeddings.leftCols(samples.cols()).transpose(), W_gradient); uscgemv(1.0, gradient_output, - Matrix<double,Dynamic,1>::Ones(weights.cols()), + Matrix<user_data_t,Dynamic,1>::Ones(weights.cols()), b_gradient); int_map update_map; //stores all the parameters that have been updated @@ -666,8 +666,8 @@ class Output_word_embeddings #pragma omp parallel for for (int item_id=0; item_id<num_items; item_id++) { - Array<double,1,Dynamic> W_current_parameter_update; - double b_current_parameter_update; + Array<user_data_t,1,Dynamic> W_current_parameter_update; + user_data_t b_current_parameter_update; int update_item = update_items[item_id]; W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+ @@ -715,24 +715,24 @@ class Output_word_embeddings UNCONST(DerivedGb, gradient_b, my_gradient_b); my_gradient_W.setZero(); my_gradient_b.setZero(); - USCMatrix<double> gradient_output(W->rows(), samples, weights); + USCMatrix<user_data_t> gradient_output(W->rows(), samples, weights); uscgemm(1.0, gradient_output, predicted_embeddings.leftCols(samples.cols()).transpose(), my_gradient_W); uscgemv(1.0, gradient_output, - Matrix<double,Dynamic,1>::Ones(weights.cols()), my_gradient_b); + Matrix<user_data_t,Dynamic,1>::Ones(weights.cols()), my_gradient_b); } }; class Input_word_embeddings { private: - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W; + Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor> *W; int context_size, vocab_size; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient; + Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient; + Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update; + Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor> W_gradient; friend class model; @@ 
-740,7 +740,7 @@ class Input_word_embeddings Input_word_embeddings() : context_size(0), vocab_size(0) { } Input_word_embeddings(int rows, int cols, int context) { resize(rows, cols, context); } - void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) { + void set_W(Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor> *input_W) { W = input_W; } @@ -762,14 +762,14 @@ class Input_word_embeddings template <typename Engine> void initialize(Engine &engine, bool init_normal, - double init_range, + user_data_t init_range, string ¶meter_update, - double adagrad_epsilon) + user_data_t adagrad_epsilon) { W_gradient.setZero(W->rows(),W->cols()); if (parameter_update == "ADA") { - W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon; + W_running_gradient = Matrix<user_data_t,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon; //W_gradient.setZero(W->rows(),W->cols()); } if (parameter_update == "ADAD") { @@ -820,7 +820,7 @@ class Input_word_embeddings // so narrow output to match uscgemm(1.0, W->transpose(), - USCMatrix<double>(W->rows(),input.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input.cols())), + USCMatrix<user_data_t>(W->rows(),input.middleRows(ngram, 1),Matrix<user_data_t,1,Dynamic>::Ones(input.cols())), my_output.block(ngram*embedding_dimension, 0, embedding_dimension, input.cols())); } } @@ -840,7 +840,7 @@ class Input_word_embeddings template <typename DerivedGOut, typename DerivedIn> void computeGradient(const MatrixBase<DerivedGOut> &bProp_input, const MatrixBase<DerivedIn> &input_words, - double learning_rate, double momentum, double L2_reg) + user_data_t learning_rate, user_data_t momentum, user_data_t L2_reg) { int embedding_dimension = W->cols(); @@ -857,7 +857,7 @@ class Input_word_embeddings for (int ngram=0; ngram<context_size; ngram++) { uscgemm(learning_rate, - USCMatrix<double>(W->rows(), input_words.middleRows(ngram, 1), Matrix<double,1,Dynamic>::Ones(input_words.cols())), + USCMatrix<user_data_t>(W->rows(), input_words.middleRows(ngram, 1), Matrix<user_data_t,1,Dynamic>::Ones(input_words.cols())), bProp_input.block(ngram*embedding_dimension,0,embedding_dimension,input_words.cols()).transpose(), *W); } @@ -869,7 +869,7 @@ class Input_word_embeddings for (int ngram=0; ngram<context_size; ngram++) { uscgemm(1.0, - USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), + USCMatrix<user_data_t>(W->rows(),input_words.middleRows(ngram, 1),Matrix<user_data_t,1,Dynamic>::Ones(input_words.cols())), bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), W_gradient); } @@ -909,8 +909,8 @@ class Input_word_embeddings template <typename DerivedGOut, typename DerivedIn> void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input, const MatrixBase<DerivedIn> &input_words, - double learning_rate, - double L2_reg) + user_data_t learning_rate, + user_data_t L2_reg) { int embedding_dimension = W->cols(); //W_gradient.setZero(W->rows(), W->cols()); @@ -921,7 +921,7 @@ class Input_word_embeddings for (int ngram=0; ngram<context_size; ngram++) { uscgemm(1.0, - USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), + USCMatrix<user_data_t>(W->rows(),input_words.middleRows(ngram, 1),Matrix<user_data_t,1,Dynamic>::Ones(input_words.cols())), bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), W_gradient); } @@ 
-962,10 +962,10 @@ class Input_word_embeddings template <typename DerivedGOut, typename DerivedIn> void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input, const MatrixBase<DerivedIn> &input_words, - double learning_rate, - double L2_reg, - double conditioning_constant, - double decay) + user_data_t learning_rate, + user_data_t L2_reg, + user_data_t conditioning_constant, + user_data_t decay) { int embedding_dimension = W->cols(); @@ -977,7 +977,7 @@ class Input_word_embeddings for (int ngram=0; ngram<context_size; ngram++) { uscgemm(1.0, - USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), + USCMatrix<user_data_t>(W->rows(),input_words.middleRows(ngram, 1),Matrix<user_data_t,1,Dynamic>::Ones(input_words.cols())), bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), W_gradient); } @@ -1002,7 +1002,7 @@ class Input_word_embeddings for (int item_id=0; item_id<num_items; item_id++) { - Array<double,1,Dynamic> W_current_parameter_update; + Array<user_data_t,1,Dynamic> W_current_parameter_update; int update_item = update_items[item_id]; W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+ (1.-decay)*W_gradient.row(update_item).array().square().matrix(); @@ -1035,7 +1035,7 @@ class Input_word_embeddings my_gradient.setZero(); for (int ngram=0; ngram<context_size; ngram++) uscgemm(1.0, - USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), + USCMatrix<user_data_t>(W->rows(),input_words.middleRows(ngram, 1),Matrix<user_data_t,1,Dynamic>::Ones(input_words.cols())), bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), my_gradient); } diff --git a/src/neuralLM.h b/src/neuralLM.h index f0eebd8..a9a0dfe 100644 --- a/src/neuralLM.h +++ b/src/neuralLM.h @@ -80,7 +80,7 @@ class neuralLM : public neuralNetwork, graehl::replace_digits return vocab->lookup_word(slice); } - double lookup_ngram(const int *ngram_a, int n) + user_data_t lookup_ngram(const int *ngram_a, int n) { Eigen::Matrix<int,Eigen::Dynamic,1> ngram(m->ngram_size); for (int i=0; i<m->ngram_size; ++i) @@ -100,13 +100,13 @@ class neuralLM : public neuralNetwork, graehl::replace_digits return neuralNetwork::lookup_ngram(ngram); } - double lookup_ngram(const std::vector<int> &ngram_v) + user_data_t lookup_ngram(const std::vector<int> &ngram_v) { return lookup_ngram(ngram_v.data(), ngram_v.size()); } template <typename Derived> - double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram) + user_data_t lookup_ngram(const Eigen::MatrixBase<Derived> &ngram) { return neuralNetwork::lookup_ngram(ngram); } diff --git a/src/neuralNetwork.h b/src/neuralNetwork.h index 6386a0f..c5d76dc 100644 --- a/src/neuralNetwork.h +++ b/src/neuralNetwork.h @@ -20,13 +20,13 @@ class neuralNetwork private: bool normalization; - double weight; + user_data_t weight; propagator prop; std::size_t cache_size; Eigen::Matrix<int,Dynamic,Dynamic> cache_keys; - std::vector<double> cache_values; + std::vector<user_data_t> cache_values; int cache_lookups, cache_hits; public: @@ -40,7 +40,7 @@ class neuralNetwork } void set_normalization(bool value) { normalization = value; } - void set_log_base(double value) { weight = 1./std::log(value); } + void set_log_base(user_data_t value) { weight = 1./std::log(value); } // This must be called if the underlying model is resized. 
void resize() { @@ -58,7 +58,7 @@ class neuralNetwork } template <typename Derived> - double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram) + user_data_t lookup_ngram(const Eigen::MatrixBase<Derived> &ngram) { assert (ngram.rows() == m->ngram_size); assert (ngram.cols() == 1); @@ -90,17 +90,17 @@ class neuralNetwork prop.fProp(ngram.col(0)); int output = ngram(m->ngram_size-1, 0); - double log_prob; + user_data_t log_prob; start_timer(3); if (normalization) { - Eigen::Matrix<double,Eigen::Dynamic,1> scores(m->output_vocab_size); + Eigen::Matrix<user_data_t,Eigen::Dynamic,1> scores(m->output_vocab_size); if (prop.skip_hidden) prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores); else prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); - double logz = logsum(scores.col(0)); + user_data_t logz = logsum(scores.col(0)); log_prob = weight * (scores(output, 0) - logz); } else @@ -140,15 +140,15 @@ class neuralNetwork if (normalization) { - Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> scores(m->output_vocab_size, ngram.cols()); + Eigen::Matrix<user_data_t,Eigen::Dynamic,Eigen::Dynamic> scores(m->output_vocab_size, ngram.cols()); if (prop.skip_hidden) prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores); else prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); // And softmax and loss - Matrix<double,Dynamic,Dynamic> output_probs(m->output_vocab_size, ngram.cols()); - double minibatch_log_likelihood; + Matrix<user_data_t,Dynamic,Dynamic> output_probs(m->output_vocab_size, ngram.cols()); + user_data_t minibatch_log_likelihood; SoftmaxLogLoss().fProp(scores.leftCols(ngram.cols()), ngram.row(m->ngram_size-1), output_probs, minibatch_log_likelihood); for (int j=0; j<ngram.cols(); j++) { diff --git a/src/neuralTM.h b/src/neuralTM.h index 9bb6d16..9482161 100644 --- a/src/neuralTM.h +++ b/src/neuralTM.h @@ -91,7 +91,7 @@ class neuralTM : public neuralNetwork, graehl::replace_digits return lookup_word(word, *output_vocab); } - double lookup_ngram(const int *ngram_a, int n) + user_data_t lookup_ngram(const int *ngram_a, int n) { Eigen::Matrix<int,Eigen::Dynamic,1> ngram(m->ngram_size); for (int i=0; i<m->ngram_size; i++) @@ -111,13 +111,13 @@ class neuralTM : public neuralNetwork, graehl::replace_digits return neuralNetwork::lookup_ngram(ngram); } - double lookup_ngram(const std::vector<int> &ngram_v) + user_data_t lookup_ngram(const std::vector<int> &ngram_v) { return lookup_ngram(ngram_v.data(), ngram_v.size()); } template <typename Derived> - double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram) + user_data_t lookup_ngram(const Eigen::MatrixBase<Derived> &ngram) { return neuralNetwork::lookup_ngram(ngram); } diff --git a/src/param.h b/src/param.h index 9ec486b..fe1b6d6 100644 --- a/src/param.h +++ b/src/param.h @@ -3,6 +3,12 @@ #include <string> +#ifdef NPLM_DOUBLE_PRECISION +typedef double user_data_t; +#else +typedef float user_data_t; +#endif + namespace nplm { @@ -35,26 +41,26 @@ struct param int minibatch_size; int validation_minibatch_size; int num_epochs; - double learning_rate; - double conditioning_constant; - double decay; - double adagrad_epsilon; + user_data_t learning_rate; + user_data_t conditioning_constant; + user_data_t decay; + user_data_t adagrad_epsilon; bool init_normal; - double init_range; + user_data_t init_range; int num_noise_samples; bool use_momentum; - double initial_momentum; - double 
final_momentum; + user_data_t initial_momentum; + user_data_t final_momentum; - double L2_reg; + user_data_t L2_reg; double input_dropout; int null_index; bool normalization; - double normalization_init; + user_data_t normalization_init; int num_threads; int debug; diff --git a/src/propagator.h b/src/propagator.h index 6344f2f..45d4018 100644 --- a/src/propagator.h +++ b/src/propagator.h @@ -71,7 +71,7 @@ class propagator { else { int n_inputs = first_hidden_linear_node.param->n_inputs(); - USCMatrix<double> sparse_data; + USCMatrix<user_data_t> sparse_data; input_layer_node.param->munge(data, sparse_data); start_timer(1); @@ -101,12 +101,12 @@ class propagator { template <typename DerivedIn, typename DerivedOut> void bProp(const MatrixBase<DerivedIn> &data, const MatrixBase<DerivedOut> &output, - double learning_rate, - double momentum, - double L2_reg, + user_data_t learning_rate, + user_data_t momentum, + user_data_t L2_reg, std::string ¶meter_update, - double conditioning_constant, - double decay) + user_data_t conditioning_constant, + user_data_t decay) { // Output embedding layer @@ -153,12 +153,12 @@ class propagator { void bProp(const MatrixBase<DerivedIn> &data, const MatrixBase<DerivedOutI> &samples, const MatrixBase<DerivedOutV> &weights, - double learning_rate, - double momentum, - double L2_reg, + user_data_t learning_rate, + user_data_t momentum, + user_data_t L2_reg, std::string ¶meter_update, - double conditioning_constant, - double decay) + user_data_t conditioning_constant, + user_data_t decay) { // Output embedding layer @@ -210,10 +210,10 @@ class propagator { private: template <typename DerivedIn> void bPropRest(const MatrixBase<DerivedIn> &data, - double learning_rate, double momentum, double L2_reg, + user_data_t learning_rate, user_data_t momentum, user_data_t L2_reg, std::string ¶meter_update, - double conditioning_constant, - double decay) + user_data_t conditioning_constant, + user_data_t decay) { // Second hidden layer diff --git a/src/testNeuralLM.cpp b/src/testNeuralLM.cpp index abaab34..a2aa5e3 100644 --- a/src/testNeuralLM.cpp +++ b/src/testNeuralLM.cpp @@ -20,7 +20,7 @@ using namespace Eigen; using namespace nplm; void score(neuralLM &lm, int minibatch_size, vector<int>& start, vector< vector<int> > &ngrams, - vector<double> &out) { + vector<user_data_t> &out) { if (ngrams.size() == 0) return; int ngram_size = ngrams[0].size(); @@ -29,7 +29,7 @@ void score(neuralLM &lm, int minibatch_size, vector<int>& start, vector< vector< // Score one n-gram at a time. This is how the LM would be queried from a decoder. for (int sent_id=0; sent_id<start.size()-1; sent_id++) { - double sent_log_prob = 0.0; + user_data_t sent_log_prob = 0.0; for (int j=start[sent_id]; j<start[sent_id+1]; j++) sent_log_prob += lm.lookup_ngram(ngrams[j]); out.push_back(sent_log_prob); @@ -38,7 +38,7 @@ void score(neuralLM &lm, int minibatch_size, vector<int>& start, vector< vector< else { // Score a whole minibatch at a time. 
- Matrix<double,1,Dynamic> log_probs(ngrams.size()); + Matrix<user_data_t,1,Dynamic> log_probs(ngrams.size()); Matrix<int,Dynamic,Dynamic> minibatch(ngram_size, minibatch_size); minibatch.setZero(); @@ -52,7 +52,7 @@ void score(neuralLM &lm, int minibatch_size, vector<int>& start, vector< vector< for (int sent_id=0; sent_id<start.size()-1; sent_id++) { - double sent_log_prob = 0.0; + user_data_t sent_log_prob = 0.0; for (int j=start[sent_id]; j<start[sent_id+1]; j++) sent_log_prob += log_probs[j]; out.push_back(sent_log_prob); @@ -157,7 +157,7 @@ int main (int argc, char *argv[]) start.push_back(ngrams.size()); int num_threads = 1; - vector< vector<double> > sent_log_probs(num_threads); + vector< vector<user_data_t> > sent_log_probs(num_threads); /* // Test thread safety @@ -169,7 +169,7 @@ int main (int argc, char *argv[]) */ score(lm, minibatch_size, start, ngrams, sent_log_probs[0]); - vector<double> log_likelihood(num_threads); + vector<user_data_t> log_likelihood(num_threads); std::fill(log_likelihood.begin(), log_likelihood.end(), 0.0); for (int i=0; i<sent_log_probs[0].size(); i++) { for (int t=0; t<num_threads; t++) diff --git a/src/testNeuralNetwork.cpp b/src/testNeuralNetwork.cpp index 58b9165..87e4a65 100644 --- a/src/testNeuralNetwork.cpp +++ b/src/testNeuralNetwork.cpp @@ -98,10 +98,10 @@ int main (int argc, char *argv[]) int num_batches = (test_data_size-1)/myParam.minibatch_size + 1; cerr<<"Number of test minibatches: "<<num_batches<<endl; - double log_likelihood = 0.0; + user_data_t log_likelihood = 0.0; - Matrix<double,Dynamic,Dynamic> scores(nn.output_vocab_size, myParam.minibatch_size); - Matrix<double,Dynamic,Dynamic> output_probs(nn.output_vocab_size, myParam.minibatch_size); + Matrix<user_data_t,Dynamic,Dynamic> scores(nn.output_vocab_size, myParam.minibatch_size); + Matrix<user_data_t,Dynamic,Dynamic> output_probs(nn.output_vocab_size, myParam.minibatch_size); for (int batch = 0; batch < num_batches; batch++) { @@ -122,7 +122,7 @@ int main (int argc, char *argv[]) // And softmax and loss - double minibatch_log_likelihood; + user_data_t minibatch_log_likelihood; SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size), minibatch.row(myParam.ngram_size-1), output_probs, diff --git a/src/trainNeuralNetwork.cpp b/src/trainNeuralNetwork.cpp index 43e87f1..632391a 100644 --- a/src/trainNeuralNetwork.cpp +++ b/src/trainNeuralNetwork.cpp @@ -40,7 +40,7 @@ using namespace boost::random; using namespace nplm; namespace ip = boost::interprocess; -typedef unordered_map<Matrix<int,Dynamic,1>, double> vector_map; +typedef unordered_map<Matrix<int,Dynamic,1>, user_data_t> vector_map; typedef ip::allocator<int, ip::managed_mapped_file::segment_manager> intAllocator; typedef ip::vector<int, intAllocator> vec; @@ -50,12 +50,12 @@ typedef ip::allocator<vec, ip::managed_mapped_file::segment_manager> vecAllocato typedef long long int data_size_t; // training data can easily exceed 2G instances -void compute_validation_perplexity(int ngram_size, int output_vocab_size, int validation_minibatch_size, int validation_data_size, int num_validation_batches, param & myParam, propagator & prop_validation, Map< Matrix<int,Dynamic,Dynamic> > & validation_data, double & current_learning_rate, double & current_validation_ll) +void compute_validation_perplexity(int ngram_size, int output_vocab_size, int validation_minibatch_size, int validation_data_size, int num_validation_batches, param & myParam, propagator & prop_validation, Map< Matrix<int,Dynamic,Dynamic> > & validation_data, user_data_t & 
current_learning_rate, user_data_t & current_validation_ll) { - double log_likelihood = 0.0; + user_data_t log_likelihood = 0.0; - Matrix<double,Dynamic,Dynamic> scores(output_vocab_size, validation_minibatch_size); - Matrix<double,Dynamic,Dynamic> output_probs(output_vocab_size, validation_minibatch_size); + Matrix<user_data_t,Dynamic,Dynamic> scores(output_vocab_size, validation_minibatch_size); + Matrix<user_data_t,Dynamic,Dynamic> output_probs(output_vocab_size, validation_minibatch_size); Matrix<int,Dynamic,Dynamic> minibatch(ngram_size, validation_minibatch_size); for (int validation_batch =0;validation_batch < num_validation_batches;validation_batch++) @@ -76,7 +76,7 @@ void compute_validation_perplexity(int ngram_size, int output_vocab_size, int va stop_timer(4); // And softmax and loss. Be careful of short minibatch - double minibatch_log_likelihood; + user_data_t minibatch_log_likelihood; start_timer(5); SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size), minibatch.row(ngram_size-1), @@ -113,11 +113,11 @@ int main(int argc, char** argv) ValueArg<int> num_threads("", "num_threads", "Number of threads. Default: maximum.", false, 0, "int", cmd); - ValueArg<double> final_momentum("", "final_momentum", "Final value of momentum. Default: 0.9.", false, 0.9, "double", cmd); - ValueArg<double> initial_momentum("", "initial_momentum", "Initial value of momentum. Default: 0.9.", false, 0.9, "double", cmd); + ValueArg<user_data_t> final_momentum("", "final_momentum", "Final value of momentum. Default: 0.9.", false, 0.9, "user_data_t", cmd); + ValueArg<user_data_t> initial_momentum("", "initial_momentum", "Initial value of momentum. Default: 0.9.", false, 0.9, "user_data_t", cmd); ValueArg<bool> use_momentum("", "use_momentum", "Use momentum (hidden layer weights only). 1 = yes, 0 = no. Default: 0.", false, 0, "bool", cmd); - ValueArg<double> normalization_init("", "normalization_init", "Initial normalization parameter. Default: 0.", false, 0.0, "double", cmd); + ValueArg<user_data_t> normalization_init("", "normalization_init", "Initial normalization parameter. Default: 0.", false, 0.0, "user_data_t", cmd); ValueArg<bool> normalization("", "normalization", "Learn individual normalization factors during training. 1 = yes, 0 = no. Default: 0.", false, 0, "bool", cmd); ValueArg<bool> mmap_file("", "mmap_file", "Use memory mapped files. This is useful if the entire data cannot fit in memory. prepareNeuralLM can generate memory mapped files", false, 0, "bool", cmd); @@ -126,24 +126,24 @@ int main(int argc, char** argv) ValueArg<int> num_noise_samples("", "num_noise_samples", "Number of noise samples for noise-contrastive estimation. Default: 100.", false, 100, "int", cmd); - ValueArg<double> L2_reg("", "L2_reg", "L2 regularization strength (hidden layer weights only). Default: 0.", false, 0.0, "double", cmd); + ValueArg<user_data_t> L2_reg("", "L2_reg", "L2 regularization strength (hidden layer weights only). Default: 0.", false, 0.0, "user_data_t", cmd); - ValueArg<double> input_dropout("", "input_dropout", "Probability of retaining input word. Values between 0 (all input is ignored) to 1 (no dropout). Default: 1.", false, 1, "double", cmd); + ValueArg<double> input_dropout("", "input_dropout", "Probability of retaining input word. Values between 0 (all input is ignored) to 1 (no dropout). Default: 1.", false, 1, "user_data_t", cmd); ValueArg<int> null_index("", "null_index", "Index of null word. 
Used as special (dropped out) token for input dropout.", false, 0, "int", cmd); - ValueArg<double> learning_rate("", "learning_rate", "Learning rate for stochastic gradient ascent. Default: 1.", false, 1., "double", cmd); + ValueArg<user_data_t> learning_rate("", "learning_rate", "Learning rate for stochastic gradient ascent. Default: 1.", false, 1., "user_data_t", cmd); - ValueArg<double> conditioning_constant("", "conditioning_constant", "Constant to condition the RMS of the expected square of the gradient in ADADELTA. Default: 10E-3.", false, 10E-3, "double", cmd); + ValueArg<user_data_t> conditioning_constant("", "conditioning_constant", "Constant to condition the RMS of the expected square of the gradient in ADADELTA. Default: 10E-3.", false, 10E-3, "user_data_t", cmd); - ValueArg<double> decay("", "decay", "Decay for ADADELTA. Default: 0.95", false, 0.95, "double", cmd); - ValueArg<double> adagrad_epsilon("", "adagrad_epsilon", "Constant to initialize the L2 squared norm of the gradients with.\ - Default: 10E-3", false, 10E-3, "double", cmd); + ValueArg<user_data_t> decay("", "decay", "Decay for ADADELTA. Default: 0.95", false, 0.95, "user_data_t", cmd); + ValueArg<user_data_t> adagrad_epsilon("", "adagrad_epsilon", "Constant to initialize the L2 squared norm of the gradients with.\ + Default: 10E-3", false, 10E-3, "user_data_t", cmd); ValueArg<int> validation_minibatch_size("", "validation_minibatch_size", "Minibatch size for validation. Default: 64.", false, 64, "int", cmd); ValueArg<int> minibatch_size("", "minibatch_size", "Minibatch size (for training). Default: 1000.", false, 1000, "int", cmd); ValueArg<int> num_epochs("", "num_epochs", "Number of epochs. Default: 10.", false, 10, "int", cmd); - ValueArg<double> init_range("", "init_range", "Maximum (of uniform) or standard deviation (of normal) for initialization. Default: 0.01", false, 0.01, "double", cmd); + ValueArg<user_data_t> init_range("", "init_range", "Maximum (of uniform) or standard deviation (of normal) for initialization. Default: 0.01", false, 0.01, "user_data_t", cmd); ValueArg<bool> init_normal("", "init_normal", "Initialize parameters from a normal distribution. 1 = normal, 0 = uniform. Default: 0.", false, 0, "bool", cmd); ValueArg<string> loss_function("", "loss_function", "Loss function (log, nce). 
Default: nce.", false, "nce", "string", cmd); @@ -553,10 +553,10 @@ int main(int argc, char** argv) cerr<<"Number of validation minibatches: "<<num_validation_batches<<endl; } - double current_momentum = myParam.initial_momentum; - double momentum_delta = (myParam.final_momentum - myParam.initial_momentum)/(myParam.num_epochs-1); - double current_learning_rate = myParam.learning_rate; - double current_validation_ll = 0.0; + user_data_t current_momentum = myParam.initial_momentum; + user_data_t momentum_delta = (myParam.final_momentum - myParam.initial_momentum)/(myParam.num_epochs-1); + user_data_t current_learning_rate = myParam.learning_rate; + user_data_t current_validation_ll = 0.0; int ngram_size = myParam.ngram_size; int input_vocab_size = myParam.input_vocab_size; @@ -589,7 +589,7 @@ int main(int argc, char** argv) cerr << "Training minibatches: "; - double log_likelihood = 0.0; + user_data_t log_likelihood = 0.0; int num_samples = 0; if (loss_function == LogLoss) @@ -597,10 +597,10 @@ int main(int argc, char** argv) else if (loss_function == NCELoss) num_samples = 1+num_noise_samples; - Matrix<double,Dynamic,Dynamic> minibatch_weights(num_samples, minibatch_size); + Matrix<user_data_t,Dynamic,Dynamic> minibatch_weights(num_samples, minibatch_size); Matrix<int,Dynamic,Dynamic> minibatch_samples(num_samples, minibatch_size); - Matrix<double,Dynamic,Dynamic> scores(num_samples, minibatch_size); - Matrix<double,Dynamic,Dynamic> probs(num_samples, minibatch_size); + Matrix<user_data_t,Dynamic,Dynamic> scores(num_samples, minibatch_size); + Matrix<user_data_t,Dynamic,Dynamic> probs(num_samples, minibatch_size); for(data_size_t batch=0;batch<num_batches;batch++) { @@ -640,7 +640,7 @@ int main(int argc, char** argv) minibatch = training_data.middleCols(minibatch_start_index, current_minibatch_size); } #endif - double adjusted_learning_rate = current_learning_rate/minibatch_size; + user_data_t adjusted_learning_rate = current_learning_rate/minibatch_size; //cerr<<"Adjusted learning rate: "<<adjusted_learning_rate<<endl; /* @@ -701,7 +701,7 @@ int main(int argc, char** argv) } } - double minibatch_log_likelihood; + user_data_t minibatch_log_likelihood; start_timer(5); softmax_loss.fProp(scores.leftCols(current_minibatch_size), minibatch_samples, @@ -747,7 +747,7 @@ int main(int argc, char** argv) prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores); stop_timer(4); - double minibatch_log_likelihood; + user_data_t minibatch_log_likelihood; start_timer(5); SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size), minibatch.row(ngram_size-1), diff --git a/src/util.cpp b/src/util.cpp index f6a5779..825c1f6 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -147,7 +147,7 @@ void readDataFile(const string &filename, int &ngram_size, vector<int> &data, in DATAIN.close(); } -double logadd(double x, double y) +user_data_t logadd(user_data_t x, user_data_t y) { if (x > y) return x + log1p(std::exp(y-x)); @@ -20,6 +20,12 @@ #include "maybe_omp.h" +#ifdef NPLM_DOUBLE_PRECISION +typedef double user_data_t; +#else +typedef float user_data_t; +#endif + // Make matrices hashable namespace Eigen { @@ -86,7 +92,7 @@ inline void intgerize(std::vector<std::string> &ngram,std::vector<int> &int_ngra template <typename Derived> void initMatrix(boost::random::mt19937 &engine, const Eigen::MatrixBase<Derived> &p_const, - bool init_normal, double range) + bool init_normal, user_data_t range) { UNCONST(Derived, p_const, p); if (init_normal == 0) @@ -105,7 +111,7 @@ void 
initMatrix(boost::random::mt19937 &engine, else // initialize with gaussian distribution with mean 0 and stdev range { - boost::random::normal_distribution<double> unif_normal(0., range); + boost::random::normal_distribution<user_data_t> unif_normal(0., range); for (int i = 0; i < p.rows(); i++) { for (int j = 0; j < p.cols(); j++) @@ -119,7 +125,7 @@ void initMatrix(boost::random::mt19937 &engine, template <typename Derived> void initBias(boost::random::mt19937 &engine, const Eigen::MatrixBase<Derived> &p_const, - bool init_normal, double range) + bool init_normal, user_data_t range) { UNCONST(Derived, p_const, p); if (init_normal == 0) @@ -135,7 +141,7 @@ void initBias(boost::random::mt19937 &engine, else // initialize with gaussian distribution with mean 0 and stdev range { - boost::random::normal_distribution<double> unif_normal(0., range); + boost::random::normal_distribution<user_data_t> unif_normal(0., range); for (int i = 0; i < p.size(); i++) { p(i) = unif_normal(engine); @@ -234,11 +240,11 @@ void writeMatrix(const Eigen::MatrixBase<Derived> ¶m, std::ofstream &OUT) } template <typename Derived> -double logsum(const Eigen::MatrixBase<Derived> &v) +user_data_t logsum(const Eigen::MatrixBase<Derived> &v) { int mi; - double m = v.maxCoeff(&mi); - double logz = 0.0; + user_data_t m = v.maxCoeff(&mi); + user_data_t logz = 0.0; for (int i=0; i<v.rows(); i++) if (i != mi) logz += std::exp(v(i) - m); @@ -246,7 +252,7 @@ double logsum(const Eigen::MatrixBase<Derived> &v) return logz; } -double logadd(double x, double y); +user_data_t logadd(user_data_t x, user_data_t y); #ifdef USE_CHRONO class Timer |
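
As a usage illustration of the same typedef in the numerics helpers changed above, the sketch below exercises a standalone logadd in the selected precision. The x > y branch mirrors the hunk in src/util.cpp; the else branch is assumed by symmetry, since the corresponding lines are cut off in this view, and the demo main() is not part of the commit.

// Illustrative only: a standalone logadd exercised in the compile-time precision.
// The x > y branch mirrors src/util.cpp above; the else branch is an assumption
// (symmetric form), since the corresponding hunk is truncated in this view.
#include <cmath>
#include <iostream>

#ifdef NPLM_DOUBLE_PRECISION
typedef double user_data_t;
#else
typedef float user_data_t;
#endif

user_data_t logadd(user_data_t x, user_data_t y)
{
    if (x > y)
        return x + std::log1p(std::exp(y - x));
    else
        return y + std::log1p(std::exp(x - y));  // assumed symmetric branch
}

int main()
{
    user_data_t a = std::log((user_data_t) 0.25);
    user_data_t b = std::log((user_data_t) 0.75);
    // log(0.25 + 0.75) should come out as 0, up to the selected precision
    std::cout << "logadd(log 0.25, log 0.75) = " << logadd(a, b) << std::endl;
    return 0;
}
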