| author | graehl <graehl@gmail.com> | 2015-06-25 21:38:54 +0300 |
|---|---|---|
| committer | graehl <graehl@gmail.com> | 2015-06-25 22:13:53 +0300 |
| commit | 363c73cacf94d965a8759ae8b55f56d8c1c29bb1 (patch) | |
| tree | dd281ad350c508d766327988152a900acc851b95 | |
| parent | 5fbf9611d24b6fd80c5839fe547f1edb141fa162 (diff) | |
tab fix
| -rw-r--r-- | .gitignore | 1 |
| -rw-r--r-- | src/model.cpp | 480 |
| -rw-r--r-- | src/neuralClasses.h | 1624 |

3 files changed, 1053 insertions, 1052 deletions
@@ -9,3 +9,4 @@ src/testNeuralLM src/testNeuralNetwork src/trainNeuralNetwork .history +src/make.sh diff --git a/src/model.cpp b/src/model.cpp index 919e005..db7f006 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -13,295 +13,295 @@ namespace nplm { void model::resize(int ngram_size, - int input_vocab_size, - int output_vocab_size, - int input_embedding_dimension, - int num_hidden, - int output_embedding_dimension) + int input_vocab_size, + int output_vocab_size, + int input_embedding_dimension, + int num_hidden, + int output_embedding_dimension) { - input_layer.resize(input_vocab_size, input_embedding_dimension, ngram_size-1); - if (num_hidden == 0) - { - first_hidden_linear.resize(output_embedding_dimension, input_embedding_dimension*(ngram_size-1)); - first_hidden_activation.resize(output_embedding_dimension); - second_hidden_linear.resize(1,1); - second_hidden_activation.resize(1); - } - else - { - first_hidden_linear.resize(num_hidden, input_embedding_dimension*(ngram_size-1)); - first_hidden_activation.resize(num_hidden); - second_hidden_linear.resize(output_embedding_dimension, num_hidden); - second_hidden_activation.resize(output_embedding_dimension); - } - output_layer.resize(output_vocab_size, output_embedding_dimension); - this->ngram_size = ngram_size; - this->input_vocab_size = input_vocab_size; - this->output_vocab_size = output_vocab_size; - this->input_embedding_dimension = input_embedding_dimension; - this->num_hidden = num_hidden; - this->output_embedding_dimension = output_embedding_dimension; - premultiplied = false; + input_layer.resize(input_vocab_size, input_embedding_dimension, ngram_size-1); + if (num_hidden == 0) + { + first_hidden_linear.resize(output_embedding_dimension, input_embedding_dimension*(ngram_size-1)); + first_hidden_activation.resize(output_embedding_dimension); + second_hidden_linear.resize(1,1); + second_hidden_activation.resize(1); + } + else + { + first_hidden_linear.resize(num_hidden, input_embedding_dimension*(ngram_size-1)); + first_hidden_activation.resize(num_hidden); + second_hidden_linear.resize(output_embedding_dimension, num_hidden); + second_hidden_activation.resize(output_embedding_dimension); + } + output_layer.resize(output_vocab_size, output_embedding_dimension); + this->ngram_size = ngram_size; + this->input_vocab_size = input_vocab_size; + this->output_vocab_size = output_vocab_size; + this->input_embedding_dimension = input_embedding_dimension; + this->num_hidden = num_hidden; + this->output_embedding_dimension = output_embedding_dimension; + premultiplied = false; } - + void model::initialize(boost::random::mt19937 &init_engine, - bool init_normal, - double init_range, - double init_bias, - string ¶meter_update, - double adagrad_epsilon) + bool init_normal, + double init_range, + double init_bias, + string ¶meter_update, + double adagrad_epsilon) { - input_layer.initialize(init_engine, - init_normal, - init_range, - parameter_update, - adagrad_epsilon); - output_layer.initialize(init_engine, - init_normal, - init_range, - init_bias, - parameter_update, - adagrad_epsilon); - first_hidden_linear.initialize(init_engine, - init_normal, - init_range, - parameter_update, - adagrad_epsilon); - second_hidden_linear.initialize(init_engine, - init_normal, - init_range, - parameter_update, - adagrad_epsilon); + input_layer.initialize(init_engine, + init_normal, + init_range, + parameter_update, + adagrad_epsilon); + output_layer.initialize(init_engine, + init_normal, + init_range, + init_bias, + parameter_update, + adagrad_epsilon); + 
first_hidden_linear.initialize(init_engine, + init_normal, + init_range, + parameter_update, + adagrad_epsilon); + second_hidden_linear.initialize(init_engine, + init_normal, + init_range, + parameter_update, + adagrad_epsilon); } void model::premultiply() { - // Since input and first_hidden_linear are both linear, - // we can multiply them into a single linear layer *if* we are not training - int context_size = ngram_size-1; - Matrix<double,Dynamic,Dynamic> U = first_hidden_linear.U; - if (num_hidden == 0) - { - first_hidden_linear.U.resize(output_embedding_dimension, input_vocab_size * context_size); - } - else - { - first_hidden_linear.U.resize(num_hidden, input_vocab_size * context_size); - } - for (int i=0; i<context_size; i++) - first_hidden_linear.U.middleCols(i*input_vocab_size, input_vocab_size) = U.middleCols(i*input_embedding_dimension, input_embedding_dimension) * input_layer.W->transpose(); - input_layer.W->resize(1,1); // try to save some memory - premultiplied = true; + // Since input and first_hidden_linear are both linear, + // we can multiply them into a single linear layer *if* we are not training + int context_size = ngram_size-1; + Matrix<double,Dynamic,Dynamic> U = first_hidden_linear.U; + if (num_hidden == 0) + { + first_hidden_linear.U.resize(output_embedding_dimension, input_vocab_size * context_size); + } + else + { + first_hidden_linear.U.resize(num_hidden, input_vocab_size * context_size); + } + for (int i=0; i<context_size; i++) + first_hidden_linear.U.middleCols(i*input_vocab_size, input_vocab_size) = U.middleCols(i*input_embedding_dimension, input_embedding_dimension) * input_layer.W->transpose(); + input_layer.W->resize(1,1); // try to save some memory + premultiplied = true; } void model::readConfig(ifstream &config_file) { - string line; - vector<string> fields; - int ngram_size, vocab_size, input_embedding_dimension, num_hidden, output_embedding_dimension; - activation_function_type activation_function = this->activation_function; - while (getline(config_file, line) && line != "") + string line; + vector<string> fields; + int ngram_size, vocab_size, input_embedding_dimension, num_hidden, output_embedding_dimension; + activation_function_type activation_function = this->activation_function; + while (getline(config_file, line) && line != "") + { + splitBySpace(line, fields); + if (fields[0] == "ngram_size") + ngram_size = lexical_cast<int>(fields[1]); + else if (fields[0] == "vocab_size") + input_vocab_size = output_vocab_size = lexical_cast<int>(fields[1]); + else if (fields[0] == "input_vocab_size") + input_vocab_size = lexical_cast<int>(fields[1]); + else if (fields[0] == "output_vocab_size") + output_vocab_size = lexical_cast<int>(fields[1]); + else if (fields[0] == "input_embedding_dimension") + input_embedding_dimension = lexical_cast<int>(fields[1]); + else if (fields[0] == "num_hidden") + num_hidden = lexical_cast<int>(fields[1]); + else if (fields[0] == "output_embedding_dimension") + output_embedding_dimension = lexical_cast<int>(fields[1]); + else if (fields[0] == "activation_function") + activation_function = string_to_activation_function(fields[1]); + else if (fields[0] == "version") { - splitBySpace(line, fields); - if (fields[0] == "ngram_size") - ngram_size = lexical_cast<int>(fields[1]); - else if (fields[0] == "vocab_size") - input_vocab_size = output_vocab_size = lexical_cast<int>(fields[1]); - else if (fields[0] == "input_vocab_size") - input_vocab_size = lexical_cast<int>(fields[1]); - else if (fields[0] == "output_vocab_size") - 
output_vocab_size = lexical_cast<int>(fields[1]); - else if (fields[0] == "input_embedding_dimension") - input_embedding_dimension = lexical_cast<int>(fields[1]); - else if (fields[0] == "num_hidden") - num_hidden = lexical_cast<int>(fields[1]); - else if (fields[0] == "output_embedding_dimension") - output_embedding_dimension = lexical_cast<int>(fields[1]); - else if (fields[0] == "activation_function") - activation_function = string_to_activation_function(fields[1]); - else if (fields[0] == "version") - { - int version = lexical_cast<int>(fields[1]); - if (version != 1) - { - cerr << "error: file format mismatch (expected 1, found " << version << ")" << endl; - exit(1); - } - } - else - cerr << "warning: unrecognized field in config: " << fields[0] << endl; + int version = lexical_cast<int>(fields[1]); + if (version != 1) + { + cerr << "error: file format mismatch (expected 1, found " << version << ")" << endl; + exit(1); + } } - resize(ngram_size, - input_vocab_size, - output_vocab_size, - input_embedding_dimension, - num_hidden, - output_embedding_dimension); - set_activation_function(activation_function); + else + cerr << "warning: unrecognized field in config: " << fields[0] << endl; + } + resize(ngram_size, + input_vocab_size, + output_vocab_size, + input_embedding_dimension, + num_hidden, + output_embedding_dimension); + set_activation_function(activation_function); } void model::readConfig(const string &filename) { - ifstream config_file(filename.c_str()); - if (!config_file) - { - cerr << "error: could not open config file " << filename << endl; - exit(1); - } - readConfig(config_file); - config_file.close(); + ifstream config_file(filename.c_str()); + if (!config_file) + { + cerr << "error: could not open config file " << filename << endl; + exit(1); + } + readConfig(config_file); + config_file.close(); } - + void model::read(const string &filename) { - vector<string> input_words; - vector<string> output_words; - read(filename, input_words, output_words); + vector<string> input_words; + vector<string> output_words; + read(filename, input_words, output_words); } void model::read(const string &filename, vector<string> &words) { - vector<string> output_words; - read(filename, words, output_words); + vector<string> output_words; + read(filename, words, output_words); } void model::read(const string &filename, vector<string> &input_words, vector<string> &output_words) { - ifstream file(filename.c_str()); - if (!file) throw runtime_error("Could not open file " + filename); - - param myParam; - string line; - - while (getline(file, line)) + ifstream file(filename.c_str()); + if (!file) throw runtime_error("Could not open file " + filename); + + param myParam; + string line; + + while (getline(file, line)) + { + if (line == "\\config") + { + readConfig(file); + } + + else if (line == "\\vocab") + { + input_words.clear(); + readWordsFile(file, input_words); + output_words = input_words; + } + + else if (line == "\\input_vocab") { - if (line == "\\config") - { - readConfig(file); - } - - else if (line == "\\vocab") - { - input_words.clear(); - readWordsFile(file, input_words); - output_words = input_words; - } - - else if (line == "\\input_vocab") - { - input_words.clear(); - readWordsFile(file, input_words); - } - - else if (line == "\\output_vocab") - { - output_words.clear(); - readWordsFile(file, output_words); - } - - else if (line == "\\input_embeddings") - input_layer.read(file); - else if (line == "\\hidden_weights 1") - first_hidden_linear.read_weights(file); - else if (line == 
"\\hidden_biases 1") - first_hidden_linear.read_biases (file); - else if (line == "\\hidden_weights 2") - second_hidden_linear.read_weights(file); - else if (line == "\\hidden_biases 2") - second_hidden_linear.read_biases (file); - else if (line == "\\output_weights") - output_layer.read_weights(file); - else if (line == "\\output_biases") - output_layer.read_biases(file); - else if (line == "\\end") - break; - else if (line == "") - continue; - else - { - cerr << "warning: unrecognized section: " << line << endl; - // skip over section - while (getline(file, line) && line != "") { } - } + input_words.clear(); + readWordsFile(file, input_words); } - file.close(); + + else if (line == "\\output_vocab") + { + output_words.clear(); + readWordsFile(file, output_words); + } + + else if (line == "\\input_embeddings") + input_layer.read(file); + else if (line == "\\hidden_weights 1") + first_hidden_linear.read_weights(file); + else if (line == "\\hidden_biases 1") + first_hidden_linear.read_biases (file); + else if (line == "\\hidden_weights 2") + second_hidden_linear.read_weights(file); + else if (line == "\\hidden_biases 2") + second_hidden_linear.read_biases (file); + else if (line == "\\output_weights") + output_layer.read_weights(file); + else if (line == "\\output_biases") + output_layer.read_biases(file); + else if (line == "\\end") + break; + else if (line == "") + continue; + else + { + cerr << "warning: unrecognized section: " << line << endl; + // skip over section + while (getline(file, line) && line != "") { } + } + } + file.close(); } void model::write(const string &filename, const vector<string> &input_words, const vector<string> &output_words) -{ - write(filename, &input_words, &output_words); +{ + write(filename, &input_words, &output_words); } void model::write(const string &filename, const vector<string> &words) -{ - write(filename, &words, NULL); +{ + write(filename, &words, NULL); } -void model::write(const string &filename) -{ - write(filename, NULL, NULL); +void model::write(const string &filename) +{ + write(filename, NULL, NULL); } void model::write(const string &filename, const vector<string> *input_pwords, const vector<string> *output_pwords) { - ofstream file(filename.c_str()); - if (!file) throw runtime_error("Could not open file " + filename); - - file << "\\config" << endl; - file << "version 1" << endl; - file << "ngram_size " << ngram_size << endl; - file << "input_vocab_size " << input_vocab_size << endl; - file << "output_vocab_size " << output_vocab_size << endl; - file << "input_embedding_dimension " << input_embedding_dimension << endl; - file << "num_hidden " << num_hidden << endl; - file << "output_embedding_dimension " << output_embedding_dimension << endl; - file << "activation_function " << activation_function_to_string(activation_function) << endl; - file << endl; - - if (input_pwords) - { - file << "\\input_vocab" << endl; - writeWordsFile(*input_pwords, file); - file << endl; - } + ofstream file(filename.c_str()); + if (!file) throw runtime_error("Could not open file " + filename); - if (output_pwords) - { - file << "\\output_vocab" << endl; - writeWordsFile(*output_pwords, file); - file << endl; - } + file << "\\config" << endl; + file << "version 1" << endl; + file << "ngram_size " << ngram_size << endl; + file << "input_vocab_size " << input_vocab_size << endl; + file << "output_vocab_size " << output_vocab_size << endl; + file << "input_embedding_dimension " << input_embedding_dimension << endl; + file << "num_hidden " << num_hidden << endl; + 
file << "output_embedding_dimension " << output_embedding_dimension << endl; + file << "activation_function " << activation_function_to_string(activation_function) << endl; + file << endl; - file << "\\input_embeddings" << endl; - input_layer.write(file); - file << endl; - - file << "\\hidden_weights 1" << endl; - first_hidden_linear.write_weights(file); + if (input_pwords) + { + file << "\\input_vocab" << endl; + writeWordsFile(*input_pwords, file); file << endl; + } - file << "\\hidden_biases 1" << endl; - first_hidden_linear.write_biases(file); - file <<endl; - - file << "\\hidden_weights 2" << endl; - second_hidden_linear.write_weights(file); + if (output_pwords) + { + file << "\\output_vocab" << endl; + writeWordsFile(*output_pwords, file); file << endl; + } - file << "\\hidden_biases 2" << endl; - second_hidden_linear.write_biases(file); - file << endl; - - file << "\\output_weights" << endl; - output_layer.write_weights(file); - file << endl; - - file << "\\output_biases" << endl; - output_layer.write_biases(file); - file << endl; - - file << "\\end" << endl; - file.close(); + file << "\\input_embeddings" << endl; + input_layer.write(file); + file << endl; + + file << "\\hidden_weights 1" << endl; + first_hidden_linear.write_weights(file); + file << endl; + + file << "\\hidden_biases 1" << endl; + first_hidden_linear.write_biases(file); + file <<endl; + + file << "\\hidden_weights 2" << endl; + second_hidden_linear.write_weights(file); + file << endl; + + file << "\\hidden_biases 2" << endl; + second_hidden_linear.write_biases(file); + file << endl; + + file << "\\output_weights" << endl; + output_layer.write_weights(file); + file << endl; + + file << "\\output_biases" << endl; + output_layer.write_biases(file); + file << endl; + + file << "\\end" << endl; + file.close(); } diff --git a/src/neuralClasses.h b/src/neuralClasses.h index ee7c3f0..7c86694 100644 --- a/src/neuralClasses.h +++ b/src/neuralClasses.h @@ -43,36 +43,36 @@ struct Clipper{ class Linear_layer { - private: - Matrix<double,Dynamic,Dynamic> U; - Matrix<double,Dynamic,Dynamic> U_gradient; - Matrix<double,Dynamic,Dynamic> U_velocity; - Matrix<double,Dynamic,Dynamic> U_running_gradient; - Matrix<double,Dynamic,Dynamic> U_running_parameter_update; - // Biases - Matrix<double,Dynamic,1> b; - Matrix<double,Dynamic,1> b_velocity; - Matrix<double,Dynamic,1> b_running_gradient; - Matrix<double,Dynamic,1> b_running_parameter_update; - Matrix<double,Dynamic,1> b_gradient; - - friend class model; - - public: + private: + Matrix<double,Dynamic,Dynamic> U; + Matrix<double,Dynamic,Dynamic> U_gradient; + Matrix<double,Dynamic,Dynamic> U_velocity; + Matrix<double,Dynamic,Dynamic> U_running_gradient; + Matrix<double,Dynamic,Dynamic> U_running_parameter_update; + // Biases + Matrix<double,Dynamic,1> b; + Matrix<double,Dynamic,1> b_velocity; + Matrix<double,Dynamic,1> b_running_gradient; + Matrix<double,Dynamic,1> b_running_parameter_update; + Matrix<double,Dynamic,1> b_gradient; + + friend class model; + + public: Linear_layer() { } - Linear_layer(int rows, int cols) { resize(rows, cols); } + Linear_layer(int rows, int cols) { resize(rows, cols); } void resize(int rows, int cols) { - U.setZero(rows, cols); - U_gradient.setZero(rows, cols); - //U_running_gradient.setZero(rows, cols); - //U_running_parameter_updates.setZero(rows, cols); - //U_velocity.setZero(rows, cols); - b.resize(rows); - b_gradient.setZero(rows); - //b_running_gradient.resize(rows); - //b_velocity.resize(rows); + U.setZero(rows, cols); + U_gradient.setZero(rows, 
cols); + //U_running_gradient.setZero(rows, cols); + //U_running_parameter_updates.setZero(rows, cols); + //U_velocity.setZero(rows, cols); + b.resize(rows); + b_gradient.setZero(rows); + //b_running_gradient.resize(rows); + //b_velocity.resize(rows); } void read_weights(std::ifstream &U_file) { readMatrix(U_file, U); } @@ -83,24 +83,24 @@ class Linear_layer template <typename Engine> void initialize(Engine &engine, - bool init_normal, - double init_range, - string ¶meter_update, - double adagrad_epsilon) - { - if (parameter_update == "ADA") { - U_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(U.rows(),U.cols())*adagrad_epsilon; - b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon; - } - if (parameter_update == "ADAD") { - U_running_gradient.setZero(U.rows(),U.cols()); - b_running_gradient.setZero(b.size()); - U_running_parameter_update.setZero(U.rows(),U.cols()); - b_running_parameter_update.setZero(b.size()); - } + bool init_normal, + double init_range, + string ¶meter_update, + double adagrad_epsilon) + { + if (parameter_update == "ADA") { + U_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(U.rows(),U.cols())*adagrad_epsilon; + b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon; + } + if (parameter_update == "ADAD") { + U_running_gradient.setZero(U.rows(),U.cols()); + b_running_gradient.setZero(b.size()); + U_running_parameter_update.setZero(U.rows(),U.cols()); + b_running_parameter_update.setZero(b.size()); + } - initMatrix(engine, U, init_normal, init_range); - initBias(engine, b, init_normal, init_range); + initMatrix(engine, U, init_normal, init_range); + initBias(engine, b, init_normal, init_range); } int n_inputs () const { return U.cols(); } @@ -108,655 +108,655 @@ class Linear_layer template <typename DerivedIn, typename DerivedOut> void fProp(const MatrixBase<DerivedIn> &input, - const MatrixBase<DerivedOut> &output) const + const MatrixBase<DerivedOut> &output) const { - UNCONST(DerivedOut, output, my_output); - my_output.leftCols(input.cols()).noalias() = U*input; - int num_examples = input.cols(); - for (int example = 0;example < num_examples;example++) - { - my_output.leftCols(input.cols()).col(example) += b; - } + UNCONST(DerivedOut, output, my_output); + my_output.leftCols(input.cols()).noalias() = U*input; + int num_examples = input.cols(); + for (int example = 0;example < num_examples;example++) + { + my_output.leftCols(input.cols()).col(example) += b; + } } // Sparse input template <typename ScalarIn, typename DerivedOut> void fProp(const USCMatrix<ScalarIn> &input, - const MatrixBase<DerivedOut> &output_const) const - { - UNCONST(DerivedOut, output_const, output); - output.setZero(); - uscgemm(1.0, U, input, output.leftCols(input.cols())); - // Each column corresponds to a training example. We - // parallelize the adding of biases per dimension. - int num_examples = input.cols(); - for (int example = 0;example < num_examples;example++) - { - output.leftCols(input.cols()).col(example) += b; - } + const MatrixBase<DerivedOut> &output_const) const + { + UNCONST(DerivedOut, output_const, output); + output.setZero(); + uscgemm(1.0, U, input, output.leftCols(input.cols())); + // Each column corresponds to a training example. We + // parallelize the adding of biases per dimension. 
+ int num_examples = input.cols(); + for (int example = 0;example < num_examples;example++) + { + output.leftCols(input.cols()).col(example) += b; + } } template <typename DerivedGOut, typename DerivedGIn> void bProp(const MatrixBase<DerivedGOut> &input, - MatrixBase<DerivedGIn> &output) const + MatrixBase<DerivedGIn> &output) const { - UNCONST(DerivedGIn, output, my_output); - my_output.noalias() = U.transpose()*input; + UNCONST(DerivedGIn, output, my_output); + my_output.noalias() = U.transpose()*input; } template <typename DerivedGOut, typename DerivedIn> void computeGradient( const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &fProp_input, - double learning_rate, double momentum, double L2_reg) + const MatrixBase<DerivedIn> &fProp_input, + double learning_rate, double momentum, double L2_reg) { - U_gradient.noalias() = bProp_input*fProp_input.transpose(); + U_gradient.noalias() = bProp_input*fProp_input.transpose(); - // get the bias gradient for all dimensions in parallel - int size = b.size(); - b_gradient = bProp_input.rowwise().sum(); - // This used to be multithreaded, but there was no measureable difference - if (L2_reg > 0.0) - { - U_gradient -= 2*L2_reg*U; - b_gradient -= 2*L2_reg*b; - } - if (momentum > 0.0) - { - U_velocity = momentum*U_velocity + U_gradient; - U += learning_rate * U_velocity; - b_velocity = momentum*b_velocity + b_gradient; - b += learning_rate * b_velocity; - } - else - { - U += learning_rate * U_gradient; - b += learning_rate * b_gradient; - /* - //UPDATE CLIPPING - U += (learning_rate*U_gradient).array().unaryExpr(Clipper()).matrix(); - b += (learning_rate*b_gradient).array().unaryExpr(Clipper()).matrix(); - //GRADIENT CLIPPING - //U += learning_rate*(U_gradient.array().unaryExpr(Clipper())).matrix(); - //b += learning_rate*(b_gradient.array().unaryExpr(Clipper())).matrix(); - */ - } + // get the bias gradient for all dimensions in parallel + int size = b.size(); + b_gradient = bProp_input.rowwise().sum(); + // This used to be multithreaded, but there was no measureable difference + if (L2_reg > 0.0) + { + U_gradient -= 2*L2_reg*U; + b_gradient -= 2*L2_reg*b; + } + if (momentum > 0.0) + { + U_velocity = momentum*U_velocity + U_gradient; + U += learning_rate * U_velocity; + b_velocity = momentum*b_velocity + b_gradient; + b += learning_rate * b_velocity; + } + else + { + U += learning_rate * U_gradient; + b += learning_rate * b_gradient; + /* + //UPDATE CLIPPING + U += (learning_rate*U_gradient).array().unaryExpr(Clipper()).matrix(); + b += (learning_rate*b_gradient).array().unaryExpr(Clipper()).matrix(); + //GRADIENT CLIPPING + //U += learning_rate*(U_gradient.array().unaryExpr(Clipper())).matrix(); + //b += learning_rate*(b_gradient.array().unaryExpr(Clipper())).matrix(); + */ + } } template <typename DerivedGOut, typename DerivedIn> void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &fProp_input, - double learning_rate, - double L2_reg) + const MatrixBase<DerivedIn> &fProp_input, + double learning_rate, + double L2_reg) { - U_gradient.noalias() = bProp_input*fProp_input.transpose(); + U_gradient.noalias() = bProp_input*fProp_input.transpose(); - // get the bias gradient for all dimensions in parallel - int size = b.size(); - b_gradient.noalias() = bProp_input.rowwise().sum(); + // get the bias gradient for all dimensions in parallel + int size = b.size(); + b_gradient.noalias() = bProp_input.rowwise().sum(); - if (L2_reg != 0) - { - U_gradient -= 2*L2_reg*U; - b_gradient -= 
2*L2_reg*b; - } + if (L2_reg != 0) + { + U_gradient -= 2*L2_reg*U; + b_gradient -= 2*L2_reg*b; + } - // ignore momentum? - #pragma omp parallel for - for (int col=0; col<U.cols(); col++) { - U_running_gradient.col(col) += U_gradient.col(col).array().square().matrix(); - U.col(col) += learning_rate * (U_gradient.col(col).array() / - U_running_gradient.col(col).array().sqrt()).matrix(); - /* - //UPDATE CLIPPING - U.col(col) += (learning_rate * (U_gradient.col(col).array() / U_running_gradient.col(col).array().sqrt())). - unaryExpr(Clipper()).matrix(); - */ - } - b_running_gradient += b_gradient.array().square().matrix(); - b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix(); + // ignore momentum? +#pragma omp parallel for + for (int col=0; col<U.cols(); col++) { + U_running_gradient.col(col) += U_gradient.col(col).array().square().matrix(); + U.col(col) += learning_rate * (U_gradient.col(col).array() / + U_running_gradient.col(col).array().sqrt()).matrix(); /* //UPDATE CLIPPING - b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); + U.col(col) += (learning_rate * (U_gradient.col(col).array() / U_running_gradient.col(col).array().sqrt())). + unaryExpr(Clipper()).matrix(); */ + } + b_running_gradient += b_gradient.array().square().matrix(); + b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix(); + /* + //UPDATE CLIPPING + b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); + */ } template <typename DerivedGOut, typename DerivedIn> void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &fProp_input, - double learning_rate, - double L2_reg, - double conditioning_constant, - double decay) + const MatrixBase<DerivedIn> &fProp_input, + double learning_rate, + double L2_reg, + double conditioning_constant, + double decay) { - //cerr<<"decay is "<<decay<<" and conditioning constant is "<<conditioning_constant<<endl; - U_gradient.noalias() = bProp_input*fProp_input.transpose(); + //cerr<<"decay is "<<decay<<" and conditioning constant is "<<conditioning_constant<<endl; + U_gradient.noalias() = bProp_input*fProp_input.transpose(); - Array<double,Dynamic,1> b_current_parameter_update; + Array<double,Dynamic,1> b_current_parameter_update; - // get the bias gradient for all dimensions in parallel - int size = b.size(); - b_gradient.noalias() = bProp_input.rowwise().sum(); + // get the bias gradient for all dimensions in parallel + int size = b.size(); + b_gradient.noalias() = bProp_input.rowwise().sum(); - if (L2_reg != 0) - { - U_gradient -= 2*L2_reg*U; - b_gradient -= 2*L2_reg*b; - } + if (L2_reg != 0) + { + U_gradient -= 2*L2_reg*U; + b_gradient -= 2*L2_reg*b; + } - // ignore momentum? 
- #pragma omp parallel for - //cerr<<"U gradient is "<<U_gradient<<endl; - for (int col=0; col<U.cols(); col++) { - Array<double,Dynamic,1> U_current_parameter_update; - U_running_gradient.col(col) = decay*U_running_gradient.col(col) + - (1-decay)*U_gradient.col(col).array().square().matrix(); - //cerr<<"U running gradient is "<<U_running_gradient.col(col)<<endl; - //getchar(); - U_current_parameter_update = ((U_running_parameter_update.col(col).array()+conditioning_constant).sqrt()/ - (U_running_gradient.col(col).array()+conditioning_constant).sqrt()) * - U_gradient.col(col).array(); - //cerr<<"U current parameter update is "<<U_current_parameter_update<<endl; - //getchar(); - //update the running parameter update - U_running_parameter_update.col(col) = decay*U_running_parameter_update.col(col) + - (1.-decay)*U_current_parameter_update.square().matrix(); - U.col(col) += learning_rate*U_current_parameter_update.matrix(); - } - b_running_gradient = decay*b_running_gradient + - (1.-decay)*b_gradient.array().square().matrix(); - b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/ - (b_running_gradient.array()+conditioning_constant).sqrt()) * - b_gradient.array(); - b_running_parameter_update = decay*(b_running_parameter_update) + - (1.-decay)*b_current_parameter_update.square().matrix(); - b += learning_rate*b_current_parameter_update.matrix(); + // ignore momentum? +#pragma omp parallel for + //cerr<<"U gradient is "<<U_gradient<<endl; + for (int col=0; col<U.cols(); col++) { + Array<double,Dynamic,1> U_current_parameter_update; + U_running_gradient.col(col) = decay*U_running_gradient.col(col) + + (1-decay)*U_gradient.col(col).array().square().matrix(); + //cerr<<"U running gradient is "<<U_running_gradient.col(col)<<endl; + //getchar(); + U_current_parameter_update = ((U_running_parameter_update.col(col).array()+conditioning_constant).sqrt()/ + (U_running_gradient.col(col).array()+conditioning_constant).sqrt()) * + U_gradient.col(col).array(); + //cerr<<"U current parameter update is "<<U_current_parameter_update<<endl; + //getchar(); + //update the running parameter update + U_running_parameter_update.col(col) = decay*U_running_parameter_update.col(col) + + (1.-decay)*U_current_parameter_update.square().matrix(); + U.col(col) += learning_rate*U_current_parameter_update.matrix(); + } + b_running_gradient = decay*b_running_gradient + + (1.-decay)*b_gradient.array().square().matrix(); + b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/ + (b_running_gradient.array()+conditioning_constant).sqrt()) * + b_gradient.array(); + b_running_parameter_update = decay*(b_running_parameter_update) + + (1.-decay)*b_current_parameter_update.square().matrix(); + b += learning_rate*b_current_parameter_update.matrix(); } template <typename DerivedGOut, typename DerivedIn, typename DerivedGW> void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &fProp_input, - const MatrixBase<DerivedGW> &gradient) const + const MatrixBase<DerivedIn> &fProp_input, + const MatrixBase<DerivedGW> &gradient) const { - UNCONST(DerivedGW, gradient, my_gradient); - my_gradient.noalias() = bProp_input*fProp_input.transpose(); + UNCONST(DerivedGW, gradient, my_gradient); + my_gradient.noalias() = bProp_input*fProp_input.transpose(); } }; class Output_word_embeddings { - private: - // row-major is better for uscgemm - //Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W; - // Having W be a pointer to a 
matrix allows ease of sharing - // input and output word embeddings - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W; - std::vector<double> W_data; - Matrix<double,Dynamic,1> b; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update; - Matrix<double,Dynamic,1> b_running_gradient; - Matrix<double,Dynamic,1> b_gradient; - Matrix<double,Dynamic,1> b_running_parameter_update; - - public: - Output_word_embeddings() { } - Output_word_embeddings(int rows, int cols) { resize(rows, cols); } - - void resize(int rows, int cols) - { - W->setZero(rows, cols); - b.setZero(rows); - } - void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) { - W = input_W; - } - void read_weights(std::ifstream &W_file) { readMatrix(W_file, *W); } - void write_weights(std::ofstream &W_file) { writeMatrix(*W, W_file); } - void read_biases(std::ifstream &b_file) { readMatrix(b_file, b); } - void write_biases(std::ofstream &b_file) { writeMatrix(b, b_file); } - - template <typename Engine> - void initialize(Engine &engine, - bool init_normal, - double init_range, - double init_bias, - string ¶meter_update, - double adagrad_epsilon) - { + private: + // row-major is better for uscgemm + //Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W; + // Having W be a pointer to a matrix allows ease of sharing + // input and output word embeddings + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W; + std::vector<double> W_data; + Matrix<double,Dynamic,1> b; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update; + Matrix<double,Dynamic,1> b_running_gradient; + Matrix<double,Dynamic,1> b_gradient; + Matrix<double,Dynamic,1> b_running_parameter_update; + + public: + Output_word_embeddings() { } + Output_word_embeddings(int rows, int cols) { resize(rows, cols); } - W_gradient.setZero(W->rows(),W->cols()); - b_gradient.setZero(b.size()); - if (parameter_update == "ADA") { - W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon; - b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon; - //W_gradient.setZero(W->rows(),W->cols()); - //b_gradient.setZero(b.size()); - } - if (parameter_update == "ADAD") { - W_running_gradient.setZero(W->rows(),W->cols()); - b_running_gradient.setZero(b.size()); - W_gradient.setZero(W->rows(),W->cols()); - //b_gradient.setZero(b.size()); - //W_running_parameter_update.setZero(W->rows(),W->cols()); - b_running_parameter_update.setZero(b.size()); - } - - initMatrix(engine, *W, init_normal, init_range); - b.fill(init_bias); - } - - int n_inputs () const { return W->cols(); } - int n_outputs () const { return W->rows(); } - - template <typename DerivedIn, typename DerivedOut> - void fProp(const MatrixBase<DerivedIn> &input, - const MatrixBase<DerivedOut> &output) const - { - UNCONST(DerivedOut, output, my_output); - my_output = ((*W) * input).colwise() + b; + void resize(int rows, int cols) + { + W->setZero(rows, cols); + b.setZero(rows); + } + void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) { + W = input_W; + } + void read_weights(std::ifstream &W_file) { readMatrix(W_file, *W); } + void write_weights(std::ofstream &W_file) { writeMatrix(*W, W_file); } + void read_biases(std::ifstream &b_file) { readMatrix(b_file, b); } + 
void write_biases(std::ofstream &b_file) { writeMatrix(b, b_file); } + + template <typename Engine> + void initialize(Engine &engine, + bool init_normal, + double init_range, + double init_bias, + string ¶meter_update, + double adagrad_epsilon) + { + + W_gradient.setZero(W->rows(),W->cols()); + b_gradient.setZero(b.size()); + if (parameter_update == "ADA") { + W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon; + b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon; + //W_gradient.setZero(W->rows(),W->cols()); + //b_gradient.setZero(b.size()); + } + if (parameter_update == "ADAD") { + W_running_gradient.setZero(W->rows(),W->cols()); + b_running_gradient.setZero(b.size()); + W_gradient.setZero(W->rows(),W->cols()); + //b_gradient.setZero(b.size()); + //W_running_parameter_update.setZero(W->rows(),W->cols()); + b_running_parameter_update.setZero(b.size()); } + initMatrix(engine, *W, init_normal, init_range); + b.fill(init_bias); + } + + int n_inputs () const { return W->cols(); } + int n_outputs () const { return W->rows(); } + + template <typename DerivedIn, typename DerivedOut> + void fProp(const MatrixBase<DerivedIn> &input, + const MatrixBase<DerivedOut> &output) const + { + UNCONST(DerivedOut, output, my_output); + my_output = ((*W) * input).colwise() + b; + } + // Sparse output version - template <typename DerivedIn, typename DerivedOutI, typename DerivedOutV> - void fProp(const MatrixBase<DerivedIn> &input, - const MatrixBase<DerivedOutI> &samples, - const MatrixBase<DerivedOutV> &output) const - { - UNCONST(DerivedOutV, output, my_output); - #pragma omp parallel for - for (int instance_id = 0; instance_id < samples.cols(); instance_id++) - { - for (int sample_id = 0; sample_id < samples.rows(); sample_id++) - { - my_output(sample_id, instance_id) = b(samples(sample_id, instance_id)); - } - } - USCMatrix<double> sparse_output(W->rows(), samples, my_output); - uscgemm_masked(1.0, *W, input, sparse_output); - my_output = sparse_output.values; // too bad, so much copying - } - - // Return single element of output matrix - template <typename DerivedIn> - double fProp(const MatrixBase<DerivedIn> &input, - int word, - int instance) const + template <typename DerivedIn, typename DerivedOutI, typename DerivedOutV> + void fProp(const MatrixBase<DerivedIn> &input, + const MatrixBase<DerivedOutI> &samples, + const MatrixBase<DerivedOutV> &output) const + { + UNCONST(DerivedOutV, output, my_output); +#pragma omp parallel for + for (int instance_id = 0; instance_id < samples.cols(); instance_id++) { - return W->row(word).dot(input.col(instance)) + b(word); + for (int sample_id = 0; sample_id < samples.rows(); sample_id++) + { + my_output(sample_id, instance_id) = b(samples(sample_id, instance_id)); + } } + USCMatrix<double> sparse_output(W->rows(), samples, my_output); + uscgemm_masked(1.0, *W, input, sparse_output); + my_output = sparse_output.values; // too bad, so much copying + } - // Dense versions (for log-likelihood loss) + // Return single element of output matrix + template <typename DerivedIn> + double fProp(const MatrixBase<DerivedIn> &input, + int word, + int instance) const + { + return W->row(word).dot(input.col(instance)) + b(word); + } - template <typename DerivedGOut, typename DerivedGIn> - void bProp(const MatrixBase<DerivedGOut> &input_bProp_matrix, - const MatrixBase<DerivedGIn> &bProp_matrix) const - { - // W is vocab_size x output_embedding_dimension - // input_bProp_matrix is vocab_size x minibatch_size - 
// bProp_matrix is output_embedding_dimension x minibatch_size - UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix); - my_bProp_matrix.leftCols(input_bProp_matrix.cols()).noalias() = + // Dense versions (for log-likelihood loss) + + template <typename DerivedGOut, typename DerivedGIn> + void bProp(const MatrixBase<DerivedGOut> &input_bProp_matrix, + const MatrixBase<DerivedGIn> &bProp_matrix) const + { + // W is vocab_size x output_embedding_dimension + // input_bProp_matrix is vocab_size x minibatch_size + // bProp_matrix is output_embedding_dimension x minibatch_size + UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix); + my_bProp_matrix.leftCols(input_bProp_matrix.cols()).noalias() = W->transpose() * input_bProp_matrix; - } + } - template <typename DerivedIn, typename DerivedGOut> - void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOut> &bProp_input, - double learning_rate, - double momentum) //not sure if we want to use momentum here - { - // W is vocab_size x output_embedding_dimension - // b is vocab_size x 1 - // predicted_embeddings is output_embedding_dimension x minibatch_size - // bProp_input is vocab_size x minibatch_size - W->noalias() += learning_rate * bProp_input * predicted_embeddings.transpose(); - b += learning_rate * bProp_input.rowwise().sum(); - - /* - //GRADIENT CLIPPING - W->noalias() += learning_rate * - ((bProp_input * predicted_embeddings.transpose()).array().unaryExpr(Clipper())).matrix(); - b += learning_rate * (bProp_input.rowwise().sum().array().unaryExpr(Clipper())).matrix(); - //UPDATE CLIPPING - W->noalias() += (learning_rate * - (bProp_input * predicted_embeddings.transpose())).array().unaryExpr(Clipper()).matrix(); - b += (learning_rate * (bProp_input.rowwise().sum())).array().unaryExpr(Clipper()).matrix(); - */ - } - - template <typename DerivedIn, typename DerivedGOut> - void computeGradientAdagrad( - const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOut> &bProp_input, - double learning_rate) //not sure if we want to use momentum here - { - // W is vocab_size x output_embedding_dimension - // b is vocab_size x 1 - // predicted_embeddings is output_embedding_dimension x minibatch_size - // bProp_input is vocab_size x minibatch_sizea - W_gradient.setZero(W->rows(), W->cols()); - b_gradient.setZero(b.size()); - W_gradient.noalias() = bProp_input * predicted_embeddings.transpose(); - b_gradient.noalias() = bProp_input.rowwise().sum(); - W_running_gradient += W_gradient.array().square().matrix(); - b_running_gradient += b_gradient.array().square().matrix(); - W->noalias() += learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt()).matrix(); - b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix(); - /* - //UPDATE CLIPPING - *W += (learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); - b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); - */ - } - - template <typename DerivedIn, typename DerivedGOut> - void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOut> &bProp_input, - double learning_rate, - double conditioning_constant, - double decay) //not sure if we want to use momentum here - { - // W is vocab_size x output_embedding_dimension - // b is vocab_size x 1 - // predicted_embeddings is output_embedding_dimension x minibatch_size - // bProp_input is vocab_size x minibatch_size 
- Array<double,Dynamic,Dynamic> W_current_parameter_update; - Array<double,Dynamic,1> b_current_parameter_update; - W_gradient.setZero(W->rows(), W->cols()); - b_gradient.setZero(b.size()); - W_gradient.noalias() = bProp_input * predicted_embeddings.transpose(); - b_gradient.noalias() = bProp_input.rowwise().sum(); - W_running_gradient = decay*W_running_gradient + - (1.-decay)*W_gradient.array().square().matrix(); - b_running_gradient = decay*b_running_gradient+ - (1.-decay)*b_gradient.array().square().matrix(); - W_current_parameter_update = ((W_running_parameter_update.array()+conditioning_constant).sqrt()/ - (W_running_gradient.array()+conditioning_constant).sqrt())* - W_gradient.array(); - b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/ - (b_running_gradient.array()+conditioning_constant).sqrt())* - b_gradient.array(); - W_running_parameter_update = decay*W_running_parameter_update + - (1.-decay)*W_current_parameter_update.square().matrix(); - b_running_parameter_update = decay*b_running_parameter_update + - (1.-decay)*b_current_parameter_update.square().matrix(); - - *W += learning_rate*W_current_parameter_update.matrix(); - b += learning_rate*b_current_parameter_update.matrix(); - } - - // Sparse versions - - template <typename DerivedGOutI, typename DerivedGOutV, typename DerivedGIn> - void bProp(const MatrixBase<DerivedGOutI> &samples, - const MatrixBase<DerivedGOutV> &weights, - const MatrixBase<DerivedGIn> &bProp_matrix) const - { - UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix); - my_bProp_matrix.setZero(); - uscgemm(1.0, + template <typename DerivedIn, typename DerivedGOut> + void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOut> &bProp_input, + double learning_rate, + double momentum) //not sure if we want to use momentum here + { + // W is vocab_size x output_embedding_dimension + // b is vocab_size x 1 + // predicted_embeddings is output_embedding_dimension x minibatch_size + // bProp_input is vocab_size x minibatch_size + W->noalias() += learning_rate * bProp_input * predicted_embeddings.transpose(); + b += learning_rate * bProp_input.rowwise().sum(); + + /* + //GRADIENT CLIPPING + W->noalias() += learning_rate * + ((bProp_input * predicted_embeddings.transpose()).array().unaryExpr(Clipper())).matrix(); + b += learning_rate * (bProp_input.rowwise().sum().array().unaryExpr(Clipper())).matrix(); + //UPDATE CLIPPING + W->noalias() += (learning_rate * + (bProp_input * predicted_embeddings.transpose())).array().unaryExpr(Clipper()).matrix(); + b += (learning_rate * (bProp_input.rowwise().sum())).array().unaryExpr(Clipper()).matrix(); + */ + } + + template <typename DerivedIn, typename DerivedGOut> + void computeGradientAdagrad( + const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOut> &bProp_input, + double learning_rate) //not sure if we want to use momentum here + { + // W is vocab_size x output_embedding_dimension + // b is vocab_size x 1 + // predicted_embeddings is output_embedding_dimension x minibatch_size + // bProp_input is vocab_size x minibatch_sizea + W_gradient.setZero(W->rows(), W->cols()); + b_gradient.setZero(b.size()); + W_gradient.noalias() = bProp_input * predicted_embeddings.transpose(); + b_gradient.noalias() = bProp_input.rowwise().sum(); + W_running_gradient += W_gradient.array().square().matrix(); + b_running_gradient += b_gradient.array().square().matrix(); + W->noalias() += learning_rate * 
(W_gradient.array()/W_running_gradient.array().sqrt()).matrix(); + b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix(); + /* + //UPDATE CLIPPING + *W += (learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); + b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); + */ + } + + template <typename DerivedIn, typename DerivedGOut> + void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOut> &bProp_input, + double learning_rate, + double conditioning_constant, + double decay) //not sure if we want to use momentum here + { + // W is vocab_size x output_embedding_dimension + // b is vocab_size x 1 + // predicted_embeddings is output_embedding_dimension x minibatch_size + // bProp_input is vocab_size x minibatch_size + Array<double,Dynamic,Dynamic> W_current_parameter_update; + Array<double,Dynamic,1> b_current_parameter_update; + W_gradient.setZero(W->rows(), W->cols()); + b_gradient.setZero(b.size()); + W_gradient.noalias() = bProp_input * predicted_embeddings.transpose(); + b_gradient.noalias() = bProp_input.rowwise().sum(); + W_running_gradient = decay*W_running_gradient + + (1.-decay)*W_gradient.array().square().matrix(); + b_running_gradient = decay*b_running_gradient+ + (1.-decay)*b_gradient.array().square().matrix(); + W_current_parameter_update = ((W_running_parameter_update.array()+conditioning_constant).sqrt()/ + (W_running_gradient.array()+conditioning_constant).sqrt())* + W_gradient.array(); + b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/ + (b_running_gradient.array()+conditioning_constant).sqrt())* + b_gradient.array(); + W_running_parameter_update = decay*W_running_parameter_update + + (1.-decay)*W_current_parameter_update.square().matrix(); + b_running_parameter_update = decay*b_running_parameter_update + + (1.-decay)*b_current_parameter_update.square().matrix(); + + *W += learning_rate*W_current_parameter_update.matrix(); + b += learning_rate*b_current_parameter_update.matrix(); + } + + // Sparse versions + + template <typename DerivedGOutI, typename DerivedGOutV, typename DerivedGIn> + void bProp(const MatrixBase<DerivedGOutI> &samples, + const MatrixBase<DerivedGOutV> &weights, + const MatrixBase<DerivedGIn> &bProp_matrix) const + { + UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix); + my_bProp_matrix.setZero(); + uscgemm(1.0, W->transpose(), USCMatrix<double>(W->rows(), samples, weights), my_bProp_matrix.leftCols(samples.cols())); // narrow bProp_matrix for possible short minibatch - } + } template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV> - void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOutI> &samples, - const MatrixBase<DerivedGOutV> &weights, - double learning_rate, double momentum) //not sure if we want to use momentum here + void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOutI> &samples, + const MatrixBase<DerivedGOutV> &weights, + double learning_rate, double momentum) //not sure if we want to use momentum here { - //cerr<<"in gradient"<<endl; - USCMatrix<double> gradient_output(W->rows(), samples, weights); - uscgemm(learning_rate, - gradient_output, - predicted_embeddings.leftCols(gradient_output.cols()).transpose(), - *W); // narrow predicted_embeddings for possible short minibatch - 
uscgemv(learning_rate, - gradient_output, - Matrix<double,Dynamic,1>::Ones(gradient_output.cols()), - b); - /* - //IN ORDER TO IMPLEMENT CLIPPING, WE HAVE TO COMPUTE THE GRADIENT - //FIRST - USCMatrix<double> gradient_output(W->rows(), samples, weights); - uscgemm(1.0, - gradient_output, - predicted_embeddings.leftCols(samples.cols()).transpose(), - W_gradient); - uscgemv(1.0, - gradient_output, - Matrix<double,Dynamic,1>::Ones(weights.cols()), - b_gradient); - - int_map update_map; //stores all the parameters that have been updated - for (int sample_id=0; sample_id<samples.rows(); sample_id++) - for (int train_id=0; train_id<samples.cols(); train_id++) - update_map[samples(sample_id, train_id)] = 1; - - // Convert to std::vector for parallelization - std::vector<int> update_items; - for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) - update_items.push_back(it->first); - int num_items = update_items.size(); - - //#pragma omp parallel for - for (int item_id=0; item_id<num_items; item_id++) - { - int update_item = update_items[item_id]; - //W->row(update_item) += learning_rate * W_gradient.row(update_item); - //b(update_item) += learning_rate * b_gradient(update_item); - //UPDATE CLIPPING - W->row(update_item) += (learning_rate * W_gradient.row(update_item)).array().unaryExpr(Clipper()).matrix(); - double update = learning_rate * b_gradient(update_item); - b(update_item) += std::min(0.5, std::max(update,-0.5)); - //GRADIENT CLIPPING - W_gradient.row(update_item).setZero(); - b_gradient(update_item) = 0.; - } - */ - //cerr<<"Finished gradient"<<endl; + //cerr<<"in gradient"<<endl; + USCMatrix<double> gradient_output(W->rows(), samples, weights); + uscgemm(learning_rate, + gradient_output, + predicted_embeddings.leftCols(gradient_output.cols()).transpose(), + *W); // narrow predicted_embeddings for possible short minibatch + uscgemv(learning_rate, + gradient_output, + Matrix<double,Dynamic,1>::Ones(gradient_output.cols()), + b); + /* + //IN ORDER TO IMPLEMENT CLIPPING, WE HAVE TO COMPUTE THE GRADIENT + //FIRST + USCMatrix<double> gradient_output(W->rows(), samples, weights); + uscgemm(1.0, + gradient_output, + predicted_embeddings.leftCols(samples.cols()).transpose(), + W_gradient); + uscgemv(1.0, + gradient_output, + Matrix<double,Dynamic,1>::Ones(weights.cols()), + b_gradient); + + int_map update_map; //stores all the parameters that have been updated + for (int sample_id=0; sample_id<samples.rows(); sample_id++) + for (int train_id=0; train_id<samples.cols(); train_id++) + update_map[samples(sample_id, train_id)] = 1; + + // Convert to std::vector for parallelization + std::vector<int> update_items; + for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) + update_items.push_back(it->first); + int num_items = update_items.size(); + + //#pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) + { + int update_item = update_items[item_id]; + //W->row(update_item) += learning_rate * W_gradient.row(update_item); + //b(update_item) += learning_rate * b_gradient(update_item); + //UPDATE CLIPPING + W->row(update_item) += (learning_rate * W_gradient.row(update_item)).array().unaryExpr(Clipper()).matrix(); + double update = learning_rate * b_gradient(update_item); + b(update_item) += std::min(0.5, std::max(update,-0.5)); + //GRADIENT CLIPPING + W_gradient.row(update_item).setZero(); + b_gradient(update_item) = 0.; + } + */ + //cerr<<"Finished gradient"<<endl; } template <typename DerivedIn, typename DerivedGOutI, typename 
DerivedGOutV> - void computeGradientAdagrad(const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOutI> &samples, - const MatrixBase<DerivedGOutV> &weights, - double learning_rate) //not sure if we want to use momentum here - { - //W_gradient.setZero(W->rows(), W->cols()); - //b_gradient.setZero(b.size()); - //FOR CLIPPING, WE DO NOT MULTIPLY THE GRADIENT WITH THE LEARNING RATE - USCMatrix<double> gradient_output(W->rows(), samples, weights); - uscgemm(1.0, - gradient_output, - predicted_embeddings.leftCols(samples.cols()).transpose(), - W_gradient); - uscgemv(1.0, - gradient_output, - Matrix<double,Dynamic,1>::Ones(weights.cols()), - b_gradient); - - int_map update_map; //stores all the parameters that have been updated - for (int sample_id=0; sample_id<samples.rows(); sample_id++) - for (int train_id=0; train_id<samples.cols(); train_id++) - update_map[samples(sample_id, train_id)] = 1; - - // Convert to std::vector for parallelization - std::vector<int> update_items; - for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) - update_items.push_back(it->first); - int num_items = update_items.size(); - - //#pragma omp parallel for - for (int item_id=0; item_id<num_items; item_id++) - { - int update_item = update_items[item_id]; - W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix(); - b_running_gradient(update_item) += b_gradient(update_item) * b_gradient(update_item); - W->row(update_item) += learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix(); - b(update_item) += learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item)); - /* - //UPDATE CLIPPING - W->row(update_item) += (learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt())).unaryExpr(Clipper()).matrix(); - double update = learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item)); - b(update_item) += Clipper(update);//std::min(0.5, std::max(update,-0.5)); - */ - W_gradient.row(update_item).setZero(); - b_gradient(update_item) = 0.; - } - } + void computeGradientAdagrad(const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOutI> &samples, + const MatrixBase<DerivedGOutV> &weights, + double learning_rate) //not sure if we want to use momentum here + { + //W_gradient.setZero(W->rows(), W->cols()); + //b_gradient.setZero(b.size()); + //FOR CLIPPING, WE DO NOT MULTIPLY THE GRADIENT WITH THE LEARNING RATE + USCMatrix<double> gradient_output(W->rows(), samples, weights); + uscgemm(1.0, + gradient_output, + predicted_embeddings.leftCols(samples.cols()).transpose(), + W_gradient); + uscgemv(1.0, + gradient_output, + Matrix<double,Dynamic,1>::Ones(weights.cols()), + b_gradient); + + int_map update_map; //stores all the parameters that have been updated + for (int sample_id=0; sample_id<samples.rows(); sample_id++) + for (int train_id=0; train_id<samples.cols(); train_id++) + update_map[samples(sample_id, train_id)] = 1; + + // Convert to std::vector for parallelization + std::vector<int> update_items; + for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) + update_items.push_back(it->first); + int num_items = update_items.size(); + + //#pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) + { + int update_item = update_items[item_id]; + W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix(); + 
b_running_gradient(update_item) += b_gradient(update_item) * b_gradient(update_item); + W->row(update_item) += learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix(); + b(update_item) += learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item)); + /* + //UPDATE CLIPPING + W->row(update_item) += (learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt())).unaryExpr(Clipper()).matrix(); + double update = learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item)); + b(update_item) += Clipper(update);//std::min(0.5, std::max(update,-0.5)); + */ + W_gradient.row(update_item).setZero(); + b_gradient(update_item) = 0.; + } + } template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV> - void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOutI> &samples, - const MatrixBase<DerivedGOutV> &weights, - double learning_rate, - double conditioning_constant, - double decay) //not sure if we want to use momentum here - { - //cerr<<"decay is "<<decay<<" and constant is "<<conditioning_constant<<endl; - //W_gradient.setZero(W->rows(), W->cols()); - //b_gradient.setZero(b.size()); - - USCMatrix<double> gradient_output(W->rows(), samples, weights); - uscgemm(1.0, - gradient_output, - predicted_embeddings.leftCols(samples.cols()).transpose(), - W_gradient); - uscgemv(1.0, - gradient_output, - Matrix<double,Dynamic,1>::Ones(weights.cols()), - b_gradient); - - int_map update_map; //stores all the parameters that have been updated - for (int sample_id=0; sample_id<samples.rows(); sample_id++) - for (int train_id=0; train_id<samples.cols(); train_id++) - update_map[samples(sample_id, train_id)] = 1; - - // Convert to std::vector for parallelization - std::vector<int> update_items; - for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) - update_items.push_back(it->first); - int num_items = update_items.size(); - - #pragma omp parallel for - for (int item_id=0; item_id<num_items; item_id++) - { - Array<double,1,Dynamic> W_current_parameter_update; - double b_current_parameter_update; - - int update_item = update_items[item_id]; - W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+ - (1.-decay)*W_gradient.row(update_item).array().square().matrix(); - b_running_gradient(update_item) = decay*b_running_gradient(update_item)+ - (1.-decay)*b_gradient(update_item)*b_gradient(update_item); - //cerr<<"Output: W gradient is "<<W_gradient.row(update_item)<<endl; - //getchar(); - - //cerr<<"Output: W running gradient is "<<W_running_gradient.row(update_item)<<endl; - //getchar(); - W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/ - (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())* - W_gradient.row(update_item).array(); - b_current_parameter_update = (sqrt(b_running_parameter_update(update_item)+conditioning_constant)/ - sqrt(b_running_gradient(update_item)+conditioning_constant))* - b_gradient(update_item); - //cerr<<"Output: W current parameter update is "<<W_current_parameter_update<<endl; - //getchar(); - //cerr<<"Output: W running parameter update before is "<<W_running_parameter_update.row(update_item)<<endl; - //getchar(); - //cerr<<"the second term is "<<(1.-decay)*W_current_parameter_update.square().matrix()<<endl; - 
W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+ - (1.-decay)*(W_current_parameter_update.square().matrix()); - b_running_parameter_update(update_item) = decay*b_running_parameter_update(update_item)+ - (1.-decay)*b_current_parameter_update*b_current_parameter_update; - //cerr<<"Output: W running parameter update is "<<W_running_parameter_update.row(update_item)<<endl; - //getchar(); - W->row(update_item) += learning_rate*W_current_parameter_update.matrix(); - b(update_item) += learning_rate*b_current_parameter_update; - W_gradient.row(update_item).setZero(); - b_gradient(update_item) = 0.; - } - } + void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOutI> &samples, + const MatrixBase<DerivedGOutV> &weights, + double learning_rate, + double conditioning_constant, + double decay) //not sure if we want to use momentum here + { + //cerr<<"decay is "<<decay<<" and constant is "<<conditioning_constant<<endl; + //W_gradient.setZero(W->rows(), W->cols()); + //b_gradient.setZero(b.size()); + + USCMatrix<double> gradient_output(W->rows(), samples, weights); + uscgemm(1.0, + gradient_output, + predicted_embeddings.leftCols(samples.cols()).transpose(), + W_gradient); + uscgemv(1.0, + gradient_output, + Matrix<double,Dynamic,1>::Ones(weights.cols()), + b_gradient); + + int_map update_map; //stores all the parameters that have been updated + for (int sample_id=0; sample_id<samples.rows(); sample_id++) + for (int train_id=0; train_id<samples.cols(); train_id++) + update_map[samples(sample_id, train_id)] = 1; + + // Convert to std::vector for parallelization + std::vector<int> update_items; + for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) + update_items.push_back(it->first); + int num_items = update_items.size(); + +#pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) + { + Array<double,1,Dynamic> W_current_parameter_update; + double b_current_parameter_update; + + int update_item = update_items[item_id]; + W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+ + (1.-decay)*W_gradient.row(update_item).array().square().matrix(); + b_running_gradient(update_item) = decay*b_running_gradient(update_item)+ + (1.-decay)*b_gradient(update_item)*b_gradient(update_item); + //cerr<<"Output: W gradient is "<<W_gradient.row(update_item)<<endl; + //getchar(); + + //cerr<<"Output: W running gradient is "<<W_running_gradient.row(update_item)<<endl; + //getchar(); + W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/ + (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())* + W_gradient.row(update_item).array(); + b_current_parameter_update = (sqrt(b_running_parameter_update(update_item)+conditioning_constant)/ + sqrt(b_running_gradient(update_item)+conditioning_constant))* + b_gradient(update_item); + //cerr<<"Output: W current parameter update is "<<W_current_parameter_update<<endl; + //getchar(); + //cerr<<"Output: W running parameter update before is "<<W_running_parameter_update.row(update_item)<<endl; + //getchar(); + //cerr<<"the second term is "<<(1.-decay)*W_current_parameter_update.square().matrix()<<endl; + W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+ + (1.-decay)*(W_current_parameter_update.square().matrix()); + b_running_parameter_update(update_item) = 
decay*b_running_parameter_update(update_item)+ + (1.-decay)*b_current_parameter_update*b_current_parameter_update; + //cerr<<"Output: W running parameter update is "<<W_running_parameter_update.row(update_item)<<endl; + //getchar(); + W->row(update_item) += learning_rate*W_current_parameter_update.matrix(); + b(update_item) += learning_rate*b_current_parameter_update; + W_gradient.row(update_item).setZero(); + b_gradient(update_item) = 0.; + } + } template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV, typename DerivedGW, typename DerivedGb> - void computeGradientCheck(const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOutI> &samples, - const MatrixBase<DerivedGOutV> &weights, - const MatrixBase<DerivedGW> &gradient_W, - const MatrixBase<DerivedGb> &gradient_b) const - { - UNCONST(DerivedGW, gradient_W, my_gradient_W); - UNCONST(DerivedGb, gradient_b, my_gradient_b); - my_gradient_W.setZero(); - my_gradient_b.setZero(); - USCMatrix<double> gradient_output(W->rows(), samples, weights); - uscgemm(1.0, - gradient_output, - predicted_embeddings.leftCols(samples.cols()).transpose(), - my_gradient_W); - uscgemv(1.0, gradient_output, - Matrix<double,Dynamic,1>::Ones(weights.cols()), my_gradient_b); + void computeGradientCheck(const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOutI> &samples, + const MatrixBase<DerivedGOutV> &weights, + const MatrixBase<DerivedGW> &gradient_W, + const MatrixBase<DerivedGb> &gradient_b) const + { + UNCONST(DerivedGW, gradient_W, my_gradient_W); + UNCONST(DerivedGb, gradient_b, my_gradient_b); + my_gradient_W.setZero(); + my_gradient_b.setZero(); + USCMatrix<double> gradient_output(W->rows(), samples, weights); + uscgemm(1.0, + gradient_output, + predicted_embeddings.leftCols(samples.cols()).transpose(), + my_gradient_W); + uscgemv(1.0, gradient_output, + Matrix<double,Dynamic,1>::Ones(weights.cols()), my_gradient_b); } }; class Input_word_embeddings { - private: - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W; - int context_size, vocab_size; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient; + private: + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W; + int context_size, vocab_size; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient; friend class model; - public: - Input_word_embeddings() : context_size(0), vocab_size(0) { } - Input_word_embeddings(int rows, int cols, int context) { resize(rows, cols, context); } + public: + Input_word_embeddings() : context_size(0), vocab_size(0) { } + Input_word_embeddings(int rows, int cols, int context) { resize(rows, cols, context); } - void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) { - W = input_W; - } + void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) { + W = input_W; + } - void resize(int rows, int cols, int context) - { - context_size = context; - vocab_size = rows; - W->setZero(rows, cols); - } + void resize(int rows, int cols, int context) + { + context_size = context; + vocab_size = rows; + W->setZero(rows, cols); + } - void read(std::ifstream &W_file) { readMatrix(W_file, *W); } - void write(std::ofstream &W_file) { writeMatrix(*W, W_file); } + void read(std::ifstream &W_file) { 
readMatrix(W_file, *W); } + void write(std::ofstream &W_file) { writeMatrix(*W, W_file); } - template <typename Engine> - void initialize(Engine &engine, - bool init_normal, - double init_range, - string ¶meter_update, - double adagrad_epsilon) - { - W_gradient.setZero(W->rows(),W->cols()); - - if (parameter_update == "ADA") { - W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon; - //W_gradient.setZero(W->rows(),W->cols()); - } - if (parameter_update == "ADAD") { - W_running_gradient.setZero(W->rows(),W->cols()); - //W_gradient.setZero(W->rows(),W->cols()); - W_running_parameter_update.setZero(W->rows(),W->cols()); - } - initMatrix(engine, - *W, - init_normal, - init_range); - } + template <typename Engine> + void initialize(Engine &engine, + bool init_normal, + double init_range, + string ¶meter_update, + double adagrad_epsilon) + { + W_gradient.setZero(W->rows(),W->cols()); + + if (parameter_update == "ADA") { + W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon; + //W_gradient.setZero(W->rows(),W->cols()); + } + if (parameter_update == "ADAD") { + W_running_gradient.setZero(W->rows(),W->cols()); + //W_gradient.setZero(W->rows(),W->cols()); + W_running_parameter_update.setZero(W->rows(),W->cols()); + } + initMatrix(engine, + *W, + init_normal, + init_range); + } int n_inputs() const { return -1; } int n_outputs() const { return W->cols() * context_size; } @@ -765,40 +765,40 @@ class Input_word_embeddings template <typename Dist> void average(const Dist &dist, int output_id) { - W->row(output_id).setZero(); - for (int i=0; i < W->rows(); i++) - if (i != output_id) + W->row(output_id).setZero(); + for (int i=0; i < W->rows(); i++) + if (i != output_id) W->row(output_id) += dist.prob(i) * W->row(i); } template <typename DerivedIn, typename DerivedOut> - void fProp(const MatrixBase<DerivedIn> &input, - const MatrixBase<DerivedOut> &output) const - { - int embedding_dimension = W->cols(); + void fProp(const MatrixBase<DerivedIn> &input, + const MatrixBase<DerivedOut> &output) const + { + int embedding_dimension = W->cols(); - // W is vocab_size x embedding_dimension - // input is ngram_size*vocab_size x minibatch_size - // output is ngram_size*embedding_dimension x minibatch_size + // W is vocab_size x embedding_dimension + // input is ngram_size*vocab_size x minibatch_size + // output is ngram_size*embedding_dimension x minibatch_size - /* - // Dense version: - for (int ngram=0; ngram<context_size; ngram++) - output.middleRows(ngram*embedding_dimension, embedding_dimension) = W.transpose() * input.middleRows(ngram*vocab_size, vocab_size); - */ + /* + // Dense version: + for (int ngram=0; ngram<context_size; ngram++) + output.middleRows(ngram*embedding_dimension, embedding_dimension) = W.transpose() * input.middleRows(ngram*vocab_size, vocab_size); + */ - UNCONST(DerivedOut, output, my_output); - my_output.setZero(); - for (int ngram=0; ngram<context_size; ngram++) - { - // input might be narrower than expected due to a short minibatch, - // so narrow output to match - uscgemm(1.0, - W->transpose(), - USCMatrix<double>(W->rows(),input.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input.cols())), - my_output.block(ngram*embedding_dimension, 0, embedding_dimension, input.cols())); - } - } + UNCONST(DerivedOut, output, my_output); + my_output.setZero(); + for (int ngram=0; ngram<context_size; ngram++) + { + // input might be narrower than expected due to a short minibatch, + // so narrow output to match 
+ uscgemm(1.0, + W->transpose(), + USCMatrix<double>(W->rows(),input.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input.cols())), + my_output.block(ngram*embedding_dimension, 0, embedding_dimension, input.cols())); + } + } // When model is premultiplied, this layer doesn't get used, // but this method is used to get the input into a sparse matrix. @@ -814,206 +814,206 @@ class Input_word_embeddings template <typename DerivedGOut, typename DerivedIn> void computeGradient(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &input_words, - double learning_rate, double momentum, double L2_reg) + const MatrixBase<DerivedIn> &input_words, + double learning_rate, double momentum, double L2_reg) { - int embedding_dimension = W->cols(); + int embedding_dimension = W->cols(); - // W is vocab_size x embedding_dimension - // input is ngram_size*vocab_size x minibatch_size - // bProp_input is ngram_size*embedding_dimension x minibatch_size + // W is vocab_size x embedding_dimension + // input is ngram_size*vocab_size x minibatch_size + // bProp_input is ngram_size*embedding_dimension x minibatch_size - /* - // Dense version: - for (int ngram=0; ngram<context_size; ngram++) - W += learning_rate * input_words.middleRows(ngram*vocab_size, vocab_size) * bProp_input.middleRows(ngram*embedding_dimension, embedding_dimension).transpose() - */ + /* + // Dense version: + for (int ngram=0; ngram<context_size; ngram++) + W += learning_rate * input_words.middleRows(ngram*vocab_size, vocab_size) * bProp_input.middleRows(ngram*embedding_dimension, embedding_dimension).transpose() + */ - for (int ngram=0; ngram<context_size; ngram++) - { - uscgemm(learning_rate, - USCMatrix<double>(W->rows(), input_words.middleRows(ngram, 1), Matrix<double,1,Dynamic>::Ones(input_words.cols())), - bProp_input.block(ngram*embedding_dimension,0,embedding_dimension,input_words.cols()).transpose(), - *W); - } + for (int ngram=0; ngram<context_size; ngram++) + { + uscgemm(learning_rate, + USCMatrix<double>(W->rows(), input_words.middleRows(ngram, 1), Matrix<double,1,Dynamic>::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension,0,embedding_dimension,input_words.cols()).transpose(), + *W); + } - /* - //IF WE WANT TO DO GRADIENT CLIPPING, THEN WE FIRST COMPUTE THE GRADIENT AND THEN - //PERFORM CLIPPING WHILE UPDATING + /* + //IF WE WANT TO DO GRADIENT CLIPPING, THEN WE FIRST COMPUTE THE GRADIENT AND THEN + //PERFORM CLIPPING WHILE UPDATING - for (int ngram=0; ngram<context_size; ngram++) - { - uscgemm(1.0, - USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), - bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), - W_gradient); - } - int_map update_map; //stores all the parameters that have been updated - for (int ngram=0; ngram<context_size; ngram++) - { - for (int train_id=0; train_id<input_words.cols(); train_id++) - { - update_map[input_words(ngram,train_id)] = 1; - } - } + for (int ngram=0; ngram<context_size; ngram++) + { + uscgemm(1.0, + USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), + W_gradient); + } + int_map update_map; //stores all the parameters that have been updated + for (int ngram=0; ngram<context_size; ngram++) + { + for (int train_id=0; train_id<input_words.cols(); train_id++) + { + 
update_map[input_words(ngram,train_id)] = 1; + } + } - // Convert to std::vector for parallelization - std::vector<int> update_items; - for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) - { - update_items.push_back(it->first); - } - int num_items = update_items.size(); - - #pragma omp parallel for - for (int item_id=0; item_id<num_items; item_id++) - { - int update_item = update_items[item_id]; - //UPDATE CLIPPING - W->row(update_item) += (learning_rate* - W_gradient.row(update_item).array().unaryExpr(Clipper())).matrix(); - //GRADIENT CLIPPING - //W->row(update_item) += learning_rate* - // W_gradient.row(update_item).array().unaryExpr(Clipper()).matrix(); - //SETTING THE GRADIENT TO ZERO - W_gradient.row(update_item).setZero(); - } - */ - } + // Convert to std::vector for parallelization + std::vector<int> update_items; + for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) + { + update_items.push_back(it->first); + } + int num_items = update_items.size(); - template <typename DerivedGOut, typename DerivedIn> - void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &input_words, - double learning_rate, - double L2_reg) + #pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) { - int embedding_dimension = W->cols(); - //W_gradient.setZero(W->rows(), W->cols()); - /* + int update_item = update_items[item_id]; + //UPDATE CLIPPING + W->row(update_item) += (learning_rate* + W_gradient.row(update_item).array().unaryExpr(Clipper())).matrix(); + //GRADIENT CLIPPING + //W->row(update_item) += learning_rate* + // W_gradient.row(update_item).array().unaryExpr(Clipper()).matrix(); + //SETTING THE GRADIENT TO ZERO + W_gradient.row(update_item).setZero(); + } + */ + } + + template <typename DerivedGOut, typename DerivedIn> + void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &input_words, + double learning_rate, + double L2_reg) + { + int embedding_dimension = W->cols(); + //W_gradient.setZero(W->rows(), W->cols()); + /* if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols()) - W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon; - */ - for (int ngram=0; ngram<context_size; ngram++) - { - uscgemm(1.0, - USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), - bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), - W_gradient); - } - int_map update_map; //stores all the parameters that have been updated - for (int ngram=0; ngram<context_size; ngram++) + W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon; + */ + for (int ngram=0; ngram<context_size; ngram++) + { + uscgemm(1.0, + USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), + W_gradient); + } + int_map update_map; //stores all the parameters that have been updated + for (int ngram=0; ngram<context_size; ngram++) + { + for (int train_id=0; train_id<input_words.cols(); train_id++) { - for (int train_id=0; train_id<input_words.cols(); train_id++) - { - update_map[input_words(ngram,train_id)] = 1; - } + update_map[input_words(ngram,train_id)] = 1; } + } - // Convert to std::vector for parallelization - std::vector<int> update_items; - for (int_map::iterator it 
= update_map.begin(); it != update_map.end(); ++it) - { - update_items.push_back(it->first); - } - int num_items = update_items.size(); - - #pragma omp parallel for - for (int item_id=0; item_id<num_items; item_id++) - { - int update_item = update_items[item_id]; - W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix(); - W->row(update_item) += learning_rate * - (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix(); - /* - //UPDATE CLIPPING - W->row(update_item) += (learning_rate * - (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt())) - .unaryExpr(Clipper()).matrix(); - */ - W_gradient.row(update_item).setZero(); - } - } - - template <typename DerivedGOut, typename DerivedIn> - void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &input_words, - double learning_rate, - double L2_reg, - double conditioning_constant, - double decay) + // Convert to std::vector for parallelization + std::vector<int> update_items; + for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) { - int embedding_dimension = W->cols(); + update_items.push_back(it->first); + } + int num_items = update_items.size(); - //W_gradient.setZero(W->rows(), W->cols()); +#pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) + { + int update_item = update_items[item_id]; + W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix(); + W->row(update_item) += learning_rate * + (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix(); /* - if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols()) - W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon; + //UPDATE CLIPPING + W->row(update_item) += (learning_rate * + (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt())) + .unaryExpr(Clipper()).matrix(); */ - for (int ngram=0; ngram<context_size; ngram++) - { - uscgemm(1.0, - USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), - bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), - W_gradient); - } - int_map update_map; //stores all the parameters that have been updated - for (int ngram=0; ngram<context_size; ngram++) + W_gradient.row(update_item).setZero(); + } + } + + template <typename DerivedGOut, typename DerivedIn> + void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &input_words, + double learning_rate, + double L2_reg, + double conditioning_constant, + double decay) + { + int embedding_dimension = W->cols(); + + //W_gradient.setZero(W->rows(), W->cols()); + /* + if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols()) + W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon; + */ + for (int ngram=0; ngram<context_size; ngram++) + { + uscgemm(1.0, + USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), + W_gradient); + } + int_map update_map; //stores all the parameters that have been updated + for (int ngram=0; ngram<context_size; ngram++) + { + for (int train_id=0; train_id<input_words.cols(); 
train_id++) { - for (int train_id=0; train_id<input_words.cols(); train_id++) - { - update_map[input_words(ngram,train_id)] = 1; - } + update_map[input_words(ngram,train_id)] = 1; } + } - // Convert to std::vector for parallelization - std::vector<int> update_items; - for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) - { - update_items.push_back(it->first); - } - int num_items = update_items.size(); - - #pragma omp parallel for - for (int item_id=0; item_id<num_items; item_id++) - { - - Array<double,1,Dynamic> W_current_parameter_update; - int update_item = update_items[item_id]; - W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+ - (1.-decay)*W_gradient.row(update_item).array().square().matrix(); - - W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/ - (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())* - W_gradient.row(update_item).array(); - - //cerr<<"Input: W current parameter update is "<<W_current_parameter_update<<endl; - //getchar(); - W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+ - (1.-decay)*W_current_parameter_update.square().matrix(); - - W->row(update_item) += learning_rate*W_current_parameter_update.matrix(); - //cerr<<"Input: After update, W is "<<W->row(update_item)<<endl; - //getchar(); - W_gradient.row(update_item).setZero(); - } - - } - - template <typename DerivedGOut, typename DerivedIn, typename DerivedGW> - void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &input_words, - int x, int minibatch_size, - const MatrixBase<DerivedGW> &gradient) const //not sure if we want to use momentum here + // Convert to std::vector for parallelization + std::vector<int> update_items; + for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) { - UNCONST(DerivedGW, gradient, my_gradient); - int embedding_dimension = W->cols(); - my_gradient.setZero(); - for (int ngram=0; ngram<context_size; ngram++) - uscgemm(1.0, - USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), - bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), - my_gradient); + update_items.push_back(it->first); } + int num_items = update_items.size(); + +#pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) + { + + Array<double,1,Dynamic> W_current_parameter_update; + int update_item = update_items[item_id]; + W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+ + (1.-decay)*W_gradient.row(update_item).array().square().matrix(); + + W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/ + (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())* + W_gradient.row(update_item).array(); + + //cerr<<"Input: W current parameter update is "<<W_current_parameter_update<<endl; + //getchar(); + W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+ + (1.-decay)*W_current_parameter_update.square().matrix(); + + W->row(update_item) += learning_rate*W_current_parameter_update.matrix(); + //cerr<<"Input: After update, W is "<<W->row(update_item)<<endl; + //getchar(); + W_gradient.row(update_item).setZero(); + } + + } + + template <typename DerivedGOut, typename DerivedIn, typename DerivedGW> + 
void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input,
+                              const MatrixBase<DerivedIn> &input_words,
+                              int x, int minibatch_size,
+                              const MatrixBase<DerivedGW> &gradient) const //not sure if we want to use momentum here
+    {
+        UNCONST(DerivedGW, gradient, my_gradient);
+        int embedding_dimension = W->cols();
+        my_gradient.setZero();
+        for (int ngram=0; ngram<context_size; ngram++)
+            uscgemm(1.0,
+                    USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
+                    bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
+                    my_gradient);
+    }
 };

 } // namespace nplm
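Note on the hunks above: this commit only re-indents the AdaGrad and Adadelta update code (tabs replaced by spaces); the arithmetic is unchanged. As a reading aid, the following is a minimal standalone sketch of the per-row updates that computeGradientAdagrad and computeGradientAdadelta apply to each touched row, written with plain Eigen arrays. The function names (adagrad_row, adadelta_row), the free-standing row arguments, and the name eps are illustrative only and are not part of neuralClasses.h.

    // Sketch only: per-row AdaGrad and Adadelta steps as performed inside the
    // sparse update loops above. (The real code keeps these rows inside W,
    // W_running_gradient, W_running_parameter_update, and W_gradient.)
    #include <Eigen/Dense>

    using Eigen::ArrayXd;

    // AdaGrad: accumulate squared gradients; divide the step by their square root.
    // Note the "+=": nplm follows the gradient uphill (the log-likelihood is maximized).
    void adagrad_row(ArrayXd &w, ArrayXd &running_grad,
                     const ArrayXd &grad, double learning_rate)
    {
        running_grad += grad.square();                    // G <- G + g.^2
        w += learning_rate * grad / running_grad.sqrt();  // w <- w + lr * g ./ sqrt(G)
    }

    // Adadelta: decayed running averages of squared gradients and of squared updates;
    // the ratio of their square roots rescales the raw gradient
    // (eps plays the role of conditioning_constant in the methods above).
    void adadelta_row(ArrayXd &w, ArrayXd &running_grad, ArrayXd &running_update,
                      const ArrayXd &grad, double learning_rate,
                      double eps, double decay)
    {
        running_grad   = decay * running_grad   + (1.0 - decay) * grad.square();
        ArrayXd step   = ((running_update + eps).sqrt()
                          / (running_grad + eps).sqrt()) * grad;
        running_update = decay * running_update + (1.0 - decay) * step.square();
        w += learning_rate * step;                        // nplm also scales the Adadelta step by lr
    }

After either step, the class zeroes W_gradient.row(update_item) and b_gradient(update_item) so the same gradient buffers can be reused for the next minibatch.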