github.com/moses-smt/nplm.git
author     graehl <graehl@gmail.com>  2015-06-25 21:38:54 +0300
committer  graehl <graehl@gmail.com>  2015-06-25 22:13:53 +0300
commit     363c73cacf94d965a8759ae8b55f56d8c1c29bb1 (patch)
tree       dd281ad350c508d766327988152a900acc851b95
parent     5fbf9611d24b6fd80c5839fe547f1edb141fa162 (diff)

tab fix

-rw-r--r--  .gitignore             1
-rw-r--r--  src/model.cpp        480
-rw-r--r--  src/neuralClasses.h 1624
3 files changed, 1053 insertions(+), 1052 deletions(-)
diff --git a/.gitignore b/.gitignore
index 12fab12..2843613 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,4 @@ src/testNeuralLM
src/testNeuralNetwork
src/trainNeuralNetwork
.history
+src/make.sh
diff --git a/src/model.cpp b/src/model.cpp
index 919e005..db7f006 100644
--- a/src/model.cpp
+++ b/src/model.cpp
@@ -13,295 +13,295 @@ namespace nplm
{
void model::resize(int ngram_size,
- int input_vocab_size,
- int output_vocab_size,
- int input_embedding_dimension,
- int num_hidden,
- int output_embedding_dimension)
+ int input_vocab_size,
+ int output_vocab_size,
+ int input_embedding_dimension,
+ int num_hidden,
+ int output_embedding_dimension)
{
- input_layer.resize(input_vocab_size, input_embedding_dimension, ngram_size-1);
- if (num_hidden == 0)
- {
- first_hidden_linear.resize(output_embedding_dimension, input_embedding_dimension*(ngram_size-1));
- first_hidden_activation.resize(output_embedding_dimension);
- second_hidden_linear.resize(1,1);
- second_hidden_activation.resize(1);
- }
- else
- {
- first_hidden_linear.resize(num_hidden, input_embedding_dimension*(ngram_size-1));
- first_hidden_activation.resize(num_hidden);
- second_hidden_linear.resize(output_embedding_dimension, num_hidden);
- second_hidden_activation.resize(output_embedding_dimension);
- }
- output_layer.resize(output_vocab_size, output_embedding_dimension);
- this->ngram_size = ngram_size;
- this->input_vocab_size = input_vocab_size;
- this->output_vocab_size = output_vocab_size;
- this->input_embedding_dimension = input_embedding_dimension;
- this->num_hidden = num_hidden;
- this->output_embedding_dimension = output_embedding_dimension;
- premultiplied = false;
+ input_layer.resize(input_vocab_size, input_embedding_dimension, ngram_size-1);
+ if (num_hidden == 0)
+ {
+ first_hidden_linear.resize(output_embedding_dimension, input_embedding_dimension*(ngram_size-1));
+ first_hidden_activation.resize(output_embedding_dimension);
+ second_hidden_linear.resize(1,1);
+ second_hidden_activation.resize(1);
+ }
+ else
+ {
+ first_hidden_linear.resize(num_hidden, input_embedding_dimension*(ngram_size-1));
+ first_hidden_activation.resize(num_hidden);
+ second_hidden_linear.resize(output_embedding_dimension, num_hidden);
+ second_hidden_activation.resize(output_embedding_dimension);
+ }
+ output_layer.resize(output_vocab_size, output_embedding_dimension);
+ this->ngram_size = ngram_size;
+ this->input_vocab_size = input_vocab_size;
+ this->output_vocab_size = output_vocab_size;
+ this->input_embedding_dimension = input_embedding_dimension;
+ this->num_hidden = num_hidden;
+ this->output_embedding_dimension = output_embedding_dimension;
+ premultiplied = false;
}
-
+
void model::initialize(boost::random::mt19937 &init_engine,
- bool init_normal,
- double init_range,
- double init_bias,
- string &parameter_update,
- double adagrad_epsilon)
+ bool init_normal,
+ double init_range,
+ double init_bias,
+ string &parameter_update,
+ double adagrad_epsilon)
{
- input_layer.initialize(init_engine,
- init_normal,
- init_range,
- parameter_update,
- adagrad_epsilon);
- output_layer.initialize(init_engine,
- init_normal,
- init_range,
- init_bias,
- parameter_update,
- adagrad_epsilon);
- first_hidden_linear.initialize(init_engine,
- init_normal,
- init_range,
- parameter_update,
- adagrad_epsilon);
- second_hidden_linear.initialize(init_engine,
- init_normal,
- init_range,
- parameter_update,
- adagrad_epsilon);
+ input_layer.initialize(init_engine,
+ init_normal,
+ init_range,
+ parameter_update,
+ adagrad_epsilon);
+ output_layer.initialize(init_engine,
+ init_normal,
+ init_range,
+ init_bias,
+ parameter_update,
+ adagrad_epsilon);
+ first_hidden_linear.initialize(init_engine,
+ init_normal,
+ init_range,
+ parameter_update,
+ adagrad_epsilon);
+ second_hidden_linear.initialize(init_engine,
+ init_normal,
+ init_range,
+ parameter_update,
+ adagrad_epsilon);
}
void model::premultiply()
{
- // Since input and first_hidden_linear are both linear,
- // we can multiply them into a single linear layer *if* we are not training
- int context_size = ngram_size-1;
- Matrix<double,Dynamic,Dynamic> U = first_hidden_linear.U;
- if (num_hidden == 0)
- {
- first_hidden_linear.U.resize(output_embedding_dimension, input_vocab_size * context_size);
- }
- else
- {
- first_hidden_linear.U.resize(num_hidden, input_vocab_size * context_size);
- }
- for (int i=0; i<context_size; i++)
- first_hidden_linear.U.middleCols(i*input_vocab_size, input_vocab_size) = U.middleCols(i*input_embedding_dimension, input_embedding_dimension) * input_layer.W->transpose();
- input_layer.W->resize(1,1); // try to save some memory
- premultiplied = true;
+ // Since input and first_hidden_linear are both linear,
+ // we can multiply them into a single linear layer *if* we are not training
+ int context_size = ngram_size-1;
+ Matrix<double,Dynamic,Dynamic> U = first_hidden_linear.U;
+ if (num_hidden == 0)
+ {
+ first_hidden_linear.U.resize(output_embedding_dimension, input_vocab_size * context_size);
+ }
+ else
+ {
+ first_hidden_linear.U.resize(num_hidden, input_vocab_size * context_size);
+ }
+ for (int i=0; i<context_size; i++)
+ first_hidden_linear.U.middleCols(i*input_vocab_size, input_vocab_size) = U.middleCols(i*input_embedding_dimension, input_embedding_dimension) * input_layer.W->transpose();
+ input_layer.W->resize(1,1); // try to save some memory
+ premultiplied = true;
}
void model::readConfig(ifstream &config_file)
{
- string line;
- vector<string> fields;
- int ngram_size, vocab_size, input_embedding_dimension, num_hidden, output_embedding_dimension;
- activation_function_type activation_function = this->activation_function;
- while (getline(config_file, line) && line != "")
+ string line;
+ vector<string> fields;
+ int ngram_size, vocab_size, input_embedding_dimension, num_hidden, output_embedding_dimension;
+ activation_function_type activation_function = this->activation_function;
+ while (getline(config_file, line) && line != "")
+ {
+ splitBySpace(line, fields);
+ if (fields[0] == "ngram_size")
+ ngram_size = lexical_cast<int>(fields[1]);
+ else if (fields[0] == "vocab_size")
+ input_vocab_size = output_vocab_size = lexical_cast<int>(fields[1]);
+ else if (fields[0] == "input_vocab_size")
+ input_vocab_size = lexical_cast<int>(fields[1]);
+ else if (fields[0] == "output_vocab_size")
+ output_vocab_size = lexical_cast<int>(fields[1]);
+ else if (fields[0] == "input_embedding_dimension")
+ input_embedding_dimension = lexical_cast<int>(fields[1]);
+ else if (fields[0] == "num_hidden")
+ num_hidden = lexical_cast<int>(fields[1]);
+ else if (fields[0] == "output_embedding_dimension")
+ output_embedding_dimension = lexical_cast<int>(fields[1]);
+ else if (fields[0] == "activation_function")
+ activation_function = string_to_activation_function(fields[1]);
+ else if (fields[0] == "version")
{
- splitBySpace(line, fields);
- if (fields[0] == "ngram_size")
- ngram_size = lexical_cast<int>(fields[1]);
- else if (fields[0] == "vocab_size")
- input_vocab_size = output_vocab_size = lexical_cast<int>(fields[1]);
- else if (fields[0] == "input_vocab_size")
- input_vocab_size = lexical_cast<int>(fields[1]);
- else if (fields[0] == "output_vocab_size")
- output_vocab_size = lexical_cast<int>(fields[1]);
- else if (fields[0] == "input_embedding_dimension")
- input_embedding_dimension = lexical_cast<int>(fields[1]);
- else if (fields[0] == "num_hidden")
- num_hidden = lexical_cast<int>(fields[1]);
- else if (fields[0] == "output_embedding_dimension")
- output_embedding_dimension = lexical_cast<int>(fields[1]);
- else if (fields[0] == "activation_function")
- activation_function = string_to_activation_function(fields[1]);
- else if (fields[0] == "version")
- {
- int version = lexical_cast<int>(fields[1]);
- if (version != 1)
- {
- cerr << "error: file format mismatch (expected 1, found " << version << ")" << endl;
- exit(1);
- }
- }
- else
- cerr << "warning: unrecognized field in config: " << fields[0] << endl;
+ int version = lexical_cast<int>(fields[1]);
+ if (version != 1)
+ {
+ cerr << "error: file format mismatch (expected 1, found " << version << ")" << endl;
+ exit(1);
+ }
}
- resize(ngram_size,
- input_vocab_size,
- output_vocab_size,
- input_embedding_dimension,
- num_hidden,
- output_embedding_dimension);
- set_activation_function(activation_function);
+ else
+ cerr << "warning: unrecognized field in config: " << fields[0] << endl;
+ }
+ resize(ngram_size,
+ input_vocab_size,
+ output_vocab_size,
+ input_embedding_dimension,
+ num_hidden,
+ output_embedding_dimension);
+ set_activation_function(activation_function);
}
void model::readConfig(const string &filename)
{
- ifstream config_file(filename.c_str());
- if (!config_file)
- {
- cerr << "error: could not open config file " << filename << endl;
- exit(1);
- }
- readConfig(config_file);
- config_file.close();
+ ifstream config_file(filename.c_str());
+ if (!config_file)
+ {
+ cerr << "error: could not open config file " << filename << endl;
+ exit(1);
+ }
+ readConfig(config_file);
+ config_file.close();
}
-
+
void model::read(const string &filename)
{
- vector<string> input_words;
- vector<string> output_words;
- read(filename, input_words, output_words);
+ vector<string> input_words;
+ vector<string> output_words;
+ read(filename, input_words, output_words);
}
void model::read(const string &filename, vector<string> &words)
{
- vector<string> output_words;
- read(filename, words, output_words);
+ vector<string> output_words;
+ read(filename, words, output_words);
}
void model::read(const string &filename, vector<string> &input_words, vector<string> &output_words)
{
- ifstream file(filename.c_str());
- if (!file) throw runtime_error("Could not open file " + filename);
-
- param myParam;
- string line;
-
- while (getline(file, line))
+ ifstream file(filename.c_str());
+ if (!file) throw runtime_error("Could not open file " + filename);
+
+ param myParam;
+ string line;
+
+ while (getline(file, line))
+ {
+ if (line == "\\config")
+ {
+ readConfig(file);
+ }
+
+ else if (line == "\\vocab")
+ {
+ input_words.clear();
+ readWordsFile(file, input_words);
+ output_words = input_words;
+ }
+
+ else if (line == "\\input_vocab")
{
- if (line == "\\config")
- {
- readConfig(file);
- }
-
- else if (line == "\\vocab")
- {
- input_words.clear();
- readWordsFile(file, input_words);
- output_words = input_words;
- }
-
- else if (line == "\\input_vocab")
- {
- input_words.clear();
- readWordsFile(file, input_words);
- }
-
- else if (line == "\\output_vocab")
- {
- output_words.clear();
- readWordsFile(file, output_words);
- }
-
- else if (line == "\\input_embeddings")
- input_layer.read(file);
- else if (line == "\\hidden_weights 1")
- first_hidden_linear.read_weights(file);
- else if (line == "\\hidden_biases 1")
- first_hidden_linear.read_biases (file);
- else if (line == "\\hidden_weights 2")
- second_hidden_linear.read_weights(file);
- else if (line == "\\hidden_biases 2")
- second_hidden_linear.read_biases (file);
- else if (line == "\\output_weights")
- output_layer.read_weights(file);
- else if (line == "\\output_biases")
- output_layer.read_biases(file);
- else if (line == "\\end")
- break;
- else if (line == "")
- continue;
- else
- {
- cerr << "warning: unrecognized section: " << line << endl;
- // skip over section
- while (getline(file, line) && line != "") { }
- }
+ input_words.clear();
+ readWordsFile(file, input_words);
}
- file.close();
+
+ else if (line == "\\output_vocab")
+ {
+ output_words.clear();
+ readWordsFile(file, output_words);
+ }
+
+ else if (line == "\\input_embeddings")
+ input_layer.read(file);
+ else if (line == "\\hidden_weights 1")
+ first_hidden_linear.read_weights(file);
+ else if (line == "\\hidden_biases 1")
+ first_hidden_linear.read_biases (file);
+ else if (line == "\\hidden_weights 2")
+ second_hidden_linear.read_weights(file);
+ else if (line == "\\hidden_biases 2")
+ second_hidden_linear.read_biases (file);
+ else if (line == "\\output_weights")
+ output_layer.read_weights(file);
+ else if (line == "\\output_biases")
+ output_layer.read_biases(file);
+ else if (line == "\\end")
+ break;
+ else if (line == "")
+ continue;
+ else
+ {
+ cerr << "warning: unrecognized section: " << line << endl;
+ // skip over section
+ while (getline(file, line) && line != "") { }
+ }
+ }
+ file.close();
}
void model::write(const string &filename, const vector<string> &input_words, const vector<string> &output_words)
-{
- write(filename, &input_words, &output_words);
+{
+ write(filename, &input_words, &output_words);
}
void model::write(const string &filename, const vector<string> &words)
-{
- write(filename, &words, NULL);
+{
+ write(filename, &words, NULL);
}
-void model::write(const string &filename)
-{
- write(filename, NULL, NULL);
+void model::write(const string &filename)
+{
+ write(filename, NULL, NULL);
}
void model::write(const string &filename, const vector<string> *input_pwords, const vector<string> *output_pwords)
{
- ofstream file(filename.c_str());
- if (!file) throw runtime_error("Could not open file " + filename);
-
- file << "\\config" << endl;
- file << "version 1" << endl;
- file << "ngram_size " << ngram_size << endl;
- file << "input_vocab_size " << input_vocab_size << endl;
- file << "output_vocab_size " << output_vocab_size << endl;
- file << "input_embedding_dimension " << input_embedding_dimension << endl;
- file << "num_hidden " << num_hidden << endl;
- file << "output_embedding_dimension " << output_embedding_dimension << endl;
- file << "activation_function " << activation_function_to_string(activation_function) << endl;
- file << endl;
-
- if (input_pwords)
- {
- file << "\\input_vocab" << endl;
- writeWordsFile(*input_pwords, file);
- file << endl;
- }
+ ofstream file(filename.c_str());
+ if (!file) throw runtime_error("Could not open file " + filename);
- if (output_pwords)
- {
- file << "\\output_vocab" << endl;
- writeWordsFile(*output_pwords, file);
- file << endl;
- }
+ file << "\\config" << endl;
+ file << "version 1" << endl;
+ file << "ngram_size " << ngram_size << endl;
+ file << "input_vocab_size " << input_vocab_size << endl;
+ file << "output_vocab_size " << output_vocab_size << endl;
+ file << "input_embedding_dimension " << input_embedding_dimension << endl;
+ file << "num_hidden " << num_hidden << endl;
+ file << "output_embedding_dimension " << output_embedding_dimension << endl;
+ file << "activation_function " << activation_function_to_string(activation_function) << endl;
+ file << endl;
- file << "\\input_embeddings" << endl;
- input_layer.write(file);
- file << endl;
-
- file << "\\hidden_weights 1" << endl;
- first_hidden_linear.write_weights(file);
+ if (input_pwords)
+ {
+ file << "\\input_vocab" << endl;
+ writeWordsFile(*input_pwords, file);
file << endl;
+ }
- file << "\\hidden_biases 1" << endl;
- first_hidden_linear.write_biases(file);
- file <<endl;
-
- file << "\\hidden_weights 2" << endl;
- second_hidden_linear.write_weights(file);
+ if (output_pwords)
+ {
+ file << "\\output_vocab" << endl;
+ writeWordsFile(*output_pwords, file);
file << endl;
+ }
- file << "\\hidden_biases 2" << endl;
- second_hidden_linear.write_biases(file);
- file << endl;
-
- file << "\\output_weights" << endl;
- output_layer.write_weights(file);
- file << endl;
-
- file << "\\output_biases" << endl;
- output_layer.write_biases(file);
- file << endl;
-
- file << "\\end" << endl;
- file.close();
+ file << "\\input_embeddings" << endl;
+ input_layer.write(file);
+ file << endl;
+
+ file << "\\hidden_weights 1" << endl;
+ first_hidden_linear.write_weights(file);
+ file << endl;
+
+ file << "\\hidden_biases 1" << endl;
+ first_hidden_linear.write_biases(file);
+ file <<endl;
+
+ file << "\\hidden_weights 2" << endl;
+ second_hidden_linear.write_weights(file);
+ file << endl;
+
+ file << "\\hidden_biases 2" << endl;
+ second_hidden_linear.write_biases(file);
+ file << endl;
+
+ file << "\\output_weights" << endl;
+ output_layer.write_weights(file);
+ file << endl;
+
+ file << "\\output_biases" << endl;
+ output_layer.write_biases(file);
+ file << endl;
+
+ file << "\\end" << endl;
+ file.close();
}
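
The model::premultiply() routine in the diff above folds the input word-embedding lookup and the first hidden linear layer into a single matrix, as its comment notes: both are linear, so the folding is valid as long as the model is no longer being trained. A minimal standalone sketch of that folding, assuming Eigen 3 and toy sizes (all names, dimensions, and the check in main() are illustrative, not taken from the repository):

    #include <iostream>
    #include <Eigen/Dense>

    using Eigen::MatrixXd;

    // Toy illustration of the folding done in model::premultiply():
    // the (linear) embedding lookup and the first hidden linear layer
    // collapse into one matrix whose columns are indexed directly by
    // (context position, word id).
    int main()
    {
        const int vocab = 5, dim = 3, hidden = 4, context = 2;

        MatrixXd W = MatrixXd::Random(vocab, dim);             // input embeddings (vocab x dim)
        MatrixXd U = MatrixXd::Random(hidden, dim * context);  // first hidden linear layer

        // Fold: one column of U_pre per (position, word) pair.
        MatrixXd U_pre(hidden, vocab * context);
        for (int i = 0; i < context; ++i)
            U_pre.middleCols(i * vocab, vocab) =
                U.middleCols(i * dim, dim) * W.transpose();

        // Check on one context (word 2 at position 0, word 4 at position 1):
        // folded column lookup vs. embed-then-multiply.
        Eigen::VectorXd x(dim * context);
        x << W.row(2).transpose(), W.row(4).transpose();
        Eigen::VectorXd h_unfolded = U * x;
        Eigen::VectorXd h_folded   = U_pre.col(0 * vocab + 2) + U_pre.col(1 * vocab + 4);
        std::cout << (h_unfolded - h_folded).norm() << std::endl;  // ~0
        return 0;
    }

The final check compares the folded per-word column lookup against embed-then-multiply for one context; the two hidden inputs agree up to floating-point error, which is why premultiply() can afford to shrink input_layer.W afterwards to save memory.
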
diff --git a/src/neuralClasses.h b/src/neuralClasses.h
index ee7c3f0..7c86694 100644
--- a/src/neuralClasses.h
+++ b/src/neuralClasses.h
@@ -43,36 +43,36 @@ struct Clipper{
class Linear_layer
{
- private:
- Matrix<double,Dynamic,Dynamic> U;
- Matrix<double,Dynamic,Dynamic> U_gradient;
- Matrix<double,Dynamic,Dynamic> U_velocity;
- Matrix<double,Dynamic,Dynamic> U_running_gradient;
- Matrix<double,Dynamic,Dynamic> U_running_parameter_update;
- // Biases
- Matrix<double,Dynamic,1> b;
- Matrix<double,Dynamic,1> b_velocity;
- Matrix<double,Dynamic,1> b_running_gradient;
- Matrix<double,Dynamic,1> b_running_parameter_update;
- Matrix<double,Dynamic,1> b_gradient;
-
- friend class model;
-
- public:
+ private:
+ Matrix<double,Dynamic,Dynamic> U;
+ Matrix<double,Dynamic,Dynamic> U_gradient;
+ Matrix<double,Dynamic,Dynamic> U_velocity;
+ Matrix<double,Dynamic,Dynamic> U_running_gradient;
+ Matrix<double,Dynamic,Dynamic> U_running_parameter_update;
+ // Biases
+ Matrix<double,Dynamic,1> b;
+ Matrix<double,Dynamic,1> b_velocity;
+ Matrix<double,Dynamic,1> b_running_gradient;
+ Matrix<double,Dynamic,1> b_running_parameter_update;
+ Matrix<double,Dynamic,1> b_gradient;
+
+ friend class model;
+
+ public:
Linear_layer() { }
- Linear_layer(int rows, int cols) { resize(rows, cols); }
+ Linear_layer(int rows, int cols) { resize(rows, cols); }
void resize(int rows, int cols)
{
- U.setZero(rows, cols);
- U_gradient.setZero(rows, cols);
- //U_running_gradient.setZero(rows, cols);
- //U_running_parameter_updates.setZero(rows, cols);
- //U_velocity.setZero(rows, cols);
- b.resize(rows);
- b_gradient.setZero(rows);
- //b_running_gradient.resize(rows);
- //b_velocity.resize(rows);
+ U.setZero(rows, cols);
+ U_gradient.setZero(rows, cols);
+ //U_running_gradient.setZero(rows, cols);
+ //U_running_parameter_updates.setZero(rows, cols);
+ //U_velocity.setZero(rows, cols);
+ b.resize(rows);
+ b_gradient.setZero(rows);
+ //b_running_gradient.resize(rows);
+ //b_velocity.resize(rows);
}
void read_weights(std::ifstream &U_file) { readMatrix(U_file, U); }
@@ -83,24 +83,24 @@ class Linear_layer
template <typename Engine>
void initialize(Engine &engine,
- bool init_normal,
- double init_range,
- string &parameter_update,
- double adagrad_epsilon)
- {
- if (parameter_update == "ADA") {
- U_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(U.rows(),U.cols())*adagrad_epsilon;
- b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon;
- }
- if (parameter_update == "ADAD") {
- U_running_gradient.setZero(U.rows(),U.cols());
- b_running_gradient.setZero(b.size());
- U_running_parameter_update.setZero(U.rows(),U.cols());
- b_running_parameter_update.setZero(b.size());
- }
+ bool init_normal,
+ double init_range,
+ string &parameter_update,
+ double adagrad_epsilon)
+ {
+ if (parameter_update == "ADA") {
+ U_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(U.rows(),U.cols())*adagrad_epsilon;
+ b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon;
+ }
+ if (parameter_update == "ADAD") {
+ U_running_gradient.setZero(U.rows(),U.cols());
+ b_running_gradient.setZero(b.size());
+ U_running_parameter_update.setZero(U.rows(),U.cols());
+ b_running_parameter_update.setZero(b.size());
+ }
- initMatrix(engine, U, init_normal, init_range);
- initBias(engine, b, init_normal, init_range);
+ initMatrix(engine, U, init_normal, init_range);
+ initBias(engine, b, init_normal, init_range);
}
int n_inputs () const { return U.cols(); }
@@ -108,655 +108,655 @@ class Linear_layer
template <typename DerivedIn, typename DerivedOut>
void fProp(const MatrixBase<DerivedIn> &input,
- const MatrixBase<DerivedOut> &output) const
+ const MatrixBase<DerivedOut> &output) const
{
- UNCONST(DerivedOut, output, my_output);
- my_output.leftCols(input.cols()).noalias() = U*input;
- int num_examples = input.cols();
- for (int example = 0;example < num_examples;example++)
- {
- my_output.leftCols(input.cols()).col(example) += b;
- }
+ UNCONST(DerivedOut, output, my_output);
+ my_output.leftCols(input.cols()).noalias() = U*input;
+ int num_examples = input.cols();
+ for (int example = 0;example < num_examples;example++)
+ {
+ my_output.leftCols(input.cols()).col(example) += b;
+ }
}
// Sparse input
template <typename ScalarIn, typename DerivedOut>
void fProp(const USCMatrix<ScalarIn> &input,
- const MatrixBase<DerivedOut> &output_const) const
- {
- UNCONST(DerivedOut, output_const, output);
- output.setZero();
- uscgemm(1.0, U, input, output.leftCols(input.cols()));
- // Each column corresponds to a training example. We
- // parallelize the adding of biases per dimension.
- int num_examples = input.cols();
- for (int example = 0;example < num_examples;example++)
- {
- output.leftCols(input.cols()).col(example) += b;
- }
+ const MatrixBase<DerivedOut> &output_const) const
+ {
+ UNCONST(DerivedOut, output_const, output);
+ output.setZero();
+ uscgemm(1.0, U, input, output.leftCols(input.cols()));
+ // Each column corresponds to a training example. We
+ // parallelize the adding of biases per dimension.
+ int num_examples = input.cols();
+ for (int example = 0;example < num_examples;example++)
+ {
+ output.leftCols(input.cols()).col(example) += b;
+ }
}
template <typename DerivedGOut, typename DerivedGIn>
void bProp(const MatrixBase<DerivedGOut> &input,
- MatrixBase<DerivedGIn> &output) const
+ MatrixBase<DerivedGIn> &output) const
{
- UNCONST(DerivedGIn, output, my_output);
- my_output.noalias() = U.transpose()*input;
+ UNCONST(DerivedGIn, output, my_output);
+ my_output.noalias() = U.transpose()*input;
}
template <typename DerivedGOut, typename DerivedIn>
void computeGradient( const MatrixBase<DerivedGOut> &bProp_input,
- const MatrixBase<DerivedIn> &fProp_input,
- double learning_rate, double momentum, double L2_reg)
+ const MatrixBase<DerivedIn> &fProp_input,
+ double learning_rate, double momentum, double L2_reg)
{
- U_gradient.noalias() = bProp_input*fProp_input.transpose();
+ U_gradient.noalias() = bProp_input*fProp_input.transpose();
- // get the bias gradient for all dimensions in parallel
- int size = b.size();
- b_gradient = bProp_input.rowwise().sum();
- // This used to be multithreaded, but there was no measureable difference
- if (L2_reg > 0.0)
- {
- U_gradient -= 2*L2_reg*U;
- b_gradient -= 2*L2_reg*b;
- }
- if (momentum > 0.0)
- {
- U_velocity = momentum*U_velocity + U_gradient;
- U += learning_rate * U_velocity;
- b_velocity = momentum*b_velocity + b_gradient;
- b += learning_rate * b_velocity;
- }
- else
- {
- U += learning_rate * U_gradient;
- b += learning_rate * b_gradient;
- /*
- //UPDATE CLIPPING
- U += (learning_rate*U_gradient).array().unaryExpr(Clipper()).matrix();
- b += (learning_rate*b_gradient).array().unaryExpr(Clipper()).matrix();
- //GRADIENT CLIPPING
- //U += learning_rate*(U_gradient.array().unaryExpr(Clipper())).matrix();
- //b += learning_rate*(b_gradient.array().unaryExpr(Clipper())).matrix();
- */
- }
+ // get the bias gradient for all dimensions in parallel
+ int size = b.size();
+ b_gradient = bProp_input.rowwise().sum();
+ // This used to be multithreaded, but there was no measureable difference
+ if (L2_reg > 0.0)
+ {
+ U_gradient -= 2*L2_reg*U;
+ b_gradient -= 2*L2_reg*b;
+ }
+ if (momentum > 0.0)
+ {
+ U_velocity = momentum*U_velocity + U_gradient;
+ U += learning_rate * U_velocity;
+ b_velocity = momentum*b_velocity + b_gradient;
+ b += learning_rate * b_velocity;
+ }
+ else
+ {
+ U += learning_rate * U_gradient;
+ b += learning_rate * b_gradient;
+ /*
+ //UPDATE CLIPPING
+ U += (learning_rate*U_gradient).array().unaryExpr(Clipper()).matrix();
+ b += (learning_rate*b_gradient).array().unaryExpr(Clipper()).matrix();
+ //GRADIENT CLIPPING
+ //U += learning_rate*(U_gradient.array().unaryExpr(Clipper())).matrix();
+ //b += learning_rate*(b_gradient.array().unaryExpr(Clipper())).matrix();
+ */
+ }
}
template <typename DerivedGOut, typename DerivedIn>
void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input,
- const MatrixBase<DerivedIn> &fProp_input,
- double learning_rate,
- double L2_reg)
+ const MatrixBase<DerivedIn> &fProp_input,
+ double learning_rate,
+ double L2_reg)
{
- U_gradient.noalias() = bProp_input*fProp_input.transpose();
+ U_gradient.noalias() = bProp_input*fProp_input.transpose();
- // get the bias gradient for all dimensions in parallel
- int size = b.size();
- b_gradient.noalias() = bProp_input.rowwise().sum();
+ // get the bias gradient for all dimensions in parallel
+ int size = b.size();
+ b_gradient.noalias() = bProp_input.rowwise().sum();
- if (L2_reg != 0)
- {
- U_gradient -= 2*L2_reg*U;
- b_gradient -= 2*L2_reg*b;
- }
+ if (L2_reg != 0)
+ {
+ U_gradient -= 2*L2_reg*U;
+ b_gradient -= 2*L2_reg*b;
+ }
- // ignore momentum?
- #pragma omp parallel for
- for (int col=0; col<U.cols(); col++) {
- U_running_gradient.col(col) += U_gradient.col(col).array().square().matrix();
- U.col(col) += learning_rate * (U_gradient.col(col).array() /
- U_running_gradient.col(col).array().sqrt()).matrix();
- /*
- //UPDATE CLIPPING
- U.col(col) += (learning_rate * (U_gradient.col(col).array() / U_running_gradient.col(col).array().sqrt())).
- unaryExpr(Clipper()).matrix();
- */
- }
- b_running_gradient += b_gradient.array().square().matrix();
- b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix();
+ // ignore momentum?
+#pragma omp parallel for
+ for (int col=0; col<U.cols(); col++) {
+ U_running_gradient.col(col) += U_gradient.col(col).array().square().matrix();
+ U.col(col) += learning_rate * (U_gradient.col(col).array() /
+ U_running_gradient.col(col).array().sqrt()).matrix();
/*
//UPDATE CLIPPING
- b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix();
+ U.col(col) += (learning_rate * (U_gradient.col(col).array() / U_running_gradient.col(col).array().sqrt())).
+ unaryExpr(Clipper()).matrix();
*/
+ }
+ b_running_gradient += b_gradient.array().square().matrix();
+ b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix();
+ /*
+ //UPDATE CLIPPING
+ b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix();
+ */
}
template <typename DerivedGOut, typename DerivedIn>
void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input,
- const MatrixBase<DerivedIn> &fProp_input,
- double learning_rate,
- double L2_reg,
- double conditioning_constant,
- double decay)
+ const MatrixBase<DerivedIn> &fProp_input,
+ double learning_rate,
+ double L2_reg,
+ double conditioning_constant,
+ double decay)
{
- //cerr<<"decay is "<<decay<<" and conditioning constant is "<<conditioning_constant<<endl;
- U_gradient.noalias() = bProp_input*fProp_input.transpose();
+ //cerr<<"decay is "<<decay<<" and conditioning constant is "<<conditioning_constant<<endl;
+ U_gradient.noalias() = bProp_input*fProp_input.transpose();
- Array<double,Dynamic,1> b_current_parameter_update;
+ Array<double,Dynamic,1> b_current_parameter_update;
- // get the bias gradient for all dimensions in parallel
- int size = b.size();
- b_gradient.noalias() = bProp_input.rowwise().sum();
+ // get the bias gradient for all dimensions in parallel
+ int size = b.size();
+ b_gradient.noalias() = bProp_input.rowwise().sum();
- if (L2_reg != 0)
- {
- U_gradient -= 2*L2_reg*U;
- b_gradient -= 2*L2_reg*b;
- }
+ if (L2_reg != 0)
+ {
+ U_gradient -= 2*L2_reg*U;
+ b_gradient -= 2*L2_reg*b;
+ }
- // ignore momentum?
- #pragma omp parallel for
- //cerr<<"U gradient is "<<U_gradient<<endl;
- for (int col=0; col<U.cols(); col++) {
- Array<double,Dynamic,1> U_current_parameter_update;
- U_running_gradient.col(col) = decay*U_running_gradient.col(col) +
- (1-decay)*U_gradient.col(col).array().square().matrix();
- //cerr<<"U running gradient is "<<U_running_gradient.col(col)<<endl;
- //getchar();
- U_current_parameter_update = ((U_running_parameter_update.col(col).array()+conditioning_constant).sqrt()/
- (U_running_gradient.col(col).array()+conditioning_constant).sqrt()) *
- U_gradient.col(col).array();
- //cerr<<"U current parameter update is "<<U_current_parameter_update<<endl;
- //getchar();
- //update the running parameter update
- U_running_parameter_update.col(col) = decay*U_running_parameter_update.col(col) +
- (1.-decay)*U_current_parameter_update.square().matrix();
- U.col(col) += learning_rate*U_current_parameter_update.matrix();
- }
- b_running_gradient = decay*b_running_gradient +
- (1.-decay)*b_gradient.array().square().matrix();
- b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/
- (b_running_gradient.array()+conditioning_constant).sqrt()) *
- b_gradient.array();
- b_running_parameter_update = decay*(b_running_parameter_update) +
- (1.-decay)*b_current_parameter_update.square().matrix();
- b += learning_rate*b_current_parameter_update.matrix();
+ // ignore momentum?
+#pragma omp parallel for
+ //cerr<<"U gradient is "<<U_gradient<<endl;
+ for (int col=0; col<U.cols(); col++) {
+ Array<double,Dynamic,1> U_current_parameter_update;
+ U_running_gradient.col(col) = decay*U_running_gradient.col(col) +
+ (1-decay)*U_gradient.col(col).array().square().matrix();
+ //cerr<<"U running gradient is "<<U_running_gradient.col(col)<<endl;
+ //getchar();
+ U_current_parameter_update = ((U_running_parameter_update.col(col).array()+conditioning_constant).sqrt()/
+ (U_running_gradient.col(col).array()+conditioning_constant).sqrt()) *
+ U_gradient.col(col).array();
+ //cerr<<"U current parameter update is "<<U_current_parameter_update<<endl;
+ //getchar();
+ //update the running parameter update
+ U_running_parameter_update.col(col) = decay*U_running_parameter_update.col(col) +
+ (1.-decay)*U_current_parameter_update.square().matrix();
+ U.col(col) += learning_rate*U_current_parameter_update.matrix();
+ }
+ b_running_gradient = decay*b_running_gradient +
+ (1.-decay)*b_gradient.array().square().matrix();
+ b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/
+ (b_running_gradient.array()+conditioning_constant).sqrt()) *
+ b_gradient.array();
+ b_running_parameter_update = decay*(b_running_parameter_update) +
+ (1.-decay)*b_current_parameter_update.square().matrix();
+ b += learning_rate*b_current_parameter_update.matrix();
}
template <typename DerivedGOut, typename DerivedIn, typename DerivedGW>
void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input,
- const MatrixBase<DerivedIn> &fProp_input,
- const MatrixBase<DerivedGW> &gradient) const
+ const MatrixBase<DerivedIn> &fProp_input,
+ const MatrixBase<DerivedGW> &gradient) const
{
- UNCONST(DerivedGW, gradient, my_gradient);
- my_gradient.noalias() = bProp_input*fProp_input.transpose();
+ UNCONST(DerivedGW, gradient, my_gradient);
+ my_gradient.noalias() = bProp_input*fProp_input.transpose();
}
};
class Output_word_embeddings
{
- private:
- // row-major is better for uscgemm
- //Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W;
- // Having W be a pointer to a matrix allows ease of sharing
- // input and output word embeddings
- Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W;
- std::vector<double> W_data;
- Matrix<double,Dynamic,1> b;
- Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient;
- Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient;
- Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update;
- Matrix<double,Dynamic,1> b_running_gradient;
- Matrix<double,Dynamic,1> b_gradient;
- Matrix<double,Dynamic,1> b_running_parameter_update;
-
- public:
- Output_word_embeddings() { }
- Output_word_embeddings(int rows, int cols) { resize(rows, cols); }
-
- void resize(int rows, int cols)
- {
- W->setZero(rows, cols);
- b.setZero(rows);
- }
- void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) {
- W = input_W;
- }
- void read_weights(std::ifstream &W_file) { readMatrix(W_file, *W); }
- void write_weights(std::ofstream &W_file) { writeMatrix(*W, W_file); }
- void read_biases(std::ifstream &b_file) { readMatrix(b_file, b); }
- void write_biases(std::ofstream &b_file) { writeMatrix(b, b_file); }
-
- template <typename Engine>
- void initialize(Engine &engine,
- bool init_normal,
- double init_range,
- double init_bias,
- string &parameter_update,
- double adagrad_epsilon)
- {
+ private:
+ // row-major is better for uscgemm
+ //Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W;
+ // Having W be a pointer to a matrix allows ease of sharing
+ // input and output word embeddings
+ Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W;
+ std::vector<double> W_data;
+ Matrix<double,Dynamic,1> b;
+ Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient;
+ Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient;
+ Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update;
+ Matrix<double,Dynamic,1> b_running_gradient;
+ Matrix<double,Dynamic,1> b_gradient;
+ Matrix<double,Dynamic,1> b_running_parameter_update;
+
+ public:
+ Output_word_embeddings() { }
+ Output_word_embeddings(int rows, int cols) { resize(rows, cols); }
- W_gradient.setZero(W->rows(),W->cols());
- b_gradient.setZero(b.size());
- if (parameter_update == "ADA") {
- W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon;
- b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon;
- //W_gradient.setZero(W->rows(),W->cols());
- //b_gradient.setZero(b.size());
- }
- if (parameter_update == "ADAD") {
- W_running_gradient.setZero(W->rows(),W->cols());
- b_running_gradient.setZero(b.size());
- W_gradient.setZero(W->rows(),W->cols());
- //b_gradient.setZero(b.size());
- //W_running_parameter_update.setZero(W->rows(),W->cols());
- b_running_parameter_update.setZero(b.size());
- }
-
- initMatrix(engine, *W, init_normal, init_range);
- b.fill(init_bias);
- }
-
- int n_inputs () const { return W->cols(); }
- int n_outputs () const { return W->rows(); }
-
- template <typename DerivedIn, typename DerivedOut>
- void fProp(const MatrixBase<DerivedIn> &input,
- const MatrixBase<DerivedOut> &output) const
- {
- UNCONST(DerivedOut, output, my_output);
- my_output = ((*W) * input).colwise() + b;
+ void resize(int rows, int cols)
+ {
+ W->setZero(rows, cols);
+ b.setZero(rows);
+ }
+ void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) {
+ W = input_W;
+ }
+ void read_weights(std::ifstream &W_file) { readMatrix(W_file, *W); }
+ void write_weights(std::ofstream &W_file) { writeMatrix(*W, W_file); }
+ void read_biases(std::ifstream &b_file) { readMatrix(b_file, b); }
+ void write_biases(std::ofstream &b_file) { writeMatrix(b, b_file); }
+
+ template <typename Engine>
+ void initialize(Engine &engine,
+ bool init_normal,
+ double init_range,
+ double init_bias,
+ string &parameter_update,
+ double adagrad_epsilon)
+ {
+
+ W_gradient.setZero(W->rows(),W->cols());
+ b_gradient.setZero(b.size());
+ if (parameter_update == "ADA") {
+ W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon;
+ b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon;
+ //W_gradient.setZero(W->rows(),W->cols());
+ //b_gradient.setZero(b.size());
+ }
+ if (parameter_update == "ADAD") {
+ W_running_gradient.setZero(W->rows(),W->cols());
+ b_running_gradient.setZero(b.size());
+ W_gradient.setZero(W->rows(),W->cols());
+ //b_gradient.setZero(b.size());
+ //W_running_parameter_update.setZero(W->rows(),W->cols());
+ b_running_parameter_update.setZero(b.size());
}
+ initMatrix(engine, *W, init_normal, init_range);
+ b.fill(init_bias);
+ }
+
+ int n_inputs () const { return W->cols(); }
+ int n_outputs () const { return W->rows(); }
+
+ template <typename DerivedIn, typename DerivedOut>
+ void fProp(const MatrixBase<DerivedIn> &input,
+ const MatrixBase<DerivedOut> &output) const
+ {
+ UNCONST(DerivedOut, output, my_output);
+ my_output = ((*W) * input).colwise() + b;
+ }
+
// Sparse output version
- template <typename DerivedIn, typename DerivedOutI, typename DerivedOutV>
- void fProp(const MatrixBase<DerivedIn> &input,
- const MatrixBase<DerivedOutI> &samples,
- const MatrixBase<DerivedOutV> &output) const
- {
- UNCONST(DerivedOutV, output, my_output);
- #pragma omp parallel for
- for (int instance_id = 0; instance_id < samples.cols(); instance_id++)
- {
- for (int sample_id = 0; sample_id < samples.rows(); sample_id++)
- {
- my_output(sample_id, instance_id) = b(samples(sample_id, instance_id));
- }
- }
- USCMatrix<double> sparse_output(W->rows(), samples, my_output);
- uscgemm_masked(1.0, *W, input, sparse_output);
- my_output = sparse_output.values; // too bad, so much copying
- }
-
- // Return single element of output matrix
- template <typename DerivedIn>
- double fProp(const MatrixBase<DerivedIn> &input,
- int word,
- int instance) const
+ template <typename DerivedIn, typename DerivedOutI, typename DerivedOutV>
+ void fProp(const MatrixBase<DerivedIn> &input,
+ const MatrixBase<DerivedOutI> &samples,
+ const MatrixBase<DerivedOutV> &output) const
+ {
+ UNCONST(DerivedOutV, output, my_output);
+#pragma omp parallel for
+ for (int instance_id = 0; instance_id < samples.cols(); instance_id++)
{
- return W->row(word).dot(input.col(instance)) + b(word);
+ for (int sample_id = 0; sample_id < samples.rows(); sample_id++)
+ {
+ my_output(sample_id, instance_id) = b(samples(sample_id, instance_id));
+ }
}
+ USCMatrix<double> sparse_output(W->rows(), samples, my_output);
+ uscgemm_masked(1.0, *W, input, sparse_output);
+ my_output = sparse_output.values; // too bad, so much copying
+ }
- // Dense versions (for log-likelihood loss)
+ // Return single element of output matrix
+ template <typename DerivedIn>
+ double fProp(const MatrixBase<DerivedIn> &input,
+ int word,
+ int instance) const
+ {
+ return W->row(word).dot(input.col(instance)) + b(word);
+ }
- template <typename DerivedGOut, typename DerivedGIn>
- void bProp(const MatrixBase<DerivedGOut> &input_bProp_matrix,
- const MatrixBase<DerivedGIn> &bProp_matrix) const
- {
- // W is vocab_size x output_embedding_dimension
- // input_bProp_matrix is vocab_size x minibatch_size
- // bProp_matrix is output_embedding_dimension x minibatch_size
- UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix);
- my_bProp_matrix.leftCols(input_bProp_matrix.cols()).noalias() =
+ // Dense versions (for log-likelihood loss)
+
+ template <typename DerivedGOut, typename DerivedGIn>
+ void bProp(const MatrixBase<DerivedGOut> &input_bProp_matrix,
+ const MatrixBase<DerivedGIn> &bProp_matrix) const
+ {
+ // W is vocab_size x output_embedding_dimension
+ // input_bProp_matrix is vocab_size x minibatch_size
+ // bProp_matrix is output_embedding_dimension x minibatch_size
+ UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix);
+ my_bProp_matrix.leftCols(input_bProp_matrix.cols()).noalias() =
W->transpose() * input_bProp_matrix;
- }
+ }
- template <typename DerivedIn, typename DerivedGOut>
- void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings,
- const MatrixBase<DerivedGOut> &bProp_input,
- double learning_rate,
- double momentum) //not sure if we want to use momentum here
- {
- // W is vocab_size x output_embedding_dimension
- // b is vocab_size x 1
- // predicted_embeddings is output_embedding_dimension x minibatch_size
- // bProp_input is vocab_size x minibatch_size
- W->noalias() += learning_rate * bProp_input * predicted_embeddings.transpose();
- b += learning_rate * bProp_input.rowwise().sum();
-
- /*
- //GRADIENT CLIPPING
- W->noalias() += learning_rate *
- ((bProp_input * predicted_embeddings.transpose()).array().unaryExpr(Clipper())).matrix();
- b += learning_rate * (bProp_input.rowwise().sum().array().unaryExpr(Clipper())).matrix();
- //UPDATE CLIPPING
- W->noalias() += (learning_rate *
- (bProp_input * predicted_embeddings.transpose())).array().unaryExpr(Clipper()).matrix();
- b += (learning_rate * (bProp_input.rowwise().sum())).array().unaryExpr(Clipper()).matrix();
- */
- }
-
- template <typename DerivedIn, typename DerivedGOut>
- void computeGradientAdagrad(
- const MatrixBase<DerivedIn> &predicted_embeddings,
- const MatrixBase<DerivedGOut> &bProp_input,
- double learning_rate) //not sure if we want to use momentum here
- {
- // W is vocab_size x output_embedding_dimension
- // b is vocab_size x 1
- // predicted_embeddings is output_embedding_dimension x minibatch_size
- // bProp_input is vocab_size x minibatch_sizea
- W_gradient.setZero(W->rows(), W->cols());
- b_gradient.setZero(b.size());
- W_gradient.noalias() = bProp_input * predicted_embeddings.transpose();
- b_gradient.noalias() = bProp_input.rowwise().sum();
- W_running_gradient += W_gradient.array().square().matrix();
- b_running_gradient += b_gradient.array().square().matrix();
- W->noalias() += learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt()).matrix();
- b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix();
- /*
- //UPDATE CLIPPING
- *W += (learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix();
- b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix();
- */
- }
-
- template <typename DerivedIn, typename DerivedGOut>
- void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings,
- const MatrixBase<DerivedGOut> &bProp_input,
- double learning_rate,
- double conditioning_constant,
- double decay) //not sure if we want to use momentum here
- {
- // W is vocab_size x output_embedding_dimension
- // b is vocab_size x 1
- // predicted_embeddings is output_embedding_dimension x minibatch_size
- // bProp_input is vocab_size x minibatch_size
- Array<double,Dynamic,Dynamic> W_current_parameter_update;
- Array<double,Dynamic,1> b_current_parameter_update;
- W_gradient.setZero(W->rows(), W->cols());
- b_gradient.setZero(b.size());
- W_gradient.noalias() = bProp_input * predicted_embeddings.transpose();
- b_gradient.noalias() = bProp_input.rowwise().sum();
- W_running_gradient = decay*W_running_gradient +
- (1.-decay)*W_gradient.array().square().matrix();
- b_running_gradient = decay*b_running_gradient+
- (1.-decay)*b_gradient.array().square().matrix();
- W_current_parameter_update = ((W_running_parameter_update.array()+conditioning_constant).sqrt()/
- (W_running_gradient.array()+conditioning_constant).sqrt())*
- W_gradient.array();
- b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/
- (b_running_gradient.array()+conditioning_constant).sqrt())*
- b_gradient.array();
- W_running_parameter_update = decay*W_running_parameter_update +
- (1.-decay)*W_current_parameter_update.square().matrix();
- b_running_parameter_update = decay*b_running_parameter_update +
- (1.-decay)*b_current_parameter_update.square().matrix();
-
- *W += learning_rate*W_current_parameter_update.matrix();
- b += learning_rate*b_current_parameter_update.matrix();
- }
-
- // Sparse versions
-
- template <typename DerivedGOutI, typename DerivedGOutV, typename DerivedGIn>
- void bProp(const MatrixBase<DerivedGOutI> &samples,
- const MatrixBase<DerivedGOutV> &weights,
- const MatrixBase<DerivedGIn> &bProp_matrix) const
- {
- UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix);
- my_bProp_matrix.setZero();
- uscgemm(1.0,
+ template <typename DerivedIn, typename DerivedGOut>
+ void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings,
+ const MatrixBase<DerivedGOut> &bProp_input,
+ double learning_rate,
+ double momentum) //not sure if we want to use momentum here
+ {
+ // W is vocab_size x output_embedding_dimension
+ // b is vocab_size x 1
+ // predicted_embeddings is output_embedding_dimension x minibatch_size
+ // bProp_input is vocab_size x minibatch_size
+ W->noalias() += learning_rate * bProp_input * predicted_embeddings.transpose();
+ b += learning_rate * bProp_input.rowwise().sum();
+
+ /*
+ //GRADIENT CLIPPING
+ W->noalias() += learning_rate *
+ ((bProp_input * predicted_embeddings.transpose()).array().unaryExpr(Clipper())).matrix();
+ b += learning_rate * (bProp_input.rowwise().sum().array().unaryExpr(Clipper())).matrix();
+ //UPDATE CLIPPING
+ W->noalias() += (learning_rate *
+ (bProp_input * predicted_embeddings.transpose())).array().unaryExpr(Clipper()).matrix();
+ b += (learning_rate * (bProp_input.rowwise().sum())).array().unaryExpr(Clipper()).matrix();
+ */
+ }
+
+ template <typename DerivedIn, typename DerivedGOut>
+ void computeGradientAdagrad(
+ const MatrixBase<DerivedIn> &predicted_embeddings,
+ const MatrixBase<DerivedGOut> &bProp_input,
+ double learning_rate) //not sure if we want to use momentum here
+ {
+ // W is vocab_size x output_embedding_dimension
+ // b is vocab_size x 1
+ // predicted_embeddings is output_embedding_dimension x minibatch_size
+ // bProp_input is vocab_size x minibatch_sizea
+ W_gradient.setZero(W->rows(), W->cols());
+ b_gradient.setZero(b.size());
+ W_gradient.noalias() = bProp_input * predicted_embeddings.transpose();
+ b_gradient.noalias() = bProp_input.rowwise().sum();
+ W_running_gradient += W_gradient.array().square().matrix();
+ b_running_gradient += b_gradient.array().square().matrix();
+ W->noalias() += learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt()).matrix();
+ b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix();
+ /*
+ //UPDATE CLIPPING
+ *W += (learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix();
+ b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix();
+ */
+ }
+
+ template <typename DerivedIn, typename DerivedGOut>
+ void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings,
+ const MatrixBase<DerivedGOut> &bProp_input,
+ double learning_rate,
+ double conditioning_constant,
+ double decay) //not sure if we want to use momentum here
+ {
+ // W is vocab_size x output_embedding_dimension
+ // b is vocab_size x 1
+ // predicted_embeddings is output_embedding_dimension x minibatch_size
+ // bProp_input is vocab_size x minibatch_size
+ Array<double,Dynamic,Dynamic> W_current_parameter_update;
+ Array<double,Dynamic,1> b_current_parameter_update;
+ W_gradient.setZero(W->rows(), W->cols());
+ b_gradient.setZero(b.size());
+ W_gradient.noalias() = bProp_input * predicted_embeddings.transpose();
+ b_gradient.noalias() = bProp_input.rowwise().sum();
+ W_running_gradient = decay*W_running_gradient +
+ (1.-decay)*W_gradient.array().square().matrix();
+ b_running_gradient = decay*b_running_gradient+
+ (1.-decay)*b_gradient.array().square().matrix();
+ W_current_parameter_update = ((W_running_parameter_update.array()+conditioning_constant).sqrt()/
+ (W_running_gradient.array()+conditioning_constant).sqrt())*
+ W_gradient.array();
+ b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/
+ (b_running_gradient.array()+conditioning_constant).sqrt())*
+ b_gradient.array();
+ W_running_parameter_update = decay*W_running_parameter_update +
+ (1.-decay)*W_current_parameter_update.square().matrix();
+ b_running_parameter_update = decay*b_running_parameter_update +
+ (1.-decay)*b_current_parameter_update.square().matrix();
+
+ *W += learning_rate*W_current_parameter_update.matrix();
+ b += learning_rate*b_current_parameter_update.matrix();
+ }
+
+ // Sparse versions
+
+ template <typename DerivedGOutI, typename DerivedGOutV, typename DerivedGIn>
+ void bProp(const MatrixBase<DerivedGOutI> &samples,
+ const MatrixBase<DerivedGOutV> &weights,
+ const MatrixBase<DerivedGIn> &bProp_matrix) const
+ {
+ UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix);
+ my_bProp_matrix.setZero();
+ uscgemm(1.0,
W->transpose(),
USCMatrix<double>(W->rows(), samples, weights),
my_bProp_matrix.leftCols(samples.cols())); // narrow bProp_matrix for possible short minibatch
- }
+ }
template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV>
- void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings,
- const MatrixBase<DerivedGOutI> &samples,
- const MatrixBase<DerivedGOutV> &weights,
- double learning_rate, double momentum) //not sure if we want to use momentum here
+ void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings,
+ const MatrixBase<DerivedGOutI> &samples,
+ const MatrixBase<DerivedGOutV> &weights,
+ double learning_rate, double momentum) //not sure if we want to use momentum here
{
- //cerr<<"in gradient"<<endl;
- USCMatrix<double> gradient_output(W->rows(), samples, weights);
- uscgemm(learning_rate,
- gradient_output,
- predicted_embeddings.leftCols(gradient_output.cols()).transpose(),
- *W); // narrow predicted_embeddings for possible short minibatch
- uscgemv(learning_rate,
- gradient_output,
- Matrix<double,Dynamic,1>::Ones(gradient_output.cols()),
- b);
- /*
- //IN ORDER TO IMPLEMENT CLIPPING, WE HAVE TO COMPUTE THE GRADIENT
- //FIRST
- USCMatrix<double> gradient_output(W->rows(), samples, weights);
- uscgemm(1.0,
- gradient_output,
- predicted_embeddings.leftCols(samples.cols()).transpose(),
- W_gradient);
- uscgemv(1.0,
- gradient_output,
- Matrix<double,Dynamic,1>::Ones(weights.cols()),
- b_gradient);
-
- int_map update_map; //stores all the parameters that have been updated
- for (int sample_id=0; sample_id<samples.rows(); sample_id++)
- for (int train_id=0; train_id<samples.cols(); train_id++)
- update_map[samples(sample_id, train_id)] = 1;
-
- // Convert to std::vector for parallelization
- std::vector<int> update_items;
- for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
- update_items.push_back(it->first);
- int num_items = update_items.size();
-
- //#pragma omp parallel for
- for (int item_id=0; item_id<num_items; item_id++)
- {
- int update_item = update_items[item_id];
- //W->row(update_item) += learning_rate * W_gradient.row(update_item);
- //b(update_item) += learning_rate * b_gradient(update_item);
- //UPDATE CLIPPING
- W->row(update_item) += (learning_rate * W_gradient.row(update_item)).array().unaryExpr(Clipper()).matrix();
- double update = learning_rate * b_gradient(update_item);
- b(update_item) += std::min(0.5, std::max(update,-0.5));
- //GRADIENT CLIPPING
- W_gradient.row(update_item).setZero();
- b_gradient(update_item) = 0.;
- }
- */
- //cerr<<"Finished gradient"<<endl;
+ //cerr<<"in gradient"<<endl;
+ USCMatrix<double> gradient_output(W->rows(), samples, weights);
+ uscgemm(learning_rate,
+ gradient_output,
+ predicted_embeddings.leftCols(gradient_output.cols()).transpose(),
+ *W); // narrow predicted_embeddings for possible short minibatch
+ uscgemv(learning_rate,
+ gradient_output,
+ Matrix<double,Dynamic,1>::Ones(gradient_output.cols()),
+ b);
+ /*
+ //IN ORDER TO IMPLEMENT CLIPPING, WE HAVE TO COMPUTE THE GRADIENT
+ //FIRST
+ USCMatrix<double> gradient_output(W->rows(), samples, weights);
+ uscgemm(1.0,
+ gradient_output,
+ predicted_embeddings.leftCols(samples.cols()).transpose(),
+ W_gradient);
+ uscgemv(1.0,
+ gradient_output,
+ Matrix<double,Dynamic,1>::Ones(weights.cols()),
+ b_gradient);
+
+ int_map update_map; //stores all the parameters that have been updated
+ for (int sample_id=0; sample_id<samples.rows(); sample_id++)
+ for (int train_id=0; train_id<samples.cols(); train_id++)
+ update_map[samples(sample_id, train_id)] = 1;
+
+ // Convert to std::vector for parallelization
+ std::vector<int> update_items;
+ for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
+ update_items.push_back(it->first);
+ int num_items = update_items.size();
+
+ //#pragma omp parallel for
+ for (int item_id=0; item_id<num_items; item_id++)
+ {
+ int update_item = update_items[item_id];
+ //W->row(update_item) += learning_rate * W_gradient.row(update_item);
+ //b(update_item) += learning_rate * b_gradient(update_item);
+ //UPDATE CLIPPING
+ W->row(update_item) += (learning_rate * W_gradient.row(update_item)).array().unaryExpr(Clipper()).matrix();
+ double update = learning_rate * b_gradient(update_item);
+ b(update_item) += std::min(0.5, std::max(update,-0.5));
+ //GRADIENT CLIPPING
+ W_gradient.row(update_item).setZero();
+ b_gradient(update_item) = 0.;
+ }
+ */
+ //cerr<<"Finished gradient"<<endl;
}
template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV>
- void computeGradientAdagrad(const MatrixBase<DerivedIn> &predicted_embeddings,
- const MatrixBase<DerivedGOutI> &samples,
- const MatrixBase<DerivedGOutV> &weights,
- double learning_rate) //not sure if we want to use momentum here
- {
- //W_gradient.setZero(W->rows(), W->cols());
- //b_gradient.setZero(b.size());
- //FOR CLIPPING, WE DO NOT MULTIPLY THE GRADIENT WITH THE LEARNING RATE
- USCMatrix<double> gradient_output(W->rows(), samples, weights);
- uscgemm(1.0,
- gradient_output,
- predicted_embeddings.leftCols(samples.cols()).transpose(),
- W_gradient);
- uscgemv(1.0,
- gradient_output,
- Matrix<double,Dynamic,1>::Ones(weights.cols()),
- b_gradient);
-
- int_map update_map; //stores all the parameters that have been updated
- for (int sample_id=0; sample_id<samples.rows(); sample_id++)
- for (int train_id=0; train_id<samples.cols(); train_id++)
- update_map[samples(sample_id, train_id)] = 1;
-
- // Convert to std::vector for parallelization
- std::vector<int> update_items;
- for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
- update_items.push_back(it->first);
- int num_items = update_items.size();
-
- //#pragma omp parallel for
- for (int item_id=0; item_id<num_items; item_id++)
- {
- int update_item = update_items[item_id];
- W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix();
- b_running_gradient(update_item) += b_gradient(update_item) * b_gradient(update_item);
- W->row(update_item) += learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix();
- b(update_item) += learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item));
- /*
- //UPDATE CLIPPING
- W->row(update_item) += (learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt())).unaryExpr(Clipper()).matrix();
- double update = learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item));
- b(update_item) += Clipper(update);//std::min(0.5, std::max(update,-0.5));
- */
- W_gradient.row(update_item).setZero();
- b_gradient(update_item) = 0.;
- }
- }
+ void computeGradientAdagrad(const MatrixBase<DerivedIn> &predicted_embeddings,
+ const MatrixBase<DerivedGOutI> &samples,
+ const MatrixBase<DerivedGOutV> &weights,
+ double learning_rate) //not sure if we want to use momentum here
+ {
+ //W_gradient.setZero(W->rows(), W->cols());
+ //b_gradient.setZero(b.size());
+ //FOR CLIPPING, WE DO NOT MULTIPLY THE GRADIENT WITH THE LEARNING RATE
+ USCMatrix<double> gradient_output(W->rows(), samples, weights);
+ uscgemm(1.0,
+ gradient_output,
+ predicted_embeddings.leftCols(samples.cols()).transpose(),
+ W_gradient);
+ uscgemv(1.0,
+ gradient_output,
+ Matrix<double,Dynamic,1>::Ones(weights.cols()),
+ b_gradient);
+
+ int_map update_map; //stores all the parameters that have been updated
+ for (int sample_id=0; sample_id<samples.rows(); sample_id++)
+ for (int train_id=0; train_id<samples.cols(); train_id++)
+ update_map[samples(sample_id, train_id)] = 1;
+
+ // Convert to std::vector for parallelization
+ std::vector<int> update_items;
+ for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
+ update_items.push_back(it->first);
+ int num_items = update_items.size();
+
+ //#pragma omp parallel for
+ for (int item_id=0; item_id<num_items; item_id++)
+ {
+ int update_item = update_items[item_id];
+ W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix();
+ b_running_gradient(update_item) += b_gradient(update_item) * b_gradient(update_item);
+ W->row(update_item) += learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix();
+ b(update_item) += learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item));
+ /*
+ //UPDATE CLIPPING
+ W->row(update_item) += (learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt())).unaryExpr(Clipper()).matrix();
+ double update = learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item));
+ b(update_item) += Clipper(update);//std::min(0.5, std::max(update,-0.5));
+ */
+ W_gradient.row(update_item).setZero();
+ b_gradient(update_item) = 0.;
+ }
+ }
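For reference, the per-row step performed inside the Adagrad update loop above reduces to the short sketch below. The dense Eigen buffers and the function name are illustrative only (the real code updates sparse rows of W and b in place), and the running sum G is assumed to have been seeded with a small adagrad_epsilon, as initialize() does, so the square root never divides by zero.

#include <Eigen/Dense>

// Sketch only: one Adagrad step for a single touched row.
// G holds the running sum of squared gradients for W (same shape),
// seeded with a small epsilon so the sqrt below never hits zero.
inline void adagrad_row_step(Eigen::MatrixXd &W,
                             Eigen::MatrixXd &G,
                             const Eigen::RowVectorXd &grad,
                             int row,
                             double learning_rate)
{
    G.row(row).array() += grad.array().square();               // accumulate g^2
    W.row(row).array() += learning_rate * grad.array()
                          / G.row(row).array().sqrt();         // scale by 1/sqrt(sum g^2)
}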
template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV>
- void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings,
- const MatrixBase<DerivedGOutI> &samples,
- const MatrixBase<DerivedGOutV> &weights,
- double learning_rate,
- double conditioning_constant,
- double decay) //not sure if we want to use momentum here
- {
- //cerr<<"decay is "<<decay<<" and constant is "<<conditioning_constant<<endl;
- //W_gradient.setZero(W->rows(), W->cols());
- //b_gradient.setZero(b.size());
-
- USCMatrix<double> gradient_output(W->rows(), samples, weights);
- uscgemm(1.0,
- gradient_output,
- predicted_embeddings.leftCols(samples.cols()).transpose(),
- W_gradient);
- uscgemv(1.0,
- gradient_output,
- Matrix<double,Dynamic,1>::Ones(weights.cols()),
- b_gradient);
-
- int_map update_map; //stores all the parameters that have been updated
- for (int sample_id=0; sample_id<samples.rows(); sample_id++)
- for (int train_id=0; train_id<samples.cols(); train_id++)
- update_map[samples(sample_id, train_id)] = 1;
-
- // Convert to std::vector for parallelization
- std::vector<int> update_items;
- for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
- update_items.push_back(it->first);
- int num_items = update_items.size();
-
- #pragma omp parallel for
- for (int item_id=0; item_id<num_items; item_id++)
- {
- Array<double,1,Dynamic> W_current_parameter_update;
- double b_current_parameter_update;
-
- int update_item = update_items[item_id];
- W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+
- (1.-decay)*W_gradient.row(update_item).array().square().matrix();
- b_running_gradient(update_item) = decay*b_running_gradient(update_item)+
- (1.-decay)*b_gradient(update_item)*b_gradient(update_item);
- //cerr<<"Output: W gradient is "<<W_gradient.row(update_item)<<endl;
- //getchar();
-
- //cerr<<"Output: W running gradient is "<<W_running_gradient.row(update_item)<<endl;
- //getchar();
- W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/
- (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())*
- W_gradient.row(update_item).array();
- b_current_parameter_update = (sqrt(b_running_parameter_update(update_item)+conditioning_constant)/
- sqrt(b_running_gradient(update_item)+conditioning_constant))*
- b_gradient(update_item);
- //cerr<<"Output: W current parameter update is "<<W_current_parameter_update<<endl;
- //getchar();
- //cerr<<"Output: W running parameter update before is "<<W_running_parameter_update.row(update_item)<<endl;
- //getchar();
- //cerr<<"the second term is "<<(1.-decay)*W_current_parameter_update.square().matrix()<<endl;
- W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+
- (1.-decay)*(W_current_parameter_update.square().matrix());
- b_running_parameter_update(update_item) = decay*b_running_parameter_update(update_item)+
- (1.-decay)*b_current_parameter_update*b_current_parameter_update;
- //cerr<<"Output: W running parameter update is "<<W_running_parameter_update.row(update_item)<<endl;
- //getchar();
- W->row(update_item) += learning_rate*W_current_parameter_update.matrix();
- b(update_item) += learning_rate*b_current_parameter_update;
- W_gradient.row(update_item).setZero();
- b_gradient(update_item) = 0.;
- }
- }
+ void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings,
+ const MatrixBase<DerivedGOutI> &samples,
+ const MatrixBase<DerivedGOutV> &weights,
+ double learning_rate,
+ double conditioning_constant,
+ double decay) //not sure if we want to use momentum here
+ {
+ //cerr<<"decay is "<<decay<<" and constant is "<<conditioning_constant<<endl;
+ //W_gradient.setZero(W->rows(), W->cols());
+ //b_gradient.setZero(b.size());
+
+ USCMatrix<double> gradient_output(W->rows(), samples, weights);
+ uscgemm(1.0,
+ gradient_output,
+ predicted_embeddings.leftCols(samples.cols()).transpose(),
+ W_gradient);
+ uscgemv(1.0,
+ gradient_output,
+ Matrix<double,Dynamic,1>::Ones(weights.cols()),
+ b_gradient);
+
+ int_map update_map; //stores all the parameters that have been updated
+ for (int sample_id=0; sample_id<samples.rows(); sample_id++)
+ for (int train_id=0; train_id<samples.cols(); train_id++)
+ update_map[samples(sample_id, train_id)] = 1;
+
+ // Convert to std::vector for parallelization
+ std::vector<int> update_items;
+ for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
+ update_items.push_back(it->first);
+ int num_items = update_items.size();
+
+#pragma omp parallel for
+ for (int item_id=0; item_id<num_items; item_id++)
+ {
+ Array<double,1,Dynamic> W_current_parameter_update;
+ double b_current_parameter_update;
+
+ int update_item = update_items[item_id];
+ W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+
+ (1.-decay)*W_gradient.row(update_item).array().square().matrix();
+ b_running_gradient(update_item) = decay*b_running_gradient(update_item)+
+ (1.-decay)*b_gradient(update_item)*b_gradient(update_item);
+ //cerr<<"Output: W gradient is "<<W_gradient.row(update_item)<<endl;
+ //getchar();
+
+ //cerr<<"Output: W running gradient is "<<W_running_gradient.row(update_item)<<endl;
+ //getchar();
+ W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/
+ (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())*
+ W_gradient.row(update_item).array();
+ b_current_parameter_update = (sqrt(b_running_parameter_update(update_item)+conditioning_constant)/
+ sqrt(b_running_gradient(update_item)+conditioning_constant))*
+ b_gradient(update_item);
+ //cerr<<"Output: W current parameter update is "<<W_current_parameter_update<<endl;
+ //getchar();
+ //cerr<<"Output: W running parameter update before is "<<W_running_parameter_update.row(update_item)<<endl;
+ //getchar();
+ //cerr<<"the second term is "<<(1.-decay)*W_current_parameter_update.square().matrix()<<endl;
+ W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+
+ (1.-decay)*(W_current_parameter_update.square().matrix());
+ b_running_parameter_update(update_item) = decay*b_running_parameter_update(update_item)+
+ (1.-decay)*b_current_parameter_update*b_current_parameter_update;
+ //cerr<<"Output: W running parameter update is "<<W_running_parameter_update.row(update_item)<<endl;
+ //getchar();
+ W->row(update_item) += learning_rate*W_current_parameter_update.matrix();
+ b(update_item) += learning_rate*b_current_parameter_update;
+ W_gradient.row(update_item).setZero();
+ b_gradient(update_item) = 0.;
+ }
+ }
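The Adadelta recurrence applied above to each bias entry (and, row-wise, to W) is easier to read as the scalar update sketched here. The struct and function names are illustrative; decay plays the role of rho and eps is the conditioning constant from the signature above.

#include <cmath>

// Sketch only: one Adadelta step for a single scalar parameter.
struct AdadeltaState { double running_grad = 0.0, running_update = 0.0; };

inline double adadelta_step(AdadeltaState &s, double grad,
                            double decay, double eps)
{
    s.running_grad = decay * s.running_grad + (1.0 - decay) * grad * grad;
    double update = std::sqrt(s.running_update + eps)
                  / std::sqrt(s.running_grad + eps) * grad;
    s.running_update = decay * s.running_update + (1.0 - decay) * update * update;
    return update;   // the code above then applies: param += learning_rate * update
}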
template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV, typename DerivedGW, typename DerivedGb>
- void computeGradientCheck(const MatrixBase<DerivedIn> &predicted_embeddings,
- const MatrixBase<DerivedGOutI> &samples,
- const MatrixBase<DerivedGOutV> &weights,
- const MatrixBase<DerivedGW> &gradient_W,
- const MatrixBase<DerivedGb> &gradient_b) const
- {
- UNCONST(DerivedGW, gradient_W, my_gradient_W);
- UNCONST(DerivedGb, gradient_b, my_gradient_b);
- my_gradient_W.setZero();
- my_gradient_b.setZero();
- USCMatrix<double> gradient_output(W->rows(), samples, weights);
- uscgemm(1.0,
- gradient_output,
- predicted_embeddings.leftCols(samples.cols()).transpose(),
- my_gradient_W);
- uscgemv(1.0, gradient_output,
- Matrix<double,Dynamic,1>::Ones(weights.cols()), my_gradient_b);
+ void computeGradientCheck(const MatrixBase<DerivedIn> &predicted_embeddings,
+ const MatrixBase<DerivedGOutI> &samples,
+ const MatrixBase<DerivedGOutV> &weights,
+ const MatrixBase<DerivedGW> &gradient_W,
+ const MatrixBase<DerivedGb> &gradient_b) const
+ {
+ UNCONST(DerivedGW, gradient_W, my_gradient_W);
+ UNCONST(DerivedGb, gradient_b, my_gradient_b);
+ my_gradient_W.setZero();
+ my_gradient_b.setZero();
+ USCMatrix<double> gradient_output(W->rows(), samples, weights);
+ uscgemm(1.0,
+ gradient_output,
+ predicted_embeddings.leftCols(samples.cols()).transpose(),
+ my_gradient_W);
+ uscgemv(1.0, gradient_output,
+ Matrix<double,Dynamic,1>::Ones(weights.cols()), my_gradient_b);
}
};
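computeGradientCheck only accumulates the analytic gradient into the caller's buffers; a typical way to consume that result is to compare it against a central finite difference of the training loss. The sketch below shows that comparison for one parameter; the loss callable, step size, and tolerance handling are assumptions, not code from this file.

#include <algorithm>
#include <cmath>
#include <functional>

// Sketch only: relative error between an analytic gradient entry and a
// central finite difference of a scalar loss, for one parameter w.
inline double gradient_check_entry(const std::function<double()> &loss,
                                   double &w, double analytic, double h = 1e-5)
{
    const double w0 = w;
    w = w0 + h; const double lp = loss();
    w = w0 - h; const double lm = loss();
    w = w0;                                      // restore the parameter
    const double numeric = (lp - lm) / (2.0 * h);
    return std::fabs(numeric - analytic)
         / std::max(1e-12, std::fabs(numeric) + std::fabs(analytic));
}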
class Input_word_embeddings
{
- private:
- Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W;
- int context_size, vocab_size;
- Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient;
- Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update;
- Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient;
+ private:
+ Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W;
+ int context_size, vocab_size;
+ Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient;
+ Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update;
+ Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient;
friend class model;
- public:
- Input_word_embeddings() : context_size(0), vocab_size(0) { }
- Input_word_embeddings(int rows, int cols, int context) { resize(rows, cols, context); }
+ public:
+ Input_word_embeddings() : context_size(0), vocab_size(0) { }
+ Input_word_embeddings(int rows, int cols, int context) { resize(rows, cols, context); }
- void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) {
- W = input_W;
- }
+ void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) {
+ W = input_W;
+ }
- void resize(int rows, int cols, int context)
- {
- context_size = context;
- vocab_size = rows;
- W->setZero(rows, cols);
- }
+ void resize(int rows, int cols, int context)
+ {
+ context_size = context;
+ vocab_size = rows;
+ W->setZero(rows, cols);
+ }
- void read(std::ifstream &W_file) { readMatrix(W_file, *W); }
- void write(std::ofstream &W_file) { writeMatrix(*W, W_file); }
+ void read(std::ifstream &W_file) { readMatrix(W_file, *W); }
+ void write(std::ofstream &W_file) { writeMatrix(*W, W_file); }
- template <typename Engine>
- void initialize(Engine &engine,
- bool init_normal,
- double init_range,
- string &parameter_update,
- double adagrad_epsilon)
- {
- W_gradient.setZero(W->rows(),W->cols());
-
- if (parameter_update == "ADA") {
- W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon;
- //W_gradient.setZero(W->rows(),W->cols());
- }
- if (parameter_update == "ADAD") {
- W_running_gradient.setZero(W->rows(),W->cols());
- //W_gradient.setZero(W->rows(),W->cols());
- W_running_parameter_update.setZero(W->rows(),W->cols());
- }
- initMatrix(engine,
- *W,
- init_normal,
- init_range);
- }
+ template <typename Engine>
+ void initialize(Engine &engine,
+ bool init_normal,
+ double init_range,
+ string &parameter_update,
+ double adagrad_epsilon)
+ {
+ W_gradient.setZero(W->rows(),W->cols());
+
+ if (parameter_update == "ADA") {
+ W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon;
+ //W_gradient.setZero(W->rows(),W->cols());
+ }
+ if (parameter_update == "ADAD") {
+ W_running_gradient.setZero(W->rows(),W->cols());
+ //W_gradient.setZero(W->rows(),W->cols());
+ W_running_parameter_update.setZero(W->rows(),W->cols());
+ }
+ initMatrix(engine,
+ *W,
+ init_normal,
+ init_range);
+ }
int n_inputs() const { return -1; }
int n_outputs() const { return W->cols() * context_size; }
@@ -765,40 +765,40 @@ class Input_word_embeddings
template <typename Dist>
void average(const Dist &dist, int output_id)
{
- W->row(output_id).setZero();
- for (int i=0; i < W->rows(); i++)
- if (i != output_id)
+ W->row(output_id).setZero();
+ for (int i=0; i < W->rows(); i++)
+ if (i != output_id)
W->row(output_id) += dist.prob(i) * W->row(i);
}
template <typename DerivedIn, typename DerivedOut>
- void fProp(const MatrixBase<DerivedIn> &input,
- const MatrixBase<DerivedOut> &output) const
- {
- int embedding_dimension = W->cols();
+ void fProp(const MatrixBase<DerivedIn> &input,
+ const MatrixBase<DerivedOut> &output) const
+ {
+ int embedding_dimension = W->cols();
- // W is vocab_size x embedding_dimension
- // input is ngram_size*vocab_size x minibatch_size
- // output is ngram_size*embedding_dimension x minibatch_size
+ // W is vocab_size x embedding_dimension
+ // input is ngram_size*vocab_size x minibatch_size
+ // output is ngram_size*embedding_dimension x minibatch_size
- /*
- // Dense version:
- for (int ngram=0; ngram<context_size; ngram++)
- output.middleRows(ngram*embedding_dimension, embedding_dimension) = W.transpose() * input.middleRows(ngram*vocab_size, vocab_size);
- */
+ /*
+ // Dense version:
+ for (int ngram=0; ngram<context_size; ngram++)
+ output.middleRows(ngram*embedding_dimension, embedding_dimension) = W.transpose() * input.middleRows(ngram*vocab_size, vocab_size);
+ */
- UNCONST(DerivedOut, output, my_output);
- my_output.setZero();
- for (int ngram=0; ngram<context_size; ngram++)
- {
- // input might be narrower than expected due to a short minibatch,
- // so narrow output to match
- uscgemm(1.0,
- W->transpose(),
- USCMatrix<double>(W->rows(),input.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input.cols())),
- my_output.block(ngram*embedding_dimension, 0, embedding_dimension, input.cols()));
- }
- }
+ UNCONST(DerivedOut, output, my_output);
+ my_output.setZero();
+ for (int ngram=0; ngram<context_size; ngram++)
+ {
+ // input might be narrower than expected due to a short minibatch,
+ // so narrow output to match
+ uscgemm(1.0,
+ W->transpose(),
+ USCMatrix<double>(W->rows(),input.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input.cols())),
+ my_output.block(ngram*embedding_dimension, 0, embedding_dimension, input.cols()));
+ }
+ }
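The uscgemm call in fProp multiplies W-transpose by an implicit one-hot matrix built from the word indices, which is just a row lookup. Written with explicit indices and a hypothetical dense output buffer, the same operation amounts to:

#include <Eigen/Dense>

// Sketch only: the sparse lookup as an explicit row copy.
// W is vocab_size x embedding_dimension; input holds word indices,
// one context position per row and one training example per column.
inline void embedding_lookup(const Eigen::MatrixXd &W,
                             const Eigen::MatrixXi &input,
                             Eigen::MatrixXd &output)   // (context*dim) x minibatch
{
    const int dim = W.cols();
    for (int ngram = 0; ngram < input.rows(); ++ngram)
        for (int col = 0; col < input.cols(); ++col)
            output.block(ngram * dim, col, dim, 1) =
                W.row(input(ngram, col)).transpose();   // pick one embedding row
}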
// When model is premultiplied, this layer doesn't get used,
// but this method is used to get the input into a sparse matrix.
@@ -814,206 +814,206 @@ class Input_word_embeddings
template <typename DerivedGOut, typename DerivedIn>
void computeGradient(const MatrixBase<DerivedGOut> &bProp_input,
- const MatrixBase<DerivedIn> &input_words,
- double learning_rate, double momentum, double L2_reg)
+ const MatrixBase<DerivedIn> &input_words,
+ double learning_rate, double momentum, double L2_reg)
{
- int embedding_dimension = W->cols();
+ int embedding_dimension = W->cols();
- // W is vocab_size x embedding_dimension
- // input is ngram_size*vocab_size x minibatch_size
- // bProp_input is ngram_size*embedding_dimension x minibatch_size
+ // W is vocab_size x embedding_dimension
+ // input is ngram_size*vocab_size x minibatch_size
+ // bProp_input is ngram_size*embedding_dimension x minibatch_size
- /*
- // Dense version:
- for (int ngram=0; ngram<context_size; ngram++)
- W += learning_rate * input_words.middleRows(ngram*vocab_size, vocab_size) * bProp_input.middleRows(ngram*embedding_dimension, embedding_dimension).transpose()
- */
+ /*
+ // Dense version:
+ for (int ngram=0; ngram<context_size; ngram++)
+ W += learning_rate * input_words.middleRows(ngram*vocab_size, vocab_size) * bProp_input.middleRows(ngram*embedding_dimension, embedding_dimension).transpose()
+ */
- for (int ngram=0; ngram<context_size; ngram++)
- {
- uscgemm(learning_rate,
- USCMatrix<double>(W->rows(), input_words.middleRows(ngram, 1), Matrix<double,1,Dynamic>::Ones(input_words.cols())),
- bProp_input.block(ngram*embedding_dimension,0,embedding_dimension,input_words.cols()).transpose(),
- *W);
- }
+ for (int ngram=0; ngram<context_size; ngram++)
+ {
+ uscgemm(learning_rate,
+ USCMatrix<double>(W->rows(), input_words.middleRows(ngram, 1), Matrix<double,1,Dynamic>::Ones(input_words.cols())),
+ bProp_input.block(ngram*embedding_dimension,0,embedding_dimension,input_words.cols()).transpose(),
+ *W);
+ }
- /*
- //IF WE WANT TO DO GRADIENT CLIPPING, THEN WE FIRST COMPUTE THE GRADIENT AND THEN
- //PERFORM CLIPPING WHILE UPDATING
+ /*
+ //IF WE WANT TO DO GRADIENT CLIPPING, THEN WE FIRST COMPUTE THE GRADIENT AND THEN
+ //PERFORM CLIPPING WHILE UPDATING
- for (int ngram=0; ngram<context_size; ngram++)
- {
- uscgemm(1.0,
- USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
- bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
- W_gradient);
- }
- int_map update_map; //stores all the parameters that have been updated
- for (int ngram=0; ngram<context_size; ngram++)
- {
- for (int train_id=0; train_id<input_words.cols(); train_id++)
- {
- update_map[input_words(ngram,train_id)] = 1;
- }
- }
+ for (int ngram=0; ngram<context_size; ngram++)
+ {
+ uscgemm(1.0,
+ USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
+ bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
+ W_gradient);
+ }
+ int_map update_map; //stores all the parameters that have been updated
+ for (int ngram=0; ngram<context_size; ngram++)
+ {
+ for (int train_id=0; train_id<input_words.cols(); train_id++)
+ {
+ update_map[input_words(ngram,train_id)] = 1;
+ }
+ }
- // Convert to std::vector for parallelization
- std::vector<int> update_items;
- for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
- {
- update_items.push_back(it->first);
- }
- int num_items = update_items.size();
-
- #pragma omp parallel for
- for (int item_id=0; item_id<num_items; item_id++)
- {
- int update_item = update_items[item_id];
- //UPDATE CLIPPING
- W->row(update_item) += (learning_rate*
- W_gradient.row(update_item).array().unaryExpr(Clipper())).matrix();
- //GRADIENT CLIPPING
- //W->row(update_item) += learning_rate*
- // W_gradient.row(update_item).array().unaryExpr(Clipper()).matrix();
- //SETTING THE GRADIENT TO ZERO
- W_gradient.row(update_item).setZero();
- }
- */
- }
+ // Convert to std::vector for parallelization
+ std::vector<int> update_items;
+ for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
+ {
+ update_items.push_back(it->first);
+ }
+ int num_items = update_items.size();
- template <typename DerivedGOut, typename DerivedIn>
- void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input,
- const MatrixBase<DerivedIn> &input_words,
- double learning_rate,
- double L2_reg)
+ #pragma omp parallel for
+ for (int item_id=0; item_id<num_items; item_id++)
{
- int embedding_dimension = W->cols();
- //W_gradient.setZero(W->rows(), W->cols());
- /*
+ int update_item = update_items[item_id];
+ //UPDATE CLIPPING
+ W->row(update_item) += (learning_rate*
+ W_gradient.row(update_item).array().unaryExpr(Clipper())).matrix();
+ //GRADIENT CLIPPING
+ //W->row(update_item) += learning_rate*
+ // W_gradient.row(update_item).array().unaryExpr(Clipper()).matrix();
+ //SETTING THE GRADIENT TO ZERO
+ W_gradient.row(update_item).setZero();
+ }
+ */
+ }
+
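In the plain-SGD path above, the uscgemm call scatter-adds the back-propagated error into only those embedding rows whose words occurred in the minibatch. With explicit indices (hypothetical dense buffers, names illustrative) the update looks like this:

#include <Eigen/Dense>

// Sketch only: the scatter-add behind the uscgemm call, written explicitly.
// bProp_input is (context*dim) x minibatch; input_words holds word indices.
inline void sgd_embedding_update(Eigen::MatrixXd &W,
                                 const Eigen::MatrixXi &input_words,
                                 const Eigen::MatrixXd &bProp_input,
                                 double learning_rate)
{
    const int dim = W.cols();
    for (int ngram = 0; ngram < input_words.rows(); ++ngram)
        for (int col = 0; col < input_words.cols(); ++col)
            W.row(input_words(ngram, col)) += learning_rate *
                bProp_input.block(ngram * dim, col, dim, 1).transpose();
}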
+ template <typename DerivedGOut, typename DerivedIn>
+ void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input,
+ const MatrixBase<DerivedIn> &input_words,
+ double learning_rate,
+ double L2_reg)
+ {
+ int embedding_dimension = W->cols();
+ //W_gradient.setZero(W->rows(), W->cols());
+ /*
if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols())
- W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon;
- */
- for (int ngram=0; ngram<context_size; ngram++)
- {
- uscgemm(1.0,
- USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
- bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
- W_gradient);
- }
- int_map update_map; //stores all the parameters that have been updated
- for (int ngram=0; ngram<context_size; ngram++)
+ W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon;
+ */
+ for (int ngram=0; ngram<context_size; ngram++)
+ {
+ uscgemm(1.0,
+ USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
+ bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
+ W_gradient);
+ }
+ int_map update_map; //stores all the parameters that have been updated
+ for (int ngram=0; ngram<context_size; ngram++)
+ {
+ for (int train_id=0; train_id<input_words.cols(); train_id++)
{
- for (int train_id=0; train_id<input_words.cols(); train_id++)
- {
- update_map[input_words(ngram,train_id)] = 1;
- }
+ update_map[input_words(ngram,train_id)] = 1;
}
+ }
- // Convert to std::vector for parallelization
- std::vector<int> update_items;
- for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
- {
- update_items.push_back(it->first);
- }
- int num_items = update_items.size();
-
- #pragma omp parallel for
- for (int item_id=0; item_id<num_items; item_id++)
- {
- int update_item = update_items[item_id];
- W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix();
- W->row(update_item) += learning_rate *
- (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix();
- /*
- //UPDATE CLIPPING
- W->row(update_item) += (learning_rate *
- (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()))
- .unaryExpr(Clipper()).matrix();
- */
- W_gradient.row(update_item).setZero();
- }
- }
-
- template <typename DerivedGOut, typename DerivedIn>
- void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input,
- const MatrixBase<DerivedIn> &input_words,
- double learning_rate,
- double L2_reg,
- double conditioning_constant,
- double decay)
+ // Convert to std::vector for parallelization
+ std::vector<int> update_items;
+ for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
{
- int embedding_dimension = W->cols();
+ update_items.push_back(it->first);
+ }
+ int num_items = update_items.size();
- //W_gradient.setZero(W->rows(), W->cols());
+#pragma omp parallel for
+ for (int item_id=0; item_id<num_items; item_id++)
+ {
+ int update_item = update_items[item_id];
+ W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix();
+ W->row(update_item) += learning_rate *
+ (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix();
/*
- if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols())
- W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon;
+ //UPDATE CLIPPING
+ W->row(update_item) += (learning_rate *
+ (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()))
+ .unaryExpr(Clipper()).matrix();
*/
- for (int ngram=0; ngram<context_size; ngram++)
- {
- uscgemm(1.0,
- USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
- bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
- W_gradient);
- }
- int_map update_map; //stores all the parameters that have been updated
- for (int ngram=0; ngram<context_size; ngram++)
+ W_gradient.row(update_item).setZero();
+ }
+ }
+
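The commented-out "UPDATE CLIPPING" branches apply a Clipper functor through unaryExpr(). That functor is defined elsewhere in the library, so the version below is only a guess at its shape, with the ±0.5 bound taken from the inline comment in the output layer's Adagrad code.

#include <algorithm>

// Sketch only (assumed definition): clamp each per-parameter update to
// [-bound, bound] before it is added to the weights.
struct ClipperSketch
{
    double bound;
    explicit ClipperSketch(double b = 0.5) : bound(b) {}
    double operator()(double x) const
    {
        return std::min(bound, std::max(x, -bound));
    }
};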
+ template <typename DerivedGOut, typename DerivedIn>
+ void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input,
+ const MatrixBase<DerivedIn> &input_words,
+ double learning_rate,
+ double L2_reg,
+ double conditioning_constant,
+ double decay)
+ {
+ int embedding_dimension = W->cols();
+
+ //W_gradient.setZero(W->rows(), W->cols());
+ /*
+ if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols())
+ W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon;
+ */
+ for (int ngram=0; ngram<context_size; ngram++)
+ {
+ uscgemm(1.0,
+ USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
+ bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
+ W_gradient);
+ }
+ int_map update_map; //stores all the parameters that have been updated
+ for (int ngram=0; ngram<context_size; ngram++)
+ {
+ for (int train_id=0; train_id<input_words.cols(); train_id++)
{
- for (int train_id=0; train_id<input_words.cols(); train_id++)
- {
- update_map[input_words(ngram,train_id)] = 1;
- }
+ update_map[input_words(ngram,train_id)] = 1;
}
+ }
- // Convert to std::vector for parallelization
- std::vector<int> update_items;
- for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
- {
- update_items.push_back(it->first);
- }
- int num_items = update_items.size();
-
- #pragma omp parallel for
- for (int item_id=0; item_id<num_items; item_id++)
- {
-
- Array<double,1,Dynamic> W_current_parameter_update;
- int update_item = update_items[item_id];
- W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+
- (1.-decay)*W_gradient.row(update_item).array().square().matrix();
-
- W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/
- (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())*
- W_gradient.row(update_item).array();
-
- //cerr<<"Input: W current parameter update is "<<W_current_parameter_update<<endl;
- //getchar();
- W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+
- (1.-decay)*W_current_parameter_update.square().matrix();
-
- W->row(update_item) += learning_rate*W_current_parameter_update.matrix();
- //cerr<<"Input: After update, W is "<<W->row(update_item)<<endl;
- //getchar();
- W_gradient.row(update_item).setZero();
- }
-
- }
-
- template <typename DerivedGOut, typename DerivedIn, typename DerivedGW>
- void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input,
- const MatrixBase<DerivedIn> &input_words,
- int x, int minibatch_size,
- const MatrixBase<DerivedGW> &gradient) const //not sure if we want to use momentum here
+ // Convert to std::vector for parallelization
+ std::vector<int> update_items;
+ for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
{
- UNCONST(DerivedGW, gradient, my_gradient);
- int embedding_dimension = W->cols();
- my_gradient.setZero();
- for (int ngram=0; ngram<context_size; ngram++)
- uscgemm(1.0,
- USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
- bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
- my_gradient);
+ update_items.push_back(it->first);
}
+ int num_items = update_items.size();
+
+#pragma omp parallel for
+ for (int item_id=0; item_id<num_items; item_id++)
+ {
+
+ Array<double,1,Dynamic> W_current_parameter_update;
+ int update_item = update_items[item_id];
+ W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+
+ (1.-decay)*W_gradient.row(update_item).array().square().matrix();
+
+ W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/
+ (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())*
+ W_gradient.row(update_item).array();
+
+ //cerr<<"Input: W current parameter update is "<<W_current_parameter_update<<endl;
+ //getchar();
+ W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+
+ (1.-decay)*W_current_parameter_update.square().matrix();
+
+ W->row(update_item) += learning_rate*W_current_parameter_update.matrix();
+ //cerr<<"Input: After update, W is "<<W->row(update_item)<<endl;
+ //getchar();
+ W_gradient.row(update_item).setZero();
+ }
+
+ }
+
+ template <typename DerivedGOut, typename DerivedIn, typename DerivedGW>
+ void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input,
+ const MatrixBase<DerivedIn> &input_words,
+ int x, int minibatch_size,
+ const MatrixBase<DerivedGW> &gradient) const //not sure if we want to use momentum here
+ {
+ UNCONST(DerivedGW, gradient, my_gradient);
+ int embedding_dimension = W->cols();
+ my_gradient.setZero();
+ for (int ngram=0; ngram<context_size; ngram++)
+ uscgemm(1.0,
+ USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
+ bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
+ my_gradient);
+ }
};
} // namespace nplm