
github.com/moses-smt/nplm.git
Diffstat (limited to 'src/neuralClasses.h')
-rw-r--r--  src/neuralClasses.h | 766
1 file changed, 633 insertions(+), 133 deletions(-)
diff --git a/src/neuralClasses.h b/src/neuralClasses.h
index 1b57763..949e445 100644
--- a/src/neuralClasses.h
+++ b/src/neuralClasses.h
@@ -7,7 +7,7 @@
#include <vector>
#include <boost/unordered_map.hpp>
-#include "../3rdparty/Eigen/Dense"
+#include <Eigen/Dense>
#include "maybe_omp.h"
#include "util.h"
@@ -21,16 +21,26 @@
//#define EIGEN_DONT_PARALLELIZE
//#define EIGEN_DEFAULT_TO_ROW_MAJOR
+using namespace std;
namespace nplm
{
// is this cheating?
using Eigen::Matrix;
+using Eigen::Array;
using Eigen::MatrixBase;
using Eigen::Dynamic;
typedef boost::unordered_map<int,bool> int_map;
+struct Clipper{
+ double operator() (double x) const {
+ return std::min(0.5, std::max(x,-0.5));
+ //return(x);
+ }
+};
+
+
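The Clipper functor above is applied elementwise through Eigen's unaryExpr() in the clipping code that appears (mostly commented out) throughout this patch; it clamps each entry of a gradient or parameter update to [-0.5, 0.5]. A minimal usage sketch, with illustrative names that are not part of the patch:

    Eigen::MatrixXd parameters, gradient;   // placeholders
    double learning_rate = 0.01;            // placeholder value
    // "update clipping": clamp the scaled update before applying it
    parameters += (learning_rate * gradient).array().unaryExpr(Clipper()).matrix();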
class Linear_layer
{
private:
@@ -38,6 +48,13 @@ class Linear_layer
Matrix<double,Dynamic,Dynamic> U_gradient;
Matrix<double,Dynamic,Dynamic> U_velocity;
Matrix<double,Dynamic,Dynamic> U_running_gradient;
+ Matrix<double,Dynamic,Dynamic> U_running_parameter_update;
+ // Biases
+ Matrix<double,Dynamic,1> b;
+ Matrix<double,Dynamic,1> b_velocity;
+ Matrix<double,Dynamic,1> b_running_gradient;
+ Matrix<double,Dynamic,1> b_running_parameter_update;
+ Matrix<double,Dynamic,1> b_gradient;
friend class model;
@@ -49,94 +66,222 @@ class Linear_layer
{
U.setZero(rows, cols);
U_gradient.setZero(rows, cols);
- U_running_gradient.setZero(rows, cols);
- U_velocity.setZero(rows, cols);
+ //U_running_gradient.setZero(rows, cols);
+ //U_running_parameter_updates.setZero(rows, cols);
+ //U_velocity.setZero(rows, cols);
+ b.resize(rows);
+ b_gradient.setZero(rows);
+ //b_running_gradient.resize(rows);
+ //b_velocity.resize(rows);
}
- void read(std::ifstream &U_file) { readMatrix(U_file, U); }
- void write(std::ofstream &U_file) { writeMatrix(U, U_file); }
+ void read_weights(std::ifstream &U_file) { readMatrix(U_file, U); }
+ void write_weights(std::ofstream &U_file) { writeMatrix(U, U_file); }
+ void read_biases(std::ifstream &b_file) { readMatrix(b_file, b); }
+ void write_biases(std::ofstream &b_file) { writeMatrix(b, b_file); }
+
template <typename Engine>
- void initialize(Engine &engine, bool init_normal, double init_range)
+ void initialize(Engine &engine,
+ bool init_normal,
+ double init_range,
+ string &parameter_update,
+ double adagrad_epsilon)
{
+ if (parameter_update == "ADA") {
+ U_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(U.rows(),U.cols())*adagrad_epsilon;
+ b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon;
+ }
+ if (parameter_update == "ADAD") {
+ U_running_gradient.setZero(U.rows(),U.cols());
+ b_running_gradient.setZero(b.size());
+ U_running_parameter_update.setZero(U.rows(),U.cols());
+ b_running_parameter_update.setZero(b.size());
+ }
+
initMatrix(engine, U, init_normal, init_range);
+ initBias(engine, b, init_normal, init_range);
}
int n_inputs () const { return U.cols(); }
int n_outputs () const { return U.rows(); }
- template <typename DerivedIn, typename DerivedOut>
- void fProp(const MatrixBase<DerivedIn> &input, const MatrixBase<DerivedOut> &output) const
- {
- UNCONST(DerivedOut, output, my_output);
- my_output.leftCols(input.cols()).noalias() = U*input;
- }
+ template <typename DerivedIn, typename DerivedOut>
+ void fProp(const MatrixBase<DerivedIn> &input,
+ const MatrixBase<DerivedOut> &output) const
+ {
+ UNCONST(DerivedOut, output, my_output);
+ my_output.leftCols(input.cols()).noalias() = U*input;
+ int num_examples = input.cols();
+ for (int example = 0;example < num_examples;example++)
+ {
+ my_output.leftCols(input.cols()).col(example) += b;
+ }
+ }
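The loop above adds the bias vector b to every column of my_output; each column holds one training example. In Eigen this is the same as a column-wise broadcast, so an equivalent formulation (a sketch using the same names as in fProp above) would be:

    my_output.leftCols(input.cols()).colwise() += b;  // add b to each example's column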
// Sparse input
template <typename ScalarIn, typename DerivedOut>
- void fProp(const USCMatrix<ScalarIn> &input, const MatrixBase<DerivedOut> &output_const) const
+ void fProp(const USCMatrix<ScalarIn> &input,
+ const MatrixBase<DerivedOut> &output_const) const
{
UNCONST(DerivedOut, output_const, output);
output.setZero();
uscgemm(1.0, U, input, output.leftCols(input.cols()));
+ // Each column corresponds to a training example. We
+ // parallelize the adding of biases per dimension.
+ int num_examples = input.cols();
+ for (int example = 0;example < num_examples;example++)
+ {
+ output.leftCols(input.cols()).col(example) += b;
+ }
}
- template <typename DerivedGOut, typename DerivedGIn>
- void bProp(const MatrixBase<DerivedGOut> &input, MatrixBase<DerivedGIn> &output) const
- {
+ template <typename DerivedGOut, typename DerivedGIn>
+ void bProp(const MatrixBase<DerivedGOut> &input,
+ MatrixBase<DerivedGIn> &output) const
+ {
UNCONST(DerivedGIn, output, my_output);
my_output.noalias() = U.transpose()*input;
}
- template <typename DerivedGOut, typename DerivedIn>
- void computeGradient(const MatrixBase<DerivedGOut> &bProp_input,
- const MatrixBase<DerivedIn> &fProp_input,
- double learning_rate, double momentum, double L2_reg)
+ template <typename DerivedGOut, typename DerivedIn>
+ void computeGradient( const MatrixBase<DerivedGOut> &bProp_input,
+ const MatrixBase<DerivedIn> &fProp_input,
+ double learning_rate, double momentum, double L2_reg)
+ {
+ U_gradient.noalias() = bProp_input*fProp_input.transpose();
+
+ // get the bias gradient for all dimensions in parallel
+ int size = b.size();
+ b_gradient = bProp_input.rowwise().sum();
+ // This used to be multithreaded, but there was no measurable difference
+ if (L2_reg > 0.0)
{
- U_gradient.noalias() = bProp_input*fProp_input.transpose();
-
- // This used to be multithreaded, but there was no measureable difference
- if (L2_reg > 0.0)
- {
- U_gradient *= 1 - 2*L2_reg;
- }
- if (momentum > 0.0)
- {
- U_velocity = momentum*U_velocity + U_gradient;
- U += learning_rate * U_velocity;
- }
- else
- {
- U += learning_rate * U_gradient;
- }
+ U_gradient -= 2*L2_reg*U;
+ b_gradient -= 2*L2_reg*b;
+ }
+ if (momentum > 0.0)
+ {
+ U_velocity = momentum*U_velocity + U_gradient;
+ U += learning_rate * U_velocity;
+ b_velocity = momentum*b_velocity + b_gradient;
+ b += learning_rate * b_velocity;
+ }
+ else
+ {
+ U += learning_rate * U_gradient;
+ b += learning_rate * b_gradient;
+ /*
+ //UPDATE CLIPPING
+ U += (learning_rate*U_gradient).array().unaryExpr(Clipper()).matrix();
+ b += (learning_rate*b_gradient).array().unaryExpr(Clipper()).matrix();
+ //GRADIENT CLIPPING
+ //U += learning_rate*(U_gradient.array().unaryExpr(Clipper())).matrix();
+ //b += learning_rate*(b_gradient.array().unaryExpr(Clipper())).matrix();
+ */
+ }
}
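The L2 term is now handled by subtracting the penalty gradient (U_gradient -= 2*L2_reg*U, and likewise for b) instead of scaling the whole gradient by 1 - 2*L2_reg as the old code did. Under the gradient-ascent convention used in this class (parameters are incremented by learning_rate times the gradient), this corresponds to maximizing a regularized objective J = L - \lambda(\|U\|^2 + \|b\|^2) with \lambda = L2_reg:

    \partial J/\partial U = \partial L/\partial U - 2\lambda U,
    \qquad U \leftarrow U + \eta\,(\partial L/\partial U - 2\lambda U),

with \eta = learning_rate; the bias update is analogous.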
- template <typename DerivedGOut, typename DerivedIn>
- void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input,
- const MatrixBase<DerivedIn> &fProp_input,
- double learning_rate, double momentum, double L2_reg)
- {
- U_gradient.noalias() = bProp_input*fProp_input.transpose();
+ template <typename DerivedGOut, typename DerivedIn>
+ void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input,
+ const MatrixBase<DerivedIn> &fProp_input,
+ double learning_rate,
+ double L2_reg)
+ {
+ U_gradient.noalias() = bProp_input*fProp_input.transpose();
- if (L2_reg != 0)
- {
- U_gradient *= 1 - 2*L2_reg;
- }
+
+ // get the bias gradient for all dimensions in parallel
+ int size = b.size();
+ b_gradient.noalias() = bProp_input.rowwise().sum();
- // ignore momentum?
+ if (L2_reg != 0)
+ {
+ U_gradient -= 2*L2_reg*U;
+ b_gradient -= 2*L2_reg*b;
+ }
+
+ // ignore momentum?
+ #pragma omp parallel for
+ for (int col=0; col<U.cols(); col++) {
+ U_running_gradient.col(col) += U_gradient.col(col).array().square().matrix();
+ U.col(col) += learning_rate * (U_gradient.col(col).array() /
+ U_running_gradient.col(col).array().sqrt()).matrix();
+ /*
+ //UPDATE CLIPPING
+ U.col(col) += (learning_rate * (U_gradient.col(col).array() / U_running_gradient.col(col).array().sqrt())).
+ unaryExpr(Clipper()).matrix();
+ */
+ }
+ b_running_gradient += b_gradient.array().square().matrix();
+ b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix();
+ /*
+ //UPDATE CLIPPING
+ b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix();
+ */
+ }
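Writing G for U_running_gradient (or b_running_gradient) and g for the corresponding gradient, the loop above is the standard AdaGrad update, applied elementwise:

    G \leftarrow G + g \odot g, \qquad \theta \leftarrow \theta + \eta\, g / \sqrt{G} \quad (\text{elementwise division}),

with \eta = learning_rate. When parameter_update == "ADA", initialize() seeds the accumulators with adagrad_epsilon, so the first division by \sqrt{G} is well defined.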
- U_running_gradient.array() += U_gradient.array().square();
- U.array() += learning_rate * U_gradient.array() / U_running_gradient.array().sqrt();
- }
+ template <typename DerivedGOut, typename DerivedIn>
+ void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input,
+ const MatrixBase<DerivedIn> &fProp_input,
+ double learning_rate,
+ double L2_reg,
+ double conditioning_constant,
+ double decay)
+ {
+ //cerr<<"decay is "<<decay<<" and conditioning constant is "<<conditioning_constant<<endl;
+ U_gradient.noalias() = bProp_input*fProp_input.transpose();
- template <typename DerivedGOut, typename DerivedIn, typename DerivedGW>
- void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input,
- const MatrixBase<DerivedIn> &fProp_input,
- const MatrixBase<DerivedGW> &gradient) const
- {
- UNCONST(DerivedGW, gradient, my_gradient);
- my_gradient.noalias() = bProp_input*fProp_input.transpose();
- }
+ Array<double,Dynamic,1> b_current_parameter_update;
+
+ // get the bias gradient for all dimensions in parallel
+ int size = b.size();
+ b_gradient.noalias() = bProp_input.rowwise().sum();
+
+ if (L2_reg != 0)
+ {
+ U_gradient -= 2*L2_reg*U;
+ b_gradient -= 2*L2_reg*b;
+ }
+
+ // ignore momentum?
+ #pragma omp parallel for
+ //cerr<<"U gradient is "<<U_gradient<<endl;
+ for (int col=0; col<U.cols(); col++) {
+ Array<double,Dynamic,1> U_current_parameter_update;
+ U_running_gradient.col(col) = decay*U_running_gradient.col(col) +
+ (1-decay)*U_gradient.col(col).array().square().matrix();
+ //cerr<<"U running gradient is "<<U_running_gradient.col(col)<<endl;
+ //getchar();
+ U_current_parameter_update = ((U_running_parameter_update.col(col).array()+conditioning_constant).sqrt()/
+ (U_running_gradient.col(col).array()+conditioning_constant).sqrt()) *
+ U_gradient.col(col).array();
+ //cerr<<"U current parameter update is "<<U_current_parameter_update<<endl;
+ //getchar();
+ //update the running parameter update
+ U_running_parameter_update.col(col) = decay*U_running_parameter_update.col(col) +
+ (1.-decay)*U_current_parameter_update.square().matrix();
+ U.col(col) += learning_rate*U_current_parameter_update.matrix();
+ }
+ b_running_gradient = decay*b_running_gradient +
+ (1.-decay)*b_gradient.array().square().matrix();
+ b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/
+ (b_running_gradient.array()+conditioning_constant).sqrt()) *
+ b_gradient.array();
+ b_running_parameter_update = decay*(b_running_parameter_update) +
+ (1.-decay)*b_current_parameter_update.square().matrix();
+ b += learning_rate*b_current_parameter_update.matrix();
+ }
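With \rho = decay and \epsilon = conditioning_constant, the code above applies the AdaDelta recurrences elementwise to each column of U and to b:

    E[g^2] \leftarrow \rho\, E[g^2] + (1-\rho)\, g^2
    \Delta\theta = \sqrt{E[\Delta\theta^2] + \epsilon}\,/\,\sqrt{E[g^2] + \epsilon} \cdot g
    E[\Delta\theta^2] \leftarrow \rho\, E[\Delta\theta^2] + (1-\rho)\, \Delta\theta^2
    \theta \leftarrow \theta + \eta\, \Delta\theta

The original AdaDelta formulation has no separate step size; here \eta = learning_rate is kept as an extra multiplier.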
+
+
+ template <typename DerivedGOut, typename DerivedIn, typename DerivedGW>
+ void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input,
+ const MatrixBase<DerivedIn> &fProp_input,
+ const MatrixBase<DerivedGW> &gradient) const
+ {
+ UNCONST(DerivedGW, gradient, my_gradient);
+ my_gradient.noalias() = bProp_input*fProp_input.transpose();
+ }
};
class Output_word_embeddings
@@ -149,10 +294,12 @@ class Output_word_embeddings
Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W;
std::vector<double> W_data;
Matrix<double,Dynamic,1> b;
- Matrix<double,Dynamic,Dynamic> W_running_gradient;
- Matrix<double,Dynamic,Dynamic> W_gradient;
+ Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient;
+ Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient;
+ Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update;
Matrix<double,Dynamic,1> b_running_gradient;
Matrix<double,Dynamic,1> b_gradient;
+ Matrix<double,Dynamic,1> b_running_parameter_update;
public:
Output_word_embeddings() { }
@@ -160,8 +307,8 @@ class Output_word_embeddings
void resize(int rows, int cols)
{
- W->setZero(rows, cols);
- b.setZero(rows);
+ W->setZero(rows, cols);
+ b.setZero(rows);
}
void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) {
W = input_W;
@@ -172,8 +319,31 @@ class Output_word_embeddings
void write_biases(std::ofstream &b_file) { writeMatrix(b, b_file); }
template <typename Engine>
- void initialize(Engine &engine, bool init_normal, double init_range, double init_bias)
+ void initialize(Engine &engine,
+ bool init_normal,
+ double init_range,
+ double init_bias,
+ string &parameter_update,
+ double adagrad_epsilon)
{
+
+ W_gradient.setZero(W->rows(),W->cols());
+ b_gradient.setZero(b.size());
+ if (parameter_update == "ADA") {
+ W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon;
+ b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon;
+ //W_gradient.setZero(W->rows(),W->cols());
+ //b_gradient.setZero(b.size());
+ }
+ if (parameter_update == "ADAD") {
+ W_running_gradient.setZero(W->rows(),W->cols());
+ b_running_gradient.setZero(b.size());
+ W_gradient.setZero(W->rows(),W->cols());
+ //b_gradient.setZero(b.size());
+ //W_running_parameter_update.setZero(W->rows(),W->cols());
+ b_running_parameter_update.setZero(b.size());
+ }
+
initMatrix(engine, *W, init_normal, init_range);
b.fill(init_bias);
}
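initialize() now also chooses which optimizer state to allocate: "ADA" seeds the AdaGrad accumulators with adagrad_epsilon, "ADAD" zero-initializes the AdaDelta running averages, and any other value leaves only the plain SGD/momentum state. A usage sketch; the variable names, engine type, and numeric values are placeholders, not taken from this patch:

    boost::random::mt19937 rng;
    std::string update_type = "ADAD";    // or "ADA"; anything else means plain SGD
    output_layer.initialize(rng,
                            true,        // init_normal
                            0.05,        // init_range (placeholder)
                            0.0,         // init_bias (placeholder)
                            update_type, // non-const string&, so pass a named variable
                            1e-3);       // adagrad_epsilon (placeholder)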
@@ -198,8 +368,12 @@ class Output_word_embeddings
UNCONST(DerivedOutV, output, my_output);
#pragma omp parallel for
for (int instance_id = 0; instance_id < samples.cols(); instance_id++)
- for (int sample_id = 0; sample_id < samples.rows(); sample_id++)
- my_output(sample_id, instance_id) = b(samples(sample_id, instance_id));
+ {
+ for (int sample_id = 0; sample_id < samples.rows(); sample_id++)
+ {
+ my_output(sample_id, instance_id) = b(samples(sample_id, instance_id));
+ }
+ }
USCMatrix<double> sparse_output(W->rows(), samples, my_output);
uscgemm_masked(1.0, *W, input, sparse_output);
my_output = sparse_output.values; // too bad, so much copying
@@ -232,15 +406,86 @@ class Output_word_embeddings
void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings,
const MatrixBase<DerivedGOut> &bProp_input,
double learning_rate,
- double momentum) //not sure if we want to use momentum here
+ double momentum) //not sure if we want to use momentum here
{
// W is vocab_size x output_embedding_dimension
// b is vocab_size x 1
// predicted_embeddings is output_embedding_dimension x minibatch_size
// bProp_input is vocab_size x minibatch_size
-
W->noalias() += learning_rate * bProp_input * predicted_embeddings.transpose();
b += learning_rate * bProp_input.rowwise().sum();
+
+ /*
+ //GRADIENT CLIPPING
+ W->noalias() += learning_rate *
+ ((bProp_input * predicted_embeddings.transpose()).array().unaryExpr(Clipper())).matrix();
+ b += learning_rate * (bProp_input.rowwise().sum().array().unaryExpr(Clipper())).matrix();
+ //UPDATE CLIPPING
+ W->noalias() += (learning_rate *
+ (bProp_input * predicted_embeddings.transpose())).array().unaryExpr(Clipper()).matrix();
+ b += (learning_rate * (bProp_input.rowwise().sum())).array().unaryExpr(Clipper()).matrix();
+ */
+ }
+
+ template <typename DerivedIn, typename DerivedGOut>
+ void computeGradientAdagrad(
+ const MatrixBase<DerivedIn> &predicted_embeddings,
+ const MatrixBase<DerivedGOut> &bProp_input,
+ double learning_rate) //not sure if we want to use momentum here
+ {
+ // W is vocab_size x output_embedding_dimension
+ // b is vocab_size x 1
+ // predicted_embeddings is output_embedding_dimension x minibatch_size
+ // bProp_input is vocab_size x minibatch_size
+ W_gradient.setZero(W->rows(), W->cols());
+ b_gradient.setZero(b.size());
+ W_gradient.noalias() = bProp_input * predicted_embeddings.transpose();
+ b_gradient.noalias() = bProp_input.rowwise().sum();
+ W_running_gradient += W_gradient.array().square().matrix();
+ b_running_gradient += b_gradient.array().square().matrix();
+ W->noalias() += learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt()).matrix();
+ b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix();
+ /*
+ //UPDATE CLIPPING
+ *W += (learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix();
+ b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix();
+ */
+ }
+
+ template <typename DerivedIn, typename DerivedGOut>
+ void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings,
+ const MatrixBase<DerivedGOut> &bProp_input,
+ double learning_rate,
+ double conditioning_constant,
+ double decay) //not sure if we want to use momentum here
+ {
+ // W is vocab_size x output_embedding_dimension
+ // b is vocab_size x 1
+ // predicted_embeddings is output_embedding_dimension x minibatch_size
+ // bProp_input is vocab_size x minibatch_size
+ Array<double,Dynamic,Dynamic> W_current_parameter_update;
+ Array<double,Dynamic,1> b_current_parameter_update;
+ W_gradient.setZero(W->rows(), W->cols());
+ b_gradient.setZero(b.size());
+ W_gradient.noalias() = bProp_input * predicted_embeddings.transpose();
+ b_gradient.noalias() = bProp_input.rowwise().sum();
+ W_running_gradient = decay*W_running_gradient +
+ (1.-decay)*W_gradient.array().square().matrix();
+ b_running_gradient = decay*b_running_gradient+
+ (1.-decay)*b_gradient.array().square().matrix();
+ W_current_parameter_update = ((W_running_parameter_update.array()+conditioning_constant).sqrt()/
+ (W_running_gradient.array()+conditioning_constant).sqrt())*
+ W_gradient.array();
+ b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/
+ (b_running_gradient.array()+conditioning_constant).sqrt())*
+ b_gradient.array();
+ W_running_parameter_update = decay*W_running_parameter_update +
+ (1.-decay)*W_current_parameter_update.square().matrix();
+ b_running_parameter_update = decay*b_running_parameter_update +
+ (1.-decay)*b_current_parameter_update.square().matrix();
+
+ *W += learning_rate*W_current_parameter_update.matrix();
+ b += learning_rate*b_current_parameter_update.matrix();
}
// Sparse versions
@@ -264,6 +509,7 @@ class Output_word_embeddings
const MatrixBase<DerivedGOutV> &weights,
double learning_rate, double momentum) //not sure if we want to use momentum here
{
+ //cerr<<"in gradient"<<endl;
USCMatrix<double> gradient_output(W->rows(), samples, weights);
uscgemm(learning_rate,
gradient_output,
@@ -273,27 +519,64 @@ class Output_word_embeddings
gradient_output,
Matrix<double,Dynamic,1>::Ones(gradient_output.cols()),
b);
+ /*
+ //IN ORDER TO IMPLEMENT CLIPPING, WE HAVE TO COMPUTE THE GRADIENT
+ //FIRST
+ USCMatrix<double> gradient_output(W->rows(), samples, weights);
+ uscgemm(1.0,
+ gradient_output,
+ predicted_embeddings.leftCols(samples.cols()).transpose(),
+ W_gradient);
+ uscgemv(1.0,
+ gradient_output,
+ Matrix<double,Dynamic,1>::Ones(weights.cols()),
+ b_gradient);
+
+ int_map update_map; //stores all the parameters that have been updated
+ for (int sample_id=0; sample_id<samples.rows(); sample_id++)
+ for (int train_id=0; train_id<samples.cols(); train_id++)
+ update_map[samples(sample_id, train_id)] = 1;
+
+ // Convert to std::vector for parallelization
+ std::vector<int> update_items;
+ for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
+ update_items.push_back(it->first);
+ int num_items = update_items.size();
+
+ //#pragma omp parallel for
+ for (int item_id=0; item_id<num_items; item_id++)
+ {
+ int update_item = update_items[item_id];
+ //W->row(update_item) += learning_rate * W_gradient.row(update_item);
+ //b(update_item) += learning_rate * b_gradient(update_item);
+ //UPDATE CLIPPING
+ W->row(update_item) += (learning_rate * W_gradient.row(update_item)).array().unaryExpr(Clipper()).matrix();
+ double update = learning_rate * b_gradient(update_item);
+ b(update_item) += std::min(0.5, std::max(update,-0.5));
+ //GRADIENT CLIPPING
+ W_gradient.row(update_item).setZero();
+ b_gradient(update_item) = 0.;
+ }
+ */
+ //cerr<<"Finished gradient"<<endl;
}
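In this sparse path, gradient_output is a USCMatrix holding, for each minibatch column, the sampled vocabulary rows and their weights. Assuming uscgemm and uscgemv accumulate C += alpha*A*B and y += alpha*A*x, these two calls are the sparse counterpart of the dense computeGradient above, roughly:

    // dense analogue, sketch only (names as in this function):
    // *W += learning_rate * gradient_output * predicted_embeddings.transpose();
    // b  += learning_rate * gradient_output * Matrix<double,Dynamic,1>::Ones(gradient_output.cols());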
template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV>
void computeGradientAdagrad(const MatrixBase<DerivedIn> &predicted_embeddings,
const MatrixBase<DerivedGOutI> &samples,
const MatrixBase<DerivedGOutV> &weights,
- double learning_rate, double momentum) //not sure if we want to use momentum here
+ double learning_rate) //not sure if we want to use momentum here
{
- W_gradient.setZero(W->rows(), W->cols());
- b_gradient.setZero(b.size());
- if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols())
- W_running_gradient.setZero(W->rows(), W->cols());
- if (b_running_gradient.size() != b.size())
- b_running_gradient.setZero(b.size());
-
+ //W_gradient.setZero(W->rows(), W->cols());
+ //b_gradient.setZero(b.size());
+ //FOR CLIPPING, WE DO NOT MULTIPLY THE GRADIENT WITH THE LEARNING RATE
USCMatrix<double> gradient_output(W->rows(), samples, weights);
- uscgemm(learning_rate,
+ uscgemm(1.0,
gradient_output,
predicted_embeddings.leftCols(samples.cols()).transpose(),
W_gradient);
- uscgemv(learning_rate, gradient_output,
+ uscgemv(1.0,
+ gradient_output,
Matrix<double,Dynamic,1>::Ones(weights.cols()),
b_gradient);
@@ -308,16 +591,98 @@ class Output_word_embeddings
update_items.push_back(it->first);
int num_items = update_items.size();
- #pragma omp parallel for
+ //#pragma omp parallel for
for (int item_id=0; item_id<num_items; item_id++)
{
int update_item = update_items[item_id];
- W_running_gradient.row(update_item).array() += W_gradient.row(update_item).array().square();
+ W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix();
b_running_gradient(update_item) += b_gradient(update_item) * b_gradient(update_item);
- W->row(update_item).array() += learning_rate * W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt();
+ W->row(update_item) += learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix();
b(update_item) += learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item));
+ /*
+ //UPDATE CLIPPING
+ W->row(update_item) += (learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt())).unaryExpr(Clipper()).matrix();
+ double update = learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item));
+ b(update_item) += Clipper()(update); //std::min(0.5, std::max(update,-0.5));
+ */
+ W_gradient.row(update_item).setZero();
+ b_gradient(update_item) = 0.;
}
+ }
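Only the vocabulary rows that occur in samples are updated: the int_map collects them, the loop applies AdaGrad row by row, and the touched rows of W_gradient and b_gradient are reset to zero so the persistent gradient buffers can be reused by the next minibatch. Per touched row i, the step is:

    G_i \leftarrow G_i + g_i \odot g_i, \qquad W_i \leftarrow W_i + \eta\, g_i / \sqrt{G_i}, \qquad g_i \leftarrow 0,

and the same scalar update is applied to b(i).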
+
+ template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV>
+ void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings,
+ const MatrixBase<DerivedGOutI> &samples,
+ const MatrixBase<DerivedGOutV> &weights,
+ double learning_rate,
+ double conditioning_constant,
+ double decay) //not sure if we want to use momentum here
+ {
+ //cerr<<"decay is "<<decay<<" and constant is "<<conditioning_constant<<endl;
+ //W_gradient.setZero(W->rows(), W->cols());
+ //b_gradient.setZero(b.size());
+
+ USCMatrix<double> gradient_output(W->rows(), samples, weights);
+ uscgemm(1.0,
+ gradient_output,
+ predicted_embeddings.leftCols(samples.cols()).transpose(),
+ W_gradient);
+ uscgemv(1.0,
+ gradient_output,
+ Matrix<double,Dynamic,1>::Ones(weights.cols()),
+ b_gradient);
+
+ int_map update_map; //stores all the parameters that have been updated
+ for (int sample_id=0; sample_id<samples.rows(); sample_id++)
+ for (int train_id=0; train_id<samples.cols(); train_id++)
+ update_map[samples(sample_id, train_id)] = 1;
+
+ // Convert to std::vector for parallelization
+ std::vector<int> update_items;
+ for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
+ update_items.push_back(it->first);
+ int num_items = update_items.size();
+
+ #pragma omp parallel for
+ for (int item_id=0; item_id<num_items; item_id++)
+ {
+ Array<double,1,Dynamic> W_current_parameter_update;
+ double b_current_parameter_update;
+
+ int update_item = update_items[item_id];
+ W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+
+ (1.-decay)*W_gradient.row(update_item).array().square().matrix();
+ b_running_gradient(update_item) = decay*b_running_gradient(update_item)+
+ (1.-decay)*b_gradient(update_item)*b_gradient(update_item);
+ //cerr<<"Output: W gradient is "<<W_gradient.row(update_item)<<endl;
+ //getchar();
+
+ //cerr<<"Output: W running gradient is "<<W_running_gradient.row(update_item)<<endl;
+ //getchar();
+ W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/
+ (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())*
+ W_gradient.row(update_item).array();
+ b_current_parameter_update = (sqrt(b_running_parameter_update(update_item)+conditioning_constant)/
+ sqrt(b_running_gradient(update_item)+conditioning_constant))*
+ b_gradient(update_item);
+ //cerr<<"Output: W current parameter update is "<<W_current_parameter_update<<endl;
+ //getchar();
+ //cerr<<"Output: W running parameter update before is "<<W_running_parameter_update.row(update_item)<<endl;
+ //getchar();
+ //cerr<<"the second term is "<<(1.-decay)*W_current_parameter_update.square().matrix()<<endl;
+ W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+
+ (1.-decay)*(W_current_parameter_update.square().matrix());
+ b_running_parameter_update(update_item) = decay*b_running_parameter_update(update_item)+
+ (1.-decay)*b_current_parameter_update*b_current_parameter_update;
+ //cerr<<"Output: W running parameter update is "<<W_running_parameter_update.row(update_item)<<endl;
+ //getchar();
+ W->row(update_item) += learning_rate*W_current_parameter_update.matrix();
+ b(update_item) += learning_rate*b_current_parameter_update;
+ W_gradient.row(update_item).setZero();
+ b_gradient(update_item) = 0.;
}
+ }
+
template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV, typename DerivedGW, typename DerivedGb>
void computeGradientCheck(const MatrixBase<DerivedIn> &predicted_embeddings,
@@ -345,8 +710,9 @@ class Input_word_embeddings
private:
Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W;
int context_size, vocab_size;
- Matrix<double,Dynamic,Dynamic> W_running_gradient;
- Matrix<double,Dynamic,Dynamic> W_gradient;
+ Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient;
+ Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update;
+ Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient;
friend class model;
@@ -354,29 +720,44 @@ class Input_word_embeddings
Input_word_embeddings() : context_size(0), vocab_size(0) { }
Input_word_embeddings(int rows, int cols, int context) { resize(rows, cols, context); }
- void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) {
- W = input_W;
- }
+ void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) {
+ W = input_W;
+ }
- void resize(int rows, int cols, int context)
- {
- context_size = context;
- vocab_size = rows;
- W->setZero(rows, cols);
- }
+ void resize(int rows, int cols, int context)
+ {
+ context_size = context;
+ vocab_size = rows;
+ W->setZero(rows, cols);
+ }
void read(std::ifstream &W_file) { readMatrix(W_file, *W); }
void write(std::ofstream &W_file) { writeMatrix(*W, W_file); }
- template <typename Engine>
- void initialize(Engine &engine, bool init_normal, double init_range)
- {
- initMatrix(engine,
- *W,
- init_normal,
- init_range);
+ template <typename Engine>
+ void initialize(Engine &engine,
+ bool init_normal,
+ double init_range,
+ string &parameter_update,
+ double adagrad_epsilon)
+ {
+ W_gradient.setZero(W->rows(),W->cols());
+
+ if (parameter_update == "ADA") {
+ W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon;
+ //W_gradient.setZero(W->rows(),W->cols());
+ }
+ if (parameter_update == "ADAD") {
+ W_running_gradient.setZero(W->rows(),W->cols());
+ //W_gradient.setZero(W->rows(),W->cols());
+ W_running_parameter_update.setZero(W->rows(),W->cols());
}
-
+ initMatrix(engine,
+ *W,
+ init_normal,
+ init_range);
+ }
+
int n_inputs() const { return -1; }
int n_outputs() const { return W->cols() * context_size; }
@@ -436,7 +817,7 @@ class Input_word_embeddings
const MatrixBase<DerivedIn> &input_words,
double learning_rate, double momentum, double L2_reg)
{
- int embedding_dimension = W->cols();
+ int embedding_dimension = W->cols();
// W is vocab_size x embedding_dimension
// input is ngram_size*vocab_size x minibatch_size
@@ -453,59 +834,177 @@ class Input_word_embeddings
uscgemm(learning_rate,
USCMatrix<double>(W->rows(), input_words.middleRows(ngram, 1), Matrix<double,1,Dynamic>::Ones(input_words.cols())),
bProp_input.block(ngram*embedding_dimension,0,embedding_dimension,input_words.cols()).transpose(),
- *W);
+ *W);
+ }
+
+ /*
+ //IF WE WANT TO DO GRADIENT CLIPPING, THEN WE FIRST COMPUTE THE GRADIENT AND THEN
+ //PERFORM CLIPPING WHILE UPDATING
+
+ for (int ngram=0; ngram<context_size; ngram++)
+ {
+ uscgemm(1.0,
+ USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
+ bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
+ W_gradient);
}
+ int_map update_map; //stores all the parameters that have been updated
+ for (int ngram=0; ngram<context_size; ngram++)
+ {
+ for (int train_id=0; train_id<input_words.cols(); train_id++)
+ {
+ update_map[input_words(ngram,train_id)] = 1;
+ }
+ }
+
+ // Convert to std::vector for parallelization
+ std::vector<int> update_items;
+ for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
+ {
+ update_items.push_back(it->first);
+ }
+ int num_items = update_items.size();
+
+ #pragma omp parallel for
+ for (int item_id=0; item_id<num_items; item_id++)
+ {
+ int update_item = update_items[item_id];
+ //UPDATE CLIPPING
+ W->row(update_item) += (learning_rate*
+ W_gradient.row(update_item).array().unaryExpr(Clipper())).matrix();
+ //GRADIENT CLIPPING
+ //W->row(update_item) += learning_rate*
+ // W_gradient.row(update_item).array().unaryExpr(Clipper()).matrix();
+ //SETTING THE GRADIENT TO ZERO
+ W_gradient.row(update_item).setZero();
+ }
+ */
}
template <typename DerivedGOut, typename DerivedIn>
void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input,
const MatrixBase<DerivedIn> &input_words,
- double learning_rate, double momentum, double L2_reg)
+ double learning_rate,
+ double L2_reg)
{
int embedding_dimension = W->cols();
-
- W_gradient.setZero(W->rows(), W->cols());
- if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols())
- W_running_gradient.setZero(W->rows(), W->cols());
-
+ //W_gradient.setZero(W->rows(), W->cols());
+ /*
+ if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols())
+ W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon;
+ */
for (int ngram=0; ngram<context_size; ngram++)
{
- uscgemm(learning_rate,
+ uscgemm(1.0,
USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
W_gradient);
}
+ int_map update_map; //stores all the parameters that have been updated
+ for (int ngram=0; ngram<context_size; ngram++)
+ {
+ for (int train_id=0; train_id<input_words.cols(); train_id++)
+ {
+ update_map[input_words(ngram,train_id)] = 1;
+ }
+ }
+
+ // Convert to std::vector for parallelization
+ std::vector<int> update_items;
+ for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
+ {
+ update_items.push_back(it->first);
+ }
+ int num_items = update_items.size();
+
+ #pragma omp parallel for
+ for (int item_id=0; item_id<num_items; item_id++)
+ {
+ int update_item = update_items[item_id];
+ W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix();
+ W->row(update_item) += learning_rate *
+ (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix();
+ /*
+ //UPDATE CLIPPING
+ W->row(update_item) += (learning_rate *
+ (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()))
+ .unaryExpr(Clipper()).matrix();
+ */
+ W_gradient.row(update_item).setZero();
+ }
+ }
- int_map update_map; //stores all the parameters that have been updated
+ template <typename DerivedGOut, typename DerivedIn>
+ void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input,
+ const MatrixBase<DerivedIn> &input_words,
+ double learning_rate,
+ double L2_reg,
+ double conditioning_constant,
+ double decay)
+ {
+ int embedding_dimension = W->cols();
- for (int train_id=0; train_id<input_words.cols(); train_id++)
- {
- update_map[input_words(train_id)] = 1;
- }
+ //W_gradient.setZero(W->rows(), W->cols());
+ /*
+ if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols())
+ W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon;
+ */
+ for (int ngram=0; ngram<context_size; ngram++)
+ {
+ uscgemm(1.0,
+ USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
+ bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
+ W_gradient);
+ }
+ int_map update_map; //stores all the parameters that have been updated
+ for (int ngram=0; ngram<context_size; ngram++)
+ {
+ for (int train_id=0; train_id<input_words.cols(); train_id++)
+ {
+ update_map[input_words(ngram,train_id)] = 1;
+ }
+ }
// Convert to std::vector for parallelization
- std::vector<int> update_items;
- for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
- {
- update_items.push_back(it->first);
- }
- int num_items = update_items.size();
-
- #pragma omp parallel for
- for (int item_id=0; item_id<num_items; item_id++)
- {
- int update_item = update_items[item_id];
- W_running_gradient.row(update_item).array() += W_gradient.row(update_item).array().square();
- W->row(update_item).array() += learning_rate * W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt();
- }
+ std::vector<int> update_items;
+ for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
+ {
+ update_items.push_back(it->first);
}
+ int num_items = update_items.size();
- template <typename DerivedGOut, typename DerivedIn, typename DerivedGW>
- void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input,
- const MatrixBase<DerivedIn> &input_words,
- int x, int minibatch_size,
- const MatrixBase<DerivedGW> &gradient) const //not sure if we want to use momentum here
+ #pragma omp parallel for
+ for (int item_id=0; item_id<num_items; item_id++)
{
+
+ Array<double,1,Dynamic> W_current_parameter_update;
+ int update_item = update_items[item_id];
+ W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+
+ (1.-decay)*W_gradient.row(update_item).array().square().matrix();
+
+ W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/
+ (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())*
+ W_gradient.row(update_item).array();
+
+ //cerr<<"Input: W current parameter update is "<<W_current_parameter_update<<endl;
+ //getchar();
+ W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+
+ (1.-decay)*W_current_parameter_update.square().matrix();
+
+ W->row(update_item) += learning_rate*W_current_parameter_update.matrix();
+ //cerr<<"Input: After update, W is "<<W->row(update_item)<<endl;
+ //getchar();
+ W_gradient.row(update_item).setZero();
+ }
+
+ }
+
+ template <typename DerivedGOut, typename DerivedIn, typename DerivedGW>
+ void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input,
+ const MatrixBase<DerivedIn> &input_words,
+ int x, int minibatch_size,
+ const MatrixBase<DerivedGW> &gradient) const //not sure if we want to use momentum here
+ {
UNCONST(DerivedGW, gradient, my_gradient);
int embedding_dimension = W->cols();
my_gradient.setZero();
@@ -514,7 +1013,8 @@ class Input_word_embeddings
USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
my_gradient);
- }
+ }
};
} // namespace nplm
+