github.com/moses-smt/nplm.git
author     Rico Sennrich <rico.sennrich@gmx.ch>   2015-08-27 16:12:52 +0300
committer  Rico Sennrich <rico.sennrich@gmx.ch>   2015-08-27 16:12:52 +0300
commit     9dea3fe1329ce392b9e420c89663f7bddabb068a (patch)
tree       cec3017bade98c13fee73edd817d228cf4214abe
parent     55bf63ddd73978b46e19f3d8c5606d5677cf560f (diff)
Makefile option (-DNPLM_DOUBLE_PRECISION) to switch between double and float.
-rw-r--r--  src/Activation_function.h     14
-rw-r--r--  src/Makefile                   3
-rw-r--r--  src/SoftmaxLoss.h             22
-rw-r--r--  src/USCMatrix.h                8
-rw-r--r--  src/clipper.h                  2
-rw-r--r--  src/graphClasses.h             8
-rw-r--r--  src/model.cpp                  8
-rw-r--r--  src/model.h                   18
-rw-r--r--  src/neuralClasses.h          190
-rw-r--r--  src/neuralLM.h                 6
-rw-r--r--  src/neuralNetwork.h           20
-rw-r--r--  src/neuralTM.h                 6
-rw-r--r--  src/param.h                   24
-rw-r--r--  src/propagator.h              28
-rw-r--r--  src/testNeuralLM.cpp          12
-rw-r--r--  src/testNeuralNetwork.cpp      8
-rw-r--r--  src/trainNeuralNetwork.cpp    56
19 files changed, 236 insertions, 221 deletions
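
Note: the whole change hinges on a single preprocessor-selected typedef, added identically to src/param.h and src/util.h (see those hunks below); the rest of the diff replaces double with user_data_t in matrix types and function signatures. A condensed view of the mechanism:

    #ifdef NPLM_DOUBLE_PRECISION
    typedef double user_data_t;
    #else
    typedef float user_data_t;
    #endif

    // every former "double" in the hunks below becomes user_data_t, e.g.
    user_data_t logadd(user_data_t x, user_data_t y);
    Eigen::Matrix<user_data_t,Eigen::Dynamic,Eigen::Dynamic> scores;
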
diff --git a/src/Activation_function.h b/src/Activation_function.h
index 742c2fc..2eec74d 100644
--- a/src/Activation_function.h
+++ b/src/Activation_function.h
@@ -43,27 +43,27 @@ inline std::string activation_function_to_string (activation_function_type f)
}
struct hardtanh_functor {
- double operator() (double x) const { if (x < -1.) return -1.; else if (x > 1.) return 1.; else return x; }
+ user_data_t operator() (user_data_t x) const { if (x < -1.) return -1.; else if (x > 1.) return 1.; else return x; }
};
struct dhardtanh_functor {
- double operator() (double x) const { return x > -1. && x < 1. ? 1. : 0.; }
+ user_data_t operator() (user_data_t x) const { return x > -1. && x < 1. ? 1. : 0.; }
};
struct tanh_functor {
- double operator() (double x) const { return std::tanh(x); }
+ user_data_t operator() (user_data_t x) const { return std::tanh(x); }
};
struct dtanh_functor {
- double operator() (double x) const { return 1-x*x; }
+ user_data_t operator() (user_data_t x) const { return 1-x*x; }
};
struct rectifier_functor {
- double operator() (double x) const { return std::max(x, 0.); }
+ user_data_t operator() (user_data_t x) const { return std::max(x, 0.); }
};
struct drectifier_functor {
- double operator() (double x) const { return x > 0. ? 1. : 0.; }
+ user_data_t operator() (user_data_t x) const { return x > 0. ? 1. : 0.; }
};
class Activation_function
@@ -78,7 +78,7 @@ class Activation_function
void set_activation_function(activation_function_type f) { this->f = f; }
template <typename Engine>
- void initialize(Engine &engine, bool init_normal, double init_range) { }
+ void initialize(Engine &engine, bool init_normal, user_data_t init_range) { }
int n_inputs () const { return size; }
int n_outputs () const { return size; }
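
Note (an observation about the single-precision configuration, not part of the commit): std::min and std::max deduce one template type, so a call such as std::max(x, 0.) mixing a float x with a double literal does not compile. With user_data_t defined as float, rectifier_functor above (and the Clipper functors further down) would need user_data_t-typed constants, roughly:

    struct rectifier_functor {
      user_data_t operator() (user_data_t x) const {
        // cast the literal so std::max sees a single type when user_data_t is float
        return std::max(x, static_cast<user_data_t>(0.));
      }
    };
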
diff --git a/src/Makefile b/src/Makefile
index 2a27405..fd2a665 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -8,6 +8,9 @@ CXX=g++
#CFLAGS=-g
CFLAGS=-O3 -DEIGEN_NO_DEBUG -DNDEBUG $(CXXFLAGS)
+# switch between single precision and double precision (single precision recommended on GPU for speed)
+CFLAGS+= -DNPLM_DOUBLE_PRECISION=0
+
# Architecture. Set to x86_64 or i686 to override.
ARCH:=$(shell uname -m)
# Operating system. Set to override (the only option that makes any difference is Darwin).
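
Note on usage: src/param.h and src/util.h (below) test the macro with #ifdef, so any definition of NPLM_DOUBLE_PRECISION selects double, including the -DNPLM_DOUBLE_PRECISION=0 added here; to build with single precision under this scheme the define has to be removed from CFLAGS altogether. A value-sensitive guard (a hypothetical alternative, not what this commit does) would read:

    #if defined(NPLM_DOUBLE_PRECISION) && NPLM_DOUBLE_PRECISION
    typedef double user_data_t;
    #else
    typedef float user_data_t;
    #endif
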
diff --git a/src/SoftmaxLoss.h b/src/SoftmaxLoss.h
index d89cde6..cdd66c8 100644
--- a/src/SoftmaxLoss.h
+++ b/src/SoftmaxLoss.h
@@ -40,16 +40,16 @@ inline std::string loss_function_to_string (loss_function_type f)
struct SoftmaxLogLoss
{
template <typename DerivedI, typename DerivedW, typename DerivedO>
- void fProp(const MatrixBase<DerivedI> &input, const MatrixBase<DerivedW> &output_words, const MatrixBase<DerivedO> &output_const, double &loss)
+ void fProp(const MatrixBase<DerivedI> &input, const MatrixBase<DerivedW> &output_words, const MatrixBase<DerivedO> &output_const, user_data_t &loss)
{
UNCONST(DerivedO, output_const, output);
- double log_likelihood = 0.0;
+ user_data_t log_likelihood = 0.0;
#pragma omp parallel for reduction(+:log_likelihood)
for (int train_id = 0; train_id < input.cols(); train_id++)
{
- double normalization = logsum(input.col(train_id));
+ user_data_t normalization = logsum(input.col(train_id));
output.col(train_id).array() = input.col(train_id).array() - normalization;
log_likelihood += output(output_words(train_id), train_id);
}
@@ -91,12 +91,12 @@ class SoftmaxNCELoss
template <typename DerivedI, typename DerivedW, typename DerivedO>
void fProp(const MatrixBase<DerivedI> &scores,
const MatrixBase<DerivedW> &minibatch_samples,
- const MatrixBase<DerivedO> &output_const, double &loss)
+ const MatrixBase<DerivedO> &output_const, user_data_t &loss)
{
UNCONST(DerivedO, output_const, output);
- double log_likelihood = 0.0;
+ user_data_t log_likelihood = 0.0;
int num_noise_samples = minibatch_samples.rows()-1;
- double log_num_noise_samples = std::log(num_noise_samples);
+ user_data_t log_num_noise_samples = std::log(num_noise_samples);
#pragma omp parallel for reduction(+:log_likelihood) schedule(static)
for (int train_id = 0; train_id < scores.cols(); train_id++)
{
@@ -106,11 +106,11 @@ class SoftmaxNCELoss
// To avoid zero or infinite probabilities,
// never take exp of score without normalizing first,
// even if it's a little slower...
- double score = scores(sample_id, train_id);
- double score_noise = log_num_noise_samples + unigram.logprob(sample);
- double z = logadd(score, score_noise);
- double logprob = score - z;
- double logprob_noise = score_noise - z;
+ user_data_t score = scores(sample_id, train_id);
+ user_data_t score_noise = log_num_noise_samples + unigram.logprob(sample);
+ user_data_t z = logadd(score, score_noise);
+ user_data_t logprob = score - z;
+ user_data_t logprob_noise = score_noise - z;
output(sample_id, train_id) = std::exp(logprob);
log_likelihood += sample_id == 0 ? logprob : logprob_noise;
}
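
For reference, the NCE branch above works entirely in the log domain: with model score s, k noise samples and unigram noise distribution q, it computes log p(data | w) = s - logadd(s, log k + log q(w)), i.e. p(data | w) = exp(s) / (exp(s) + k*q(w)), and accumulates that term for the true word (sample_id == 0) and the corresponding log p(noise | w) for the noise samples. A standalone restatement of the per-sample step (a hypothetical helper using the same logadd identity as src/util.cpp):

    #include <algorithm>
    #include <cmath>

    // log p(data | w) for one sample: s = model score, log_q = log unigram prob, k = number of noise samples
    inline double nce_logprob_data(double s, double log_q, int k)
    {
        double score_noise = std::log(static_cast<double>(k)) + log_q;  // log(k * q(w))
        double hi = std::max(s, score_noise);
        double lo = std::min(s, score_noise);
        double z  = hi + std::log1p(std::exp(lo - hi));                 // logadd(s, score_noise)
        return s - z;                                                   // log p(data | w)
    }
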
diff --git a/src/USCMatrix.h b/src/USCMatrix.h
index 784fa1b..821703a 100644
--- a/src/USCMatrix.h
+++ b/src/USCMatrix.h
@@ -70,7 +70,7 @@ class USCMatrix
// Dense matrix - sparse matrix product
// a is presumably very wide
template <typename DerivedA, typename ScalarB, typename Index, typename DerivedC>
-void uscgemm(double alpha, const MatrixBase<DerivedA> &a,
+void uscgemm(user_data_t alpha, const MatrixBase<DerivedA> &a,
const USCMatrix<ScalarB,Index> &b,
const MatrixBase<DerivedC> &c_const)
{
@@ -92,7 +92,7 @@ void uscgemm(double alpha, const MatrixBase<DerivedA> &a,
// sparse matrix - dense matrix product
template <typename ScalarA, typename Index, typename DerivedB, typename DerivedC>
-void uscgemm(double alpha,
+void uscgemm(user_data_t alpha,
const USCMatrix<ScalarA,Index> &a,
const MatrixBase<DerivedB> &b,
const MatrixBase<DerivedC> &c_const)
@@ -145,7 +145,7 @@ void uscgemm(double alpha,
// For b, column-major is preferred.
template <typename DerivedA, typename DerivedB, typename ScalarC, typename Index>
-void uscgemm_masked(double alpha,
+void uscgemm_masked(user_data_t alpha,
const MatrixBase<DerivedA> &a,
const MatrixBase<DerivedB> &b,
USCMatrix<ScalarC,Index> &c)
@@ -167,7 +167,7 @@ void uscgemm_masked(double alpha,
// sparse matrix - dense vector product
template <typename ScalarA, typename Index, typename DerivedB, typename DerivedC>
-void uscgemv(double alpha,
+void uscgemv(user_data_t alpha,
const USCMatrix<ScalarA,Index> &a,
const MatrixBase<DerivedB> &b,
const MatrixBase<DerivedC> &c_const)
diff --git a/src/clipper.h b/src/clipper.h
index dda5c4d..e8c96a6 100644
--- a/src/clipper.h
+++ b/src/clipper.h
@@ -3,7 +3,7 @@
namespace nplm {
struct Clipper{
- double operator() (double x) const {
+ user_data_t operator() (user_data_t x) const {
return std::min(0.5, std::max(x,-0.5));
//return(x);
}
diff --git a/src/graphClasses.h b/src/graphClasses.h
index cd80a4c..1282048 100644
--- a/src/graphClasses.h
+++ b/src/graphClasses.h
@@ -14,8 +14,8 @@ class Node {
X * param; //what parameter is this
//vector <void *> children;
//vector <void *> parents;
- Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> fProp_matrix;
- Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> bProp_matrix;
+ Eigen::Matrix<user_data_t,Eigen::Dynamic,Eigen::Dynamic> fProp_matrix;
+ Eigen::Matrix<user_data_t,Eigen::Dynamic,Eigen::Dynamic> bProp_matrix;
int minibatch_size;
public:
@@ -44,11 +44,11 @@ class Node {
void resize() { resize(minibatch_size); }
/*
- void Fprop(Matrix<double,Dynamic,Dynamic> & input,int n_cols)
+ void Fprop(Matrix<user_data_t,Dynamic,Dynamic> & input,int n_cols)
{
param->fProp(input,fProp_matrix,0,0,n_cols);
}
- void Fprop(Matrix<double,1,Dynamic> & input,int n_cols)
+ void Fprop(Matrix<user_data_t,1,Dynamic> & input,int n_cols)
{
param->fProp(input,fProp_matrix,0,0,n_cols);
}
diff --git a/src/model.cpp b/src/model.cpp
index db7f006..9a82faf 100644
--- a/src/model.cpp
+++ b/src/model.cpp
@@ -46,10 +46,10 @@ void model::resize(int ngram_size,
void model::initialize(boost::random::mt19937 &init_engine,
bool init_normal,
- double init_range,
- double init_bias,
+ user_data_t init_range,
+ user_data_t init_bias,
string &parameter_update,
- double adagrad_epsilon)
+ user_data_t adagrad_epsilon)
{
input_layer.initialize(init_engine,
init_normal,
@@ -79,7 +79,7 @@ void model::premultiply()
// Since input and first_hidden_linear are both linear,
// we can multiply them into a single linear layer *if* we are not training
int context_size = ngram_size-1;
- Matrix<double,Dynamic,Dynamic> U = first_hidden_linear.U;
+ Matrix<user_data_t,Dynamic,Dynamic> U = first_hidden_linear.U;
if (num_hidden == 0)
{
first_hidden_linear.U.resize(output_embedding_dimension, input_vocab_size * context_size);
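
The premultiply step above is plain composition of linear maps: if the input layer computes E*x and the first hidden layer computes U*(E*x), then (U*E)*x gives the same result, so once training is finished the product can be stored and the two multiplications collapse into a single lookup per context position; the resize to output_embedding_dimension x (input_vocab_size * context_size) visible above is consistent with storing that product in the num_hidden == 0 case.
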
diff --git a/src/model.h b/src/model.h
index 3cce06a..4eaf5f5 100644
--- a/src/model.h
+++ b/src/model.h
@@ -20,7 +20,7 @@ public:
Linear_layer second_hidden_linear;
Activation_function second_hidden_activation;
Output_word_embeddings output_layer;
- Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> output_embedding_matrix,
+ Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor> output_embedding_matrix,
input_embedding_matrix,
input_and_output_embedding_matrix;
@@ -37,13 +37,13 @@ public:
bool share_embeddings)
{
if (share_embeddings){
- input_and_output_embedding_matrix = Matrix<double,Dynamic,Dynamic,Eigen::RowMajor>();
+ input_and_output_embedding_matrix = Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor>();
input_layer.set_W(&input_and_output_embedding_matrix);
output_layer.set_W(&input_and_output_embedding_matrix);
}
else {
- input_embedding_matrix = Matrix<double,Dynamic,Dynamic,Eigen::RowMajor>();
- output_embedding_matrix = Matrix<double,Dynamic,Dynamic,Eigen::RowMajor>();
+ input_embedding_matrix = Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor>();
+ output_embedding_matrix = Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor>();
input_layer.set_W(&input_embedding_matrix);
output_layer.set_W(&output_embedding_matrix);
}
@@ -57,8 +57,8 @@ public:
model() : ngram_size(1),
premultiplied(false),
activation_function(Rectifier),
- output_embedding_matrix(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor>()),
- input_embedding_matrix(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor>())
+ output_embedding_matrix(Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor>()),
+ input_embedding_matrix(Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor>())
{
output_layer.set_W(&output_embedding_matrix);
input_layer.set_W(&input_embedding_matrix);
@@ -73,10 +73,10 @@ public:
void initialize(boost::random::mt19937 &init_engine,
bool init_normal,
- double init_range,
- double init_bias,
+ user_data_t init_range,
+ user_data_t init_bias,
string &parameter_udpate,
- double adagrad_epsilon);
+ user_data_t adagrad_epsilon);
void set_activation_function(activation_function_type f)
{
diff --git a/src/neuralClasses.h b/src/neuralClasses.h
index 9ead99f..182db0d 100644
--- a/src/neuralClasses.h
+++ b/src/neuralClasses.h
@@ -34,7 +34,7 @@ using Eigen::Dynamic;
typedef boost::unordered_map<int,bool> int_map;
struct Clipper{
- double operator() (double x) const {
+ user_data_t operator() (user_data_t x) const {
return std::min(0.5, std::max(x,-0.5));
//return(x);
}
@@ -44,17 +44,17 @@ struct Clipper{
class Linear_layer
{
private:
- Matrix<double,Dynamic,Dynamic> U;
- Matrix<double,Dynamic,Dynamic> U_gradient;
- Matrix<double,Dynamic,Dynamic> U_velocity;
- Matrix<double,Dynamic,Dynamic> U_running_gradient;
- Matrix<double,Dynamic,Dynamic> U_running_parameter_update;
+ Matrix<user_data_t,Dynamic,Dynamic> U;
+ Matrix<user_data_t,Dynamic,Dynamic> U_gradient;
+ Matrix<user_data_t,Dynamic,Dynamic> U_velocity;
+ Matrix<user_data_t,Dynamic,Dynamic> U_running_gradient;
+ Matrix<user_data_t,Dynamic,Dynamic> U_running_parameter_update;
// Biases
- Matrix<double,Dynamic,1> b;
- Matrix<double,Dynamic,1> b_velocity;
- Matrix<double,Dynamic,1> b_running_gradient;
- Matrix<double,Dynamic,1> b_running_parameter_update;
- Matrix<double,Dynamic,1> b_gradient;
+ Matrix<user_data_t,Dynamic,1> b;
+ Matrix<user_data_t,Dynamic,1> b_velocity;
+ Matrix<user_data_t,Dynamic,1> b_running_gradient;
+ Matrix<user_data_t,Dynamic,1> b_running_parameter_update;
+ Matrix<user_data_t,Dynamic,1> b_gradient;
friend class model;
@@ -84,13 +84,13 @@ class Linear_layer
template <typename Engine>
void initialize(Engine &engine,
bool init_normal,
- double init_range,
+ user_data_t init_range,
string &parameter_update,
- double adagrad_epsilon)
+ user_data_t adagrad_epsilon)
{
if (parameter_update == "ADA") {
- U_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(U.rows(),U.cols())*adagrad_epsilon;
- b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon;
+ U_running_gradient = Matrix<user_data_t,Dynamic,Dynamic>::Ones(U.rows(),U.cols())*adagrad_epsilon;
+ b_running_gradient = Matrix<user_data_t,Dynamic,1>::Ones(b.size())*adagrad_epsilon;
}
if (parameter_update == "ADAD") {
U_running_gradient.setZero(U.rows(),U.cols());
@@ -147,7 +147,7 @@ class Linear_layer
template <typename DerivedGOut, typename DerivedIn>
void computeGradient( const MatrixBase<DerivedGOut> &bProp_input,
const MatrixBase<DerivedIn> &fProp_input,
- double learning_rate, double momentum, double L2_reg)
+ user_data_t learning_rate, user_data_t momentum, user_data_t L2_reg)
{
U_gradient.noalias() = bProp_input*fProp_input.transpose();
@@ -185,8 +185,8 @@ class Linear_layer
template <typename DerivedGOut, typename DerivedIn>
void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input,
const MatrixBase<DerivedIn> &fProp_input,
- double learning_rate,
- double L2_reg)
+ user_data_t learning_rate,
+ user_data_t L2_reg)
{
U_gradient.noalias() = bProp_input*fProp_input.transpose();
@@ -224,15 +224,15 @@ class Linear_layer
template <typename DerivedGOut, typename DerivedIn>
void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input,
const MatrixBase<DerivedIn> &fProp_input,
- double learning_rate,
- double L2_reg,
- double conditioning_constant,
- double decay)
+ user_data_t learning_rate,
+ user_data_t L2_reg,
+ user_data_t conditioning_constant,
+ user_data_t decay)
{
//cerr<<"decay is "<<decay<<" and conditioning constant is "<<conditioning_constant<<endl;
U_gradient.noalias() = bProp_input*fProp_input.transpose();
- Array<double,Dynamic,1> b_current_parameter_update;
+ Array<user_data_t,Dynamic,1> b_current_parameter_update;
// get the bias gradient for all dimensions in parallel
int size = b.size();
@@ -248,7 +248,7 @@ class Linear_layer
#pragma omp parallel for
//cerr<<"U gradient is "<<U_gradient<<endl;
for (int col=0; col<U.cols(); col++) {
- Array<double,Dynamic,1> U_current_parameter_update;
+ Array<user_data_t,Dynamic,1> U_current_parameter_update;
U_running_gradient.col(col) = decay*U_running_gradient.col(col) +
(1-decay)*U_gradient.col(col).array().square().matrix();
//cerr<<"U running gradient is "<<U_running_gradient.col(col)<<endl;
@@ -288,18 +288,18 @@ class Output_word_embeddings
{
private:
// row-major is better for uscgemm
- //Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W;
+ //Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor> W;
// Having W be a pointer to a matrix allows ease of sharing
// input and output word embeddings
- Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W;
- std::vector<double> W_data;
- Matrix<double,Dynamic,1> b;
- Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient;
- Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient;
- Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update;
- Matrix<double,Dynamic,1> b_running_gradient;
- Matrix<double,Dynamic,1> b_gradient;
- Matrix<double,Dynamic,1> b_running_parameter_update;
+ Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor> *W;
+ std::vector<user_data_t> W_data;
+ Matrix<user_data_t,Dynamic,1> b;
+ Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient;
+ Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor> W_gradient;
+ Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update;
+ Matrix<user_data_t,Dynamic,1> b_running_gradient;
+ Matrix<user_data_t,Dynamic,1> b_gradient;
+ Matrix<user_data_t,Dynamic,1> b_running_parameter_update;
public:
Output_word_embeddings() { }
@@ -310,7 +310,7 @@ class Output_word_embeddings
W->setZero(rows, cols);
b.setZero(rows);
}
- void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) {
+ void set_W(Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor> *input_W) {
W = input_W;
}
void read_weights(std::ifstream &W_file) { readMatrix(W_file, *W); }
@@ -321,17 +321,17 @@ class Output_word_embeddings
template <typename Engine>
void initialize(Engine &engine,
bool init_normal,
- double init_range,
- double init_bias,
+ user_data_t init_range,
+ user_data_t init_bias,
string &parameter_update,
- double adagrad_epsilon)
+ user_data_t adagrad_epsilon)
{
W_gradient.setZero(W->rows(),W->cols());
b_gradient.setZero(b.size());
if (parameter_update == "ADA") {
- W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon;
- b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon;
+ W_running_gradient = Matrix<user_data_t,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon;
+ b_running_gradient = Matrix<user_data_t,Dynamic,1>::Ones(b.size())*adagrad_epsilon;
//W_gradient.setZero(W->rows(),W->cols());
//b_gradient.setZero(b.size());
}
@@ -359,9 +359,9 @@ class Output_word_embeddings
my_output = ((*W) * input).colwise() + b;
/* TODO: without EIGEN_NO_DEBUG - is this a bug?
ProductBase.h:102: Eigen::ProductBase<Derived, Lhs, Rhs>::ProductBase(const Lhs&
- , const Rhs&) [with Derived = Eigen::GeneralProduct<Eigen::Matrix<double, -1, -1
- , 1>, Eigen::Matrix<double, -1, -1>, 5>; Lhs = Eigen::Matrix<double, -1, -1, 1>;
- Rhs = Eigen::Matrix<double, -1, -1>]: Assertion `a_lhs.cols() == a_rhs.rows() &
+ , const Rhs&) [with Derived = Eigen::GeneralProduct<Eigen::Matrix<user_data_t, -1, -1
+ , 1>, Eigen::Matrix<user_data_t, -1, -1>, 5>; Lhs = Eigen::Matrix<user_data_t, -1, -1, 1>;
+ Rhs = Eigen::Matrix<user_data_t, -1, -1>]: Assertion `a_lhs.cols() == a_rhs.rows() &
& "invalid matrix product" && "if you wanted a coeff-wise or a dot product use t
he respective explicit functions"' failed.
@@ -394,14 +394,14 @@ class Output_word_embeddings
my_output(sample_id, instance_id) = b(samples(sample_id, instance_id));
}
}
- USCMatrix<double> sparse_output(W->rows(), samples, my_output);
+ USCMatrix<user_data_t> sparse_output(W->rows(), samples, my_output);
uscgemm_masked(1.0, *W, input, sparse_output);
my_output = sparse_output.values; // too bad, so much copying
}
// Return single element of output matrix
template <typename DerivedIn>
- double fProp(const MatrixBase<DerivedIn> &input,
+ user_data_t fProp(const MatrixBase<DerivedIn> &input,
int word,
int instance) const
{
@@ -425,8 +425,8 @@ class Output_word_embeddings
template <typename DerivedIn, typename DerivedGOut>
void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings,
const MatrixBase<DerivedGOut> &bProp_input,
- double learning_rate,
- double momentum) //not sure if we want to use momentum here
+ user_data_t learning_rate,
+ user_data_t momentum) //not sure if we want to use momentum here
{
// W is vocab_size x output_embedding_dimension
// b is vocab_size x 1
@@ -451,7 +451,7 @@ class Output_word_embeddings
void computeGradientAdagrad(
const MatrixBase<DerivedIn> &predicted_embeddings,
const MatrixBase<DerivedGOut> &bProp_input,
- double learning_rate) //not sure if we want to use momentum here
+ user_data_t learning_rate) //not sure if we want to use momentum here
{
// W is vocab_size x output_embedding_dimension
// b is vocab_size x 1
@@ -475,16 +475,16 @@ class Output_word_embeddings
template <typename DerivedIn, typename DerivedGOut>
void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings,
const MatrixBase<DerivedGOut> &bProp_input,
- double learning_rate,
- double conditioning_constant,
- double decay) //not sure if we want to use momentum here
+ user_data_t learning_rate,
+ user_data_t conditioning_constant,
+ user_data_t decay) //not sure if we want to use momentum here
{
// W is vocab_size x output_embedding_dimension
// b is vocab_size x 1
// predicted_embeddings is output_embedding_dimension x minibatch_size
// bProp_input is vocab_size x minibatch_size
- Array<double,Dynamic,Dynamic> W_current_parameter_update;
- Array<double,Dynamic,1> b_current_parameter_update;
+ Array<user_data_t,Dynamic,Dynamic> W_current_parameter_update;
+ Array<user_data_t,Dynamic,1> b_current_parameter_update;
W_gradient.setZero(W->rows(), W->cols());
b_gradient.setZero(b.size());
W_gradient.noalias() = bProp_input * predicted_embeddings.transpose();
@@ -519,7 +519,7 @@ class Output_word_embeddings
my_bProp_matrix.setZero();
uscgemm(1.0,
W->transpose(),
- USCMatrix<double>(W->rows(), samples, weights),
+ USCMatrix<user_data_t>(W->rows(), samples, weights),
my_bProp_matrix.leftCols(samples.cols())); // narrow bProp_matrix for possible short minibatch
}
@@ -527,29 +527,29 @@ class Output_word_embeddings
void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings,
const MatrixBase<DerivedGOutI> &samples,
const MatrixBase<DerivedGOutV> &weights,
- double learning_rate, double momentum) //not sure if we want to use momentum here
+ user_data_t learning_rate, user_data_t momentum) //not sure if we want to use momentum here
{
//cerr<<"in gradient"<<endl;
- USCMatrix<double> gradient_output(W->rows(), samples, weights);
+ USCMatrix<user_data_t> gradient_output(W->rows(), samples, weights);
uscgemm(learning_rate,
gradient_output,
predicted_embeddings.leftCols(gradient_output.cols()).transpose(),
*W); // narrow predicted_embeddings for possible short minibatch
uscgemv(learning_rate,
gradient_output,
- Matrix<double,Dynamic,1>::Ones(gradient_output.cols()),
+ Matrix<user_data_t,Dynamic,1>::Ones(gradient_output.cols()),
b);
/*
//IN ORDER TO IMPLEMENT CLIPPING, WE HAVE TO COMPUTE THE GRADIENT
//FIRST
- USCMatrix<double> gradient_output(W->rows(), samples, weights);
+ USCMatrix<user_data_t> gradient_output(W->rows(), samples, weights);
uscgemm(1.0,
gradient_output,
predicted_embeddings.leftCols(samples.cols()).transpose(),
W_gradient);
uscgemv(1.0,
gradient_output,
- Matrix<double,Dynamic,1>::Ones(weights.cols()),
+ Matrix<user_data_t,Dynamic,1>::Ones(weights.cols()),
b_gradient);
int_map update_map; //stores all the parameters that have been updated
@@ -571,7 +571,7 @@ class Output_word_embeddings
//b(update_item) += learning_rate * b_gradient(update_item);
//UPDATE CLIPPING
W->row(update_item) += (learning_rate * W_gradient.row(update_item)).array().unaryExpr(Clipper()).matrix();
- double update = learning_rate * b_gradient(update_item);
+ user_data_t update = learning_rate * b_gradient(update_item);
b(update_item) += std::min(0.5, std::max(update,-0.5));
//GRADIENT CLIPPING
W_gradient.row(update_item).setZero();
@@ -585,19 +585,19 @@ class Output_word_embeddings
void computeGradientAdagrad(const MatrixBase<DerivedIn> &predicted_embeddings,
const MatrixBase<DerivedGOutI> &samples,
const MatrixBase<DerivedGOutV> &weights,
- double learning_rate) //not sure if we want to use momentum here
+ user_data_t learning_rate) //not sure if we want to use momentum here
{
//W_gradient.setZero(W->rows(), W->cols());
//b_gradient.setZero(b.size());
//FOR CLIPPING, WE DO NOT MULTIPLY THE GRADIENT WITH THE LEARNING RATE
- USCMatrix<double> gradient_output(W->rows(), samples, weights);
+ USCMatrix<user_data_t> gradient_output(W->rows(), samples, weights);
uscgemm(1.0,
gradient_output,
predicted_embeddings.leftCols(samples.cols()).transpose(),
W_gradient);
uscgemv(1.0,
gradient_output,
- Matrix<double,Dynamic,1>::Ones(weights.cols()),
+ Matrix<user_data_t,Dynamic,1>::Ones(weights.cols()),
b_gradient);
int_map update_map; //stores all the parameters that have been updated
@@ -622,7 +622,7 @@ class Output_word_embeddings
/*
//UPDATE CLIPPING
W->row(update_item) += (learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt())).unaryExpr(Clipper()).matrix();
- double update = learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item));
+ user_data_t update = learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item));
b(update_item) += Clipper(update);//std::min(0.5, std::max(update,-0.5));
*/
W_gradient.row(update_item).setZero();
@@ -634,22 +634,22 @@ class Output_word_embeddings
void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings,
const MatrixBase<DerivedGOutI> &samples,
const MatrixBase<DerivedGOutV> &weights,
- double learning_rate,
- double conditioning_constant,
- double decay) //not sure if we want to use momentum here
+ user_data_t learning_rate,
+ user_data_t conditioning_constant,
+ user_data_t decay) //not sure if we want to use momentum here
{
//cerr<<"decay is "<<decay<<" and constant is "<<conditioning_constant<<endl;
//W_gradient.setZero(W->rows(), W->cols());
//b_gradient.setZero(b.size());
- USCMatrix<double> gradient_output(W->rows(), samples, weights);
+ USCMatrix<user_data_t> gradient_output(W->rows(), samples, weights);
uscgemm(1.0,
gradient_output,
predicted_embeddings.leftCols(samples.cols()).transpose(),
W_gradient);
uscgemv(1.0,
gradient_output,
- Matrix<double,Dynamic,1>::Ones(weights.cols()),
+ Matrix<user_data_t,Dynamic,1>::Ones(weights.cols()),
b_gradient);
int_map update_map; //stores all the parameters that have been updated
@@ -666,8 +666,8 @@ class Output_word_embeddings
#pragma omp parallel for
for (int item_id=0; item_id<num_items; item_id++)
{
- Array<double,1,Dynamic> W_current_parameter_update;
- double b_current_parameter_update;
+ Array<user_data_t,1,Dynamic> W_current_parameter_update;
+ user_data_t b_current_parameter_update;
int update_item = update_items[item_id];
W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+
@@ -715,24 +715,24 @@ class Output_word_embeddings
UNCONST(DerivedGb, gradient_b, my_gradient_b);
my_gradient_W.setZero();
my_gradient_b.setZero();
- USCMatrix<double> gradient_output(W->rows(), samples, weights);
+ USCMatrix<user_data_t> gradient_output(W->rows(), samples, weights);
uscgemm(1.0,
gradient_output,
predicted_embeddings.leftCols(samples.cols()).transpose(),
my_gradient_W);
uscgemv(1.0, gradient_output,
- Matrix<double,Dynamic,1>::Ones(weights.cols()), my_gradient_b);
+ Matrix<user_data_t,Dynamic,1>::Ones(weights.cols()), my_gradient_b);
}
};
class Input_word_embeddings
{
private:
- Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W;
+ Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor> *W;
int context_size, vocab_size;
- Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient;
- Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update;
- Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient;
+ Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient;
+ Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update;
+ Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor> W_gradient;
friend class model;
@@ -740,7 +740,7 @@ class Input_word_embeddings
Input_word_embeddings() : context_size(0), vocab_size(0) { }
Input_word_embeddings(int rows, int cols, int context) { resize(rows, cols, context); }
- void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) {
+ void set_W(Matrix<user_data_t,Dynamic,Dynamic,Eigen::RowMajor> *input_W) {
W = input_W;
}
@@ -762,14 +762,14 @@ class Input_word_embeddings
template <typename Engine>
void initialize(Engine &engine,
bool init_normal,
- double init_range,
+ user_data_t init_range,
string &parameter_update,
- double adagrad_epsilon)
+ user_data_t adagrad_epsilon)
{
W_gradient.setZero(W->rows(),W->cols());
if (parameter_update == "ADA") {
- W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon;
+ W_running_gradient = Matrix<user_data_t,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon;
//W_gradient.setZero(W->rows(),W->cols());
}
if (parameter_update == "ADAD") {
@@ -820,7 +820,7 @@ class Input_word_embeddings
// so narrow output to match
uscgemm(1.0,
W->transpose(),
- USCMatrix<double>(W->rows(),input.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input.cols())),
+ USCMatrix<user_data_t>(W->rows(),input.middleRows(ngram, 1),Matrix<user_data_t,1,Dynamic>::Ones(input.cols())),
my_output.block(ngram*embedding_dimension, 0, embedding_dimension, input.cols()));
}
}
@@ -840,7 +840,7 @@ class Input_word_embeddings
template <typename DerivedGOut, typename DerivedIn>
void computeGradient(const MatrixBase<DerivedGOut> &bProp_input,
const MatrixBase<DerivedIn> &input_words,
- double learning_rate, double momentum, double L2_reg)
+ user_data_t learning_rate, user_data_t momentum, user_data_t L2_reg)
{
int embedding_dimension = W->cols();
@@ -857,7 +857,7 @@ class Input_word_embeddings
for (int ngram=0; ngram<context_size; ngram++)
{
uscgemm(learning_rate,
- USCMatrix<double>(W->rows(), input_words.middleRows(ngram, 1), Matrix<double,1,Dynamic>::Ones(input_words.cols())),
+ USCMatrix<user_data_t>(W->rows(), input_words.middleRows(ngram, 1), Matrix<user_data_t,1,Dynamic>::Ones(input_words.cols())),
bProp_input.block(ngram*embedding_dimension,0,embedding_dimension,input_words.cols()).transpose(),
*W);
}
@@ -869,7 +869,7 @@ class Input_word_embeddings
for (int ngram=0; ngram<context_size; ngram++)
{
uscgemm(1.0,
- USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
+ USCMatrix<user_data_t>(W->rows(),input_words.middleRows(ngram, 1),Matrix<user_data_t,1,Dynamic>::Ones(input_words.cols())),
bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
W_gradient);
}
@@ -909,8 +909,8 @@ class Input_word_embeddings
template <typename DerivedGOut, typename DerivedIn>
void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input,
const MatrixBase<DerivedIn> &input_words,
- double learning_rate,
- double L2_reg)
+ user_data_t learning_rate,
+ user_data_t L2_reg)
{
int embedding_dimension = W->cols();
//W_gradient.setZero(W->rows(), W->cols());
@@ -921,7 +921,7 @@ class Input_word_embeddings
for (int ngram=0; ngram<context_size; ngram++)
{
uscgemm(1.0,
- USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
+ USCMatrix<user_data_t>(W->rows(),input_words.middleRows(ngram, 1),Matrix<user_data_t,1,Dynamic>::Ones(input_words.cols())),
bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
W_gradient);
}
@@ -962,10 +962,10 @@ class Input_word_embeddings
template <typename DerivedGOut, typename DerivedIn>
void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input,
const MatrixBase<DerivedIn> &input_words,
- double learning_rate,
- double L2_reg,
- double conditioning_constant,
- double decay)
+ user_data_t learning_rate,
+ user_data_t L2_reg,
+ user_data_t conditioning_constant,
+ user_data_t decay)
{
int embedding_dimension = W->cols();
@@ -977,7 +977,7 @@ class Input_word_embeddings
for (int ngram=0; ngram<context_size; ngram++)
{
uscgemm(1.0,
- USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
+ USCMatrix<user_data_t>(W->rows(),input_words.middleRows(ngram, 1),Matrix<user_data_t,1,Dynamic>::Ones(input_words.cols())),
bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
W_gradient);
}
@@ -1002,7 +1002,7 @@ class Input_word_embeddings
for (int item_id=0; item_id<num_items; item_id++)
{
- Array<double,1,Dynamic> W_current_parameter_update;
+ Array<user_data_t,1,Dynamic> W_current_parameter_update;
int update_item = update_items[item_id];
W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+
(1.-decay)*W_gradient.row(update_item).array().square().matrix();
@@ -1035,7 +1035,7 @@ class Input_word_embeddings
my_gradient.setZero();
for (int ngram=0; ngram<context_size; ngram++)
uscgemm(1.0,
- USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
+ USCMatrix<user_data_t>(W->rows(),input_words.middleRows(ngram, 1),Matrix<user_data_t,1,Dynamic>::Ones(input_words.cols())),
bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
my_gradient);
}
diff --git a/src/neuralLM.h b/src/neuralLM.h
index f0eebd8..a9a0dfe 100644
--- a/src/neuralLM.h
+++ b/src/neuralLM.h
@@ -80,7 +80,7 @@ class neuralLM : public neuralNetwork, graehl::replace_digits
return vocab->lookup_word(slice);
}
- double lookup_ngram(const int *ngram_a, int n)
+ user_data_t lookup_ngram(const int *ngram_a, int n)
{
Eigen::Matrix<int,Eigen::Dynamic,1> ngram(m->ngram_size);
for (int i=0; i<m->ngram_size; ++i)
@@ -100,13 +100,13 @@ class neuralLM : public neuralNetwork, graehl::replace_digits
return neuralNetwork::lookup_ngram(ngram);
}
- double lookup_ngram(const std::vector<int> &ngram_v)
+ user_data_t lookup_ngram(const std::vector<int> &ngram_v)
{
return lookup_ngram(ngram_v.data(), ngram_v.size());
}
template <typename Derived>
- double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram)
+ user_data_t lookup_ngram(const Eigen::MatrixBase<Derived> &ngram)
{
return neuralNetwork::lookup_ngram(ngram);
}
diff --git a/src/neuralNetwork.h b/src/neuralNetwork.h
index 6386a0f..c5d76dc 100644
--- a/src/neuralNetwork.h
+++ b/src/neuralNetwork.h
@@ -20,13 +20,13 @@ class neuralNetwork
private:
bool normalization;
- double weight;
+ user_data_t weight;
propagator prop;
std::size_t cache_size;
Eigen::Matrix<int,Dynamic,Dynamic> cache_keys;
- std::vector<double> cache_values;
+ std::vector<user_data_t> cache_values;
int cache_lookups, cache_hits;
public:
@@ -40,7 +40,7 @@ class neuralNetwork
}
void set_normalization(bool value) { normalization = value; }
- void set_log_base(double value) { weight = 1./std::log(value); }
+ void set_log_base(user_data_t value) { weight = 1./std::log(value); }
// This must be called if the underlying model is resized.
void resize() {
@@ -58,7 +58,7 @@ class neuralNetwork
}
template <typename Derived>
- double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram)
+ user_data_t lookup_ngram(const Eigen::MatrixBase<Derived> &ngram)
{
assert (ngram.rows() == m->ngram_size);
assert (ngram.cols() == 1);
@@ -90,17 +90,17 @@ class neuralNetwork
prop.fProp(ngram.col(0));
int output = ngram(m->ngram_size-1, 0);
- double log_prob;
+ user_data_t log_prob;
start_timer(3);
if (normalization)
{
- Eigen::Matrix<double,Eigen::Dynamic,1> scores(m->output_vocab_size);
+ Eigen::Matrix<user_data_t,Eigen::Dynamic,1> scores(m->output_vocab_size);
if (prop.skip_hidden)
prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores);
else
prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
- double logz = logsum(scores.col(0));
+ user_data_t logz = logsum(scores.col(0));
log_prob = weight * (scores(output, 0) - logz);
}
else
@@ -140,15 +140,15 @@ class neuralNetwork
if (normalization)
{
- Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> scores(m->output_vocab_size, ngram.cols());
+ Eigen::Matrix<user_data_t,Eigen::Dynamic,Eigen::Dynamic> scores(m->output_vocab_size, ngram.cols());
if (prop.skip_hidden)
prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores);
else
prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
// And softmax and loss
- Matrix<double,Dynamic,Dynamic> output_probs(m->output_vocab_size, ngram.cols());
- double minibatch_log_likelihood;
+ Matrix<user_data_t,Dynamic,Dynamic> output_probs(m->output_vocab_size, ngram.cols());
+ user_data_t minibatch_log_likelihood;
SoftmaxLogLoss().fProp(scores.leftCols(ngram.cols()), ngram.row(m->ngram_size-1), output_probs, minibatch_log_likelihood);
for (int j=0; j<ngram.cols(); j++)
{
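
For reference, set_log_base(b) above stores weight = 1/ln(b), and the normalized score is weight * (score - logz) = ln p / ln b, i.e. the log-probability converted to base b (with base 10, weight is roughly 0.4343).
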
diff --git a/src/neuralTM.h b/src/neuralTM.h
index 9bb6d16..9482161 100644
--- a/src/neuralTM.h
+++ b/src/neuralTM.h
@@ -91,7 +91,7 @@ class neuralTM : public neuralNetwork, graehl::replace_digits
return lookup_word(word, *output_vocab);
}
- double lookup_ngram(const int *ngram_a, int n)
+ user_data_t lookup_ngram(const int *ngram_a, int n)
{
Eigen::Matrix<int,Eigen::Dynamic,1> ngram(m->ngram_size);
for (int i=0; i<m->ngram_size; i++)
@@ -111,13 +111,13 @@ class neuralTM : public neuralNetwork, graehl::replace_digits
return neuralNetwork::lookup_ngram(ngram);
}
- double lookup_ngram(const std::vector<int> &ngram_v)
+ user_data_t lookup_ngram(const std::vector<int> &ngram_v)
{
return lookup_ngram(ngram_v.data(), ngram_v.size());
}
template <typename Derived>
- double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram)
+ user_data_t lookup_ngram(const Eigen::MatrixBase<Derived> &ngram)
{
return neuralNetwork::lookup_ngram(ngram);
}
diff --git a/src/param.h b/src/param.h
index 9ec486b..fe1b6d6 100644
--- a/src/param.h
+++ b/src/param.h
@@ -3,6 +3,12 @@
#include <string>
+#ifdef NPLM_DOUBLE_PRECISION
+typedef double user_data_t;
+#else
+typedef float user_data_t;
+#endif
+
namespace nplm
{
@@ -35,26 +41,26 @@ struct param
int minibatch_size;
int validation_minibatch_size;
int num_epochs;
- double learning_rate;
- double conditioning_constant;
- double decay;
- double adagrad_epsilon;
+ user_data_t learning_rate;
+ user_data_t conditioning_constant;
+ user_data_t decay;
+ user_data_t adagrad_epsilon;
bool init_normal;
- double init_range;
+ user_data_t init_range;
int num_noise_samples;
bool use_momentum;
- double initial_momentum;
- double final_momentum;
+ user_data_t initial_momentum;
+ user_data_t final_momentum;
- double L2_reg;
+ user_data_t L2_reg;
double input_dropout;
int null_index;
bool normalization;
- double normalization_init;
+ user_data_t normalization_init;
int num_threads;
int debug;
diff --git a/src/propagator.h b/src/propagator.h
index 6344f2f..45d4018 100644
--- a/src/propagator.h
+++ b/src/propagator.h
@@ -71,7 +71,7 @@ class propagator {
else
{
int n_inputs = first_hidden_linear_node.param->n_inputs();
- USCMatrix<double> sparse_data;
+ USCMatrix<user_data_t> sparse_data;
input_layer_node.param->munge(data, sparse_data);
start_timer(1);
@@ -101,12 +101,12 @@ class propagator {
template <typename DerivedIn, typename DerivedOut>
void bProp(const MatrixBase<DerivedIn> &data,
const MatrixBase<DerivedOut> &output,
- double learning_rate,
- double momentum,
- double L2_reg,
+ user_data_t learning_rate,
+ user_data_t momentum,
+ user_data_t L2_reg,
std::string &parameter_update,
- double conditioning_constant,
- double decay)
+ user_data_t conditioning_constant,
+ user_data_t decay)
{
// Output embedding layer
@@ -153,12 +153,12 @@ class propagator {
void bProp(const MatrixBase<DerivedIn> &data,
const MatrixBase<DerivedOutI> &samples,
const MatrixBase<DerivedOutV> &weights,
- double learning_rate,
- double momentum,
- double L2_reg,
+ user_data_t learning_rate,
+ user_data_t momentum,
+ user_data_t L2_reg,
std::string &parameter_update,
- double conditioning_constant,
- double decay)
+ user_data_t conditioning_constant,
+ user_data_t decay)
{
// Output embedding layer
@@ -210,10 +210,10 @@ class propagator {
private:
template <typename DerivedIn>
void bPropRest(const MatrixBase<DerivedIn> &data,
- double learning_rate, double momentum, double L2_reg,
+ user_data_t learning_rate, user_data_t momentum, user_data_t L2_reg,
std::string &parameter_update,
- double conditioning_constant,
- double decay)
+ user_data_t conditioning_constant,
+ user_data_t decay)
{
// Second hidden layer
diff --git a/src/testNeuralLM.cpp b/src/testNeuralLM.cpp
index abaab34..a2aa5e3 100644
--- a/src/testNeuralLM.cpp
+++ b/src/testNeuralLM.cpp
@@ -20,7 +20,7 @@ using namespace Eigen;
using namespace nplm;
void score(neuralLM &lm, int minibatch_size, vector<int>& start, vector< vector<int> > &ngrams,
- vector<double> &out) {
+ vector<user_data_t> &out) {
if (ngrams.size() == 0) return;
int ngram_size = ngrams[0].size();
@@ -29,7 +29,7 @@ void score(neuralLM &lm, int minibatch_size, vector<int>& start, vector< vector<
// Score one n-gram at a time. This is how the LM would be queried from a decoder.
for (int sent_id=0; sent_id<start.size()-1; sent_id++)
{
- double sent_log_prob = 0.0;
+ user_data_t sent_log_prob = 0.0;
for (int j=start[sent_id]; j<start[sent_id+1]; j++)
sent_log_prob += lm.lookup_ngram(ngrams[j]);
out.push_back(sent_log_prob);
@@ -38,7 +38,7 @@ void score(neuralLM &lm, int minibatch_size, vector<int>& start, vector< vector<
else
{
// Score a whole minibatch at a time.
- Matrix<double,1,Dynamic> log_probs(ngrams.size());
+ Matrix<user_data_t,1,Dynamic> log_probs(ngrams.size());
Matrix<int,Dynamic,Dynamic> minibatch(ngram_size, minibatch_size);
minibatch.setZero();
@@ -52,7 +52,7 @@ void score(neuralLM &lm, int minibatch_size, vector<int>& start, vector< vector<
for (int sent_id=0; sent_id<start.size()-1; sent_id++)
{
- double sent_log_prob = 0.0;
+ user_data_t sent_log_prob = 0.0;
for (int j=start[sent_id]; j<start[sent_id+1]; j++)
sent_log_prob += log_probs[j];
out.push_back(sent_log_prob);
@@ -157,7 +157,7 @@ int main (int argc, char *argv[])
start.push_back(ngrams.size());
int num_threads = 1;
- vector< vector<double> > sent_log_probs(num_threads);
+ vector< vector<user_data_t> > sent_log_probs(num_threads);
/*
// Test thread safety
@@ -169,7 +169,7 @@ int main (int argc, char *argv[])
*/
score(lm, minibatch_size, start, ngrams, sent_log_probs[0]);
- vector<double> log_likelihood(num_threads);
+ vector<user_data_t> log_likelihood(num_threads);
std::fill(log_likelihood.begin(), log_likelihood.end(), 0.0);
for (int i=0; i<sent_log_probs[0].size(); i++) {
for (int t=0; t<num_threads; t++)
diff --git a/src/testNeuralNetwork.cpp b/src/testNeuralNetwork.cpp
index 58b9165..87e4a65 100644
--- a/src/testNeuralNetwork.cpp
+++ b/src/testNeuralNetwork.cpp
@@ -98,10 +98,10 @@ int main (int argc, char *argv[])
int num_batches = (test_data_size-1)/myParam.minibatch_size + 1;
cerr<<"Number of test minibatches: "<<num_batches<<endl;
- double log_likelihood = 0.0;
+ user_data_t log_likelihood = 0.0;
- Matrix<double,Dynamic,Dynamic> scores(nn.output_vocab_size, myParam.minibatch_size);
- Matrix<double,Dynamic,Dynamic> output_probs(nn.output_vocab_size, myParam.minibatch_size);
+ Matrix<user_data_t,Dynamic,Dynamic> scores(nn.output_vocab_size, myParam.minibatch_size);
+ Matrix<user_data_t,Dynamic,Dynamic> output_probs(nn.output_vocab_size, myParam.minibatch_size);
for (int batch = 0; batch < num_batches; batch++)
{
@@ -122,7 +122,7 @@ int main (int argc, char *argv[])
// And softmax and loss
- double minibatch_log_likelihood;
+ user_data_t minibatch_log_likelihood;
SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size),
minibatch.row(myParam.ngram_size-1),
output_probs,
diff --git a/src/trainNeuralNetwork.cpp b/src/trainNeuralNetwork.cpp
index 43e87f1..632391a 100644
--- a/src/trainNeuralNetwork.cpp
+++ b/src/trainNeuralNetwork.cpp
@@ -40,7 +40,7 @@ using namespace boost::random;
using namespace nplm;
namespace ip = boost::interprocess;
-typedef unordered_map<Matrix<int,Dynamic,1>, double> vector_map;
+typedef unordered_map<Matrix<int,Dynamic,1>, user_data_t> vector_map;
typedef ip::allocator<int, ip::managed_mapped_file::segment_manager> intAllocator;
typedef ip::vector<int, intAllocator> vec;
@@ -50,12 +50,12 @@ typedef ip::allocator<vec, ip::managed_mapped_file::segment_manager> vecAllocato
typedef long long int data_size_t; // training data can easily exceed 2G instances
-void compute_validation_perplexity(int ngram_size, int output_vocab_size, int validation_minibatch_size, int validation_data_size, int num_validation_batches, param & myParam, propagator & prop_validation, Map< Matrix<int,Dynamic,Dynamic> > & validation_data, double & current_learning_rate, double & current_validation_ll)
+void compute_validation_perplexity(int ngram_size, int output_vocab_size, int validation_minibatch_size, int validation_data_size, int num_validation_batches, param & myParam, propagator & prop_validation, Map< Matrix<int,Dynamic,Dynamic> > & validation_data, user_data_t & current_learning_rate, user_data_t & current_validation_ll)
{
- double log_likelihood = 0.0;
+ user_data_t log_likelihood = 0.0;
- Matrix<double,Dynamic,Dynamic> scores(output_vocab_size, validation_minibatch_size);
- Matrix<double,Dynamic,Dynamic> output_probs(output_vocab_size, validation_minibatch_size);
+ Matrix<user_data_t,Dynamic,Dynamic> scores(output_vocab_size, validation_minibatch_size);
+ Matrix<user_data_t,Dynamic,Dynamic> output_probs(output_vocab_size, validation_minibatch_size);
Matrix<int,Dynamic,Dynamic> minibatch(ngram_size, validation_minibatch_size);
for (int validation_batch =0;validation_batch < num_validation_batches;validation_batch++)
@@ -76,7 +76,7 @@ void compute_validation_perplexity(int ngram_size, int output_vocab_size, int va
stop_timer(4);
// And softmax and loss. Be careful of short minibatch
- double minibatch_log_likelihood;
+ user_data_t minibatch_log_likelihood;
start_timer(5);
SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size),
minibatch.row(ngram_size-1),
@@ -113,11 +113,11 @@ int main(int argc, char** argv)
ValueArg<int> num_threads("", "num_threads", "Number of threads. Default: maximum.", false, 0, "int", cmd);
- ValueArg<double> final_momentum("", "final_momentum", "Final value of momentum. Default: 0.9.", false, 0.9, "double", cmd);
- ValueArg<double> initial_momentum("", "initial_momentum", "Initial value of momentum. Default: 0.9.", false, 0.9, "double", cmd);
+ ValueArg<user_data_t> final_momentum("", "final_momentum", "Final value of momentum. Default: 0.9.", false, 0.9, "user_data_t", cmd);
+ ValueArg<user_data_t> initial_momentum("", "initial_momentum", "Initial value of momentum. Default: 0.9.", false, 0.9, "user_data_t", cmd);
ValueArg<bool> use_momentum("", "use_momentum", "Use momentum (hidden layer weights only). 1 = yes, 0 = no. Default: 0.", false, 0, "bool", cmd);
- ValueArg<double> normalization_init("", "normalization_init", "Initial normalization parameter. Default: 0.", false, 0.0, "double", cmd);
+ ValueArg<user_data_t> normalization_init("", "normalization_init", "Initial normalization parameter. Default: 0.", false, 0.0, "user_data_t", cmd);
ValueArg<bool> normalization("", "normalization", "Learn individual normalization factors during training. 1 = yes, 0 = no. Default: 0.", false, 0, "bool", cmd);
ValueArg<bool> mmap_file("", "mmap_file", "Use memory mapped files. This is useful if the entire data cannot fit in memory. prepareNeuralLM can generate memory mapped files", false, 0, "bool", cmd);
@@ -126,24 +126,24 @@ int main(int argc, char** argv)
ValueArg<int> num_noise_samples("", "num_noise_samples", "Number of noise samples for noise-contrastive estimation. Default: 100.", false, 100, "int", cmd);
- ValueArg<double> L2_reg("", "L2_reg", "L2 regularization strength (hidden layer weights only). Default: 0.", false, 0.0, "double", cmd);
+ ValueArg<user_data_t> L2_reg("", "L2_reg", "L2 regularization strength (hidden layer weights only). Default: 0.", false, 0.0, "user_data_t", cmd);
- ValueArg<double> input_dropout("", "input_dropout", "Probability of retaining input word. Values between 0 (all input is ignored) to 1 (no dropout). Default: 1.", false, 1, "double", cmd);
+ ValueArg<double> input_dropout("", "input_dropout", "Probability of retaining input word. Values between 0 (all input is ignored) to 1 (no dropout). Default: 1.", false, 1, "user_data_t", cmd);
ValueArg<int> null_index("", "null_index", "Index of null word. Used as special (dropped out) token for input dropout.", false, 0, "int", cmd);
- ValueArg<double> learning_rate("", "learning_rate", "Learning rate for stochastic gradient ascent. Default: 1.", false, 1., "double", cmd);
+ ValueArg<user_data_t> learning_rate("", "learning_rate", "Learning rate for stochastic gradient ascent. Default: 1.", false, 1., "user_data_t", cmd);
- ValueArg<double> conditioning_constant("", "conditioning_constant", "Constant to condition the RMS of the expected square of the gradient in ADADELTA. Default: 10E-3.", false, 10E-3, "double", cmd);
+ ValueArg<user_data_t> conditioning_constant("", "conditioning_constant", "Constant to condition the RMS of the expected square of the gradient in ADADELTA. Default: 10E-3.", false, 10E-3, "user_data_t", cmd);
- ValueArg<double> decay("", "decay", "Decay for ADADELTA. Default: 0.95", false, 0.95, "double", cmd);
- ValueArg<double> adagrad_epsilon("", "adagrad_epsilon", "Constant to initialize the L2 squared norm of the gradients with.\
- Default: 10E-3", false, 10E-3, "double", cmd);
+ ValueArg<user_data_t> decay("", "decay", "Decay for ADADELTA. Default: 0.95", false, 0.95, "user_data_t", cmd);
+ ValueArg<user_data_t> adagrad_epsilon("", "adagrad_epsilon", "Constant to initialize the L2 squared norm of the gradients with.\
+ Default: 10E-3", false, 10E-3, "user_data_t", cmd);
ValueArg<int> validation_minibatch_size("", "validation_minibatch_size", "Minibatch size for validation. Default: 64.", false, 64, "int", cmd);
ValueArg<int> minibatch_size("", "minibatch_size", "Minibatch size (for training). Default: 1000.", false, 1000, "int", cmd);
ValueArg<int> num_epochs("", "num_epochs", "Number of epochs. Default: 10.", false, 10, "int", cmd);
- ValueArg<double> init_range("", "init_range", "Maximum (of uniform) or standard deviation (of normal) for initialization. Default: 0.01", false, 0.01, "double", cmd);
+ ValueArg<user_data_t> init_range("", "init_range", "Maximum (of uniform) or standard deviation (of normal) for initialization. Default: 0.01", false, 0.01, "user_data_t", cmd);
ValueArg<bool> init_normal("", "init_normal", "Initialize parameters from a normal distribution. 1 = normal, 0 = uniform. Default: 0.", false, 0, "bool", cmd);
ValueArg<string> loss_function("", "loss_function", "Loss function (log, nce). Default: nce.", false, "nce", "string", cmd);
@@ -553,10 +553,10 @@ int main(int argc, char** argv)
cerr<<"Number of validation minibatches: "<<num_validation_batches<<endl;
}
- double current_momentum = myParam.initial_momentum;
- double momentum_delta = (myParam.final_momentum - myParam.initial_momentum)/(myParam.num_epochs-1);
- double current_learning_rate = myParam.learning_rate;
- double current_validation_ll = 0.0;
+ user_data_t current_momentum = myParam.initial_momentum;
+ user_data_t momentum_delta = (myParam.final_momentum - myParam.initial_momentum)/(myParam.num_epochs-1);
+ user_data_t current_learning_rate = myParam.learning_rate;
+ user_data_t current_validation_ll = 0.0;
int ngram_size = myParam.ngram_size;
int input_vocab_size = myParam.input_vocab_size;
@@ -589,7 +589,7 @@ int main(int argc, char** argv)
cerr << "Training minibatches: ";
- double log_likelihood = 0.0;
+ user_data_t log_likelihood = 0.0;
int num_samples = 0;
if (loss_function == LogLoss)
@@ -597,10 +597,10 @@ int main(int argc, char** argv)
else if (loss_function == NCELoss)
num_samples = 1+num_noise_samples;
- Matrix<double,Dynamic,Dynamic> minibatch_weights(num_samples, minibatch_size);
+ Matrix<user_data_t,Dynamic,Dynamic> minibatch_weights(num_samples, minibatch_size);
Matrix<int,Dynamic,Dynamic> minibatch_samples(num_samples, minibatch_size);
- Matrix<double,Dynamic,Dynamic> scores(num_samples, minibatch_size);
- Matrix<double,Dynamic,Dynamic> probs(num_samples, minibatch_size);
+ Matrix<user_data_t,Dynamic,Dynamic> scores(num_samples, minibatch_size);
+ Matrix<user_data_t,Dynamic,Dynamic> probs(num_samples, minibatch_size);
for(data_size_t batch=0;batch<num_batches;batch++)
{
@@ -640,7 +640,7 @@ int main(int argc, char** argv)
minibatch = training_data.middleCols(minibatch_start_index, current_minibatch_size);
}
#endif
- double adjusted_learning_rate = current_learning_rate/minibatch_size;
+ user_data_t adjusted_learning_rate = current_learning_rate/minibatch_size;
//cerr<<"Adjusted learning rate: "<<adjusted_learning_rate<<endl;
/*
@@ -701,7 +701,7 @@ int main(int argc, char** argv)
}
}
- double minibatch_log_likelihood;
+ user_data_t minibatch_log_likelihood;
start_timer(5);
softmax_loss.fProp(scores.leftCols(current_minibatch_size),
minibatch_samples,
@@ -747,7 +747,7 @@ int main(int argc, char** argv)
prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
stop_timer(4);
- double minibatch_log_likelihood;
+ user_data_t minibatch_log_likelihood;
start_timer(5);
SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size),
minibatch.row(ngram_size-1),
diff --git a/src/util.cpp b/src/util.cpp
index f6a5779..825c1f6 100644
--- a/src/util.cpp
+++ b/src/util.cpp
@@ -147,7 +147,7 @@ void readDataFile(const string &filename, int &ngram_size, vector<int> &data, in
DATAIN.close();
}
-double logadd(double x, double y)
+user_data_t logadd(user_data_t x, user_data_t y)
{
if (x > y)
return x + log1p(std::exp(y-x));
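
The logadd above is the standard numerically stable log(exp(x) + exp(y)): subtracting the larger argument keeps the exp from overflowing, and log1p preserves precision when the remaining term is tiny; logsum in src/util.h below shifts by the maximum coefficient for the same reason. A self-contained sketch of the identity (not the file's elided remainder):

    #include <cmath>
    #include <utility>

    // stable log(exp(x) + exp(y)), mirroring the identity used above
    template <typename T>
    T logadd_sketch(T x, T y)
    {
        if (x < y) std::swap(x, y);              // make x the larger argument
        return x + std::log1p(std::exp(y - x));  // x + log(1 + exp(y - x))
    }
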
diff --git a/src/util.h b/src/util.h
index 04dfa3e..ee9e31c 100644
--- a/src/util.h
+++ b/src/util.h
@@ -20,6 +20,12 @@
#include "maybe_omp.h"
+#ifdef NPLM_DOUBLE_PRECISION
+typedef double user_data_t;
+#else
+typedef float user_data_t;
+#endif
+
// Make matrices hashable
namespace Eigen {
@@ -86,7 +92,7 @@ inline void intgerize(std::vector<std::string> &ngram,std::vector<int> &int_ngra
template <typename Derived>
void initMatrix(boost::random::mt19937 &engine,
const Eigen::MatrixBase<Derived> &p_const,
- bool init_normal, double range)
+ bool init_normal, user_data_t range)
{
UNCONST(Derived, p_const, p);
if (init_normal == 0)
@@ -105,7 +111,7 @@ void initMatrix(boost::random::mt19937 &engine,
else
// initialize with gaussian distribution with mean 0 and stdev range
{
- boost::random::normal_distribution<double> unif_normal(0., range);
+ boost::random::normal_distribution<user_data_t> unif_normal(0., range);
for (int i = 0; i < p.rows(); i++)
{
for (int j = 0; j < p.cols(); j++)
@@ -119,7 +125,7 @@ void initMatrix(boost::random::mt19937 &engine,
template <typename Derived>
void initBias(boost::random::mt19937 &engine,
const Eigen::MatrixBase<Derived> &p_const,
- bool init_normal, double range)
+ bool init_normal, user_data_t range)
{
UNCONST(Derived, p_const, p);
if (init_normal == 0)
@@ -135,7 +141,7 @@ void initBias(boost::random::mt19937 &engine,
else
// initialize with gaussian distribution with mean 0 and stdev range
{
- boost::random::normal_distribution<double> unif_normal(0., range);
+ boost::random::normal_distribution<user_data_t> unif_normal(0., range);
for (int i = 0; i < p.size(); i++)
{
p(i) = unif_normal(engine);
@@ -234,11 +240,11 @@ void writeMatrix(const Eigen::MatrixBase<Derived> &param, std::ofstream &OUT)
}
template <typename Derived>
-double logsum(const Eigen::MatrixBase<Derived> &v)
+user_data_t logsum(const Eigen::MatrixBase<Derived> &v)
{
int mi;
- double m = v.maxCoeff(&mi);
- double logz = 0.0;
+ user_data_t m = v.maxCoeff(&mi);
+ user_data_t logz = 0.0;
for (int i=0; i<v.rows(); i++)
if (i != mi)
logz += std::exp(v(i) - m);
@@ -246,7 +252,7 @@ double logsum(const Eigen::MatrixBase<Derived> &v)
return logz;
}
-double logadd(double x, double y);
+user_data_t logadd(user_data_t x, user_data_t y);
#ifdef USE_CHRONO
class Timer