github.com/moses-smt/nplm.git
author    Rico Sennrich <rico.sennrich@gmx.ch>  2014-11-17 14:36:36 +0300
committer Rico Sennrich <rico.sennrich@gmx.ch>  2014-11-17 14:48:51 +0300
commit    e34889509fa285cde357515469d9eb3c08c4c9f7
tree      e4434684f99b18f0bef302882e9546e090340085
parent    d58f43085e9fc6399baa8f794d872864c9697bfa
re-apply 64120 (support for networks with single hidden layer)
-rw-r--r--  src/model.cpp              | 27
-rw-r--r--  src/neuralNetwork.h        | 20
-rw-r--r--  src/propagator.h           | 51
-rw-r--r--  src/testNeuralNetwork.cpp  |  6
-rw-r--r--  src/trainNeuralNetwork.cpp | 16
5 files changed, 97 insertions(+), 23 deletions(-)
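
This commit re-applies support for networks with a single hidden layer: when num_hidden is 0, model::resize() gives the first hidden layer output_embedding_dimension rows so that it feeds the output word embeddings directly, the unused second hidden layer is shrunk to a 1x1 placeholder (keeping the existing class members valid at almost no memory cost), and a new skip_hidden flag lets forward and backward propagation bypass it. A minimal, self-contained Eigen sketch of the resulting forward pass follows; the toy dimensions and the tanh activation are assumptions, not nplm's actual classes.

    // Minimal sketch of the forward pass this commit enables (toy dimensions
    // and a tanh activation are assumptions here, not nplm's actual classes).
    #include <Eigen/Dense>
    #include <iostream>

    int main()
    {
        using Eigen::MatrixXd;
        using Eigen::VectorXd;

        const int context_size = 3;               // ngram_size - 1
        const int input_embedding_dimension = 4;
        const int output_embedding_dimension = 5;
        const int output_vocab_size = 10;
        const int num_hidden = 0;                  // 0 => single hidden layer

        // Concatenated input embeddings for one n-gram context.
        VectorXd x = VectorXd::Random(context_size * input_embedding_dimension);

        // First hidden layer: its width depends on num_hidden, as in model::resize().
        const int first_width = (num_hidden == 0) ? output_embedding_dimension : num_hidden;
        MatrixXd W1 = MatrixXd::Random(first_width, x.size());
        VectorXd h1 = (W1 * x).array().tanh().matrix();

        // The second hidden layer exists only when num_hidden > 0.
        VectorXd final_hidden;
        if (num_hidden == 0) {
            final_hidden = h1;                     // skip_hidden: output layer reads h1
        } else {
            MatrixXd W2 = MatrixXd::Random(output_embedding_dimension, num_hidden);
            final_hidden = (W2 * h1).array().tanh().matrix();
        }

        // Output word embeddings score the final hidden activation.
        MatrixXd D = MatrixXd::Random(output_vocab_size, output_embedding_dimension);
        VectorXd scores = D * final_hidden;
        std::cout << scores.transpose() << std::endl;
        return 0;
    }
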
diff --git a/src/model.cpp b/src/model.cpp
index 262490f..3767f4b 100644
--- a/src/model.cpp
+++ b/src/model.cpp
@@ -20,10 +20,20 @@ void model::resize(int ngram_size,
int output_embedding_dimension)
{
input_layer.resize(input_vocab_size, input_embedding_dimension, ngram_size-1);
- first_hidden_linear.resize(num_hidden, input_embedding_dimension*(ngram_size-1));
- first_hidden_activation.resize(num_hidden);
- second_hidden_linear.resize(output_embedding_dimension, num_hidden);
- second_hidden_activation.resize(output_embedding_dimension);
+ if (num_hidden == 0)
+ {
+ first_hidden_linear.resize(output_embedding_dimension, input_embedding_dimension*(ngram_size-1));
+ first_hidden_activation.resize(output_embedding_dimension);
+ second_hidden_linear.resize(1,1);
+ second_hidden_activation.resize(1);
+ }
+ else
+ {
+ first_hidden_linear.resize(num_hidden, input_embedding_dimension*(ngram_size-1));
+ first_hidden_activation.resize(num_hidden);
+ second_hidden_linear.resize(output_embedding_dimension, num_hidden);
+ second_hidden_activation.resize(output_embedding_dimension);
+ }
output_layer.resize(output_vocab_size, output_embedding_dimension);
this->ngram_size = ngram_size;
this->input_vocab_size = input_vocab_size;
@@ -70,7 +80,14 @@ void model::premultiply()
// we can multiply them into a single linear layer *if* we are not training
int context_size = ngram_size-1;
Matrix<double,Dynamic,Dynamic> U = first_hidden_linear.U;
- first_hidden_linear.U.resize(num_hidden, input_vocab_size * context_size);
+ if (num_hidden == 0)
+ {
+ first_hidden_linear.U.resize(output_embedding_dimension, input_vocab_size * context_size);
+ }
+ else
+ {
+ first_hidden_linear.U.resize(num_hidden, input_vocab_size * context_size);
+ }
for (int i=0; i<context_size; i++)
first_hidden_linear.U.middleCols(i*input_vocab_size, input_vocab_size) = U.middleCols(i*input_embedding_dimension, input_embedding_dimension) * input_layer.W->transpose();
input_layer.W->resize(1,1); // try to save some memory
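
The premultiply() change keeps the query-time folding trick working in the single-hidden-layer case: outside of training, the input embedding matrix is multiplied into the first hidden linear layer so that each context word selects columns of the merged matrix instead of going through a separate embedding lookup, and that merged matrix now has output_embedding_dimension rows instead of num_hidden when num_hidden is 0. A small sketch of the column-block arithmetic with toy sizes; the standalone setup is an assumption, while U, W and the loop mirror the hunk above.

    // For each context position i, the embedding-dimension column block of U
    // is multiplied by W^T, so at query time a one-hot context word indexes
    // columns of the merged matrix directly.
    #include <Eigen/Dense>

    int main()
    {
        using Eigen::MatrixXd;

        const int context_size = 2;
        const int input_vocab_size = 6;
        const int input_embedding_dimension = 3;
        const int rows = 5;  // output_embedding_dimension if num_hidden == 0, else num_hidden

        MatrixXd W = MatrixXd::Random(input_vocab_size, input_embedding_dimension); // input embeddings
        MatrixXd U = MatrixXd::Random(rows, context_size * input_embedding_dimension);

        MatrixXd merged(rows, context_size * input_vocab_size);
        for (int i = 0; i < context_size; i++)
            merged.middleCols(i * input_vocab_size, input_vocab_size) =
                U.middleCols(i * input_embedding_dimension, input_embedding_dimension) * W.transpose();
        return 0;
    }
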
diff --git a/src/neuralNetwork.h b/src/neuralNetwork.h
index 84b0a79..fd451f1 100644
--- a/src/neuralNetwork.h
+++ b/src/neuralNetwork.h
@@ -96,13 +96,19 @@ public:
if (normalization)
{
Eigen::Matrix<double,Eigen::Dynamic,1> scores(m->output_vocab_size);
- prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
+ if (prop.skip_hidden)
+ prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores);
+ else
+ prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
double logz = logsum(scores.col(0));
log_prob = weight * (scores(output, 0) - logz);
}
else
{
- log_prob = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, 0);
+ if (prop.skip_hidden)
+ log_prob = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, 0);
+ else
+ log_prob = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, 0);
}
stop_timer(3);
@@ -135,7 +141,10 @@ public:
if (normalization)
{
Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> scores(m->output_vocab_size, ngram.cols());
- prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
+ if (prop.skip_hidden)
+ prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores);
+ else
+ prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
// And softmax and loss
Matrix<double,Dynamic,Dynamic> output_probs(m->output_vocab_size, ngram.cols());
@@ -152,7 +161,10 @@ public:
for (int j=0; j<ngram.cols(); j++)
{
int output = ngram(m->ngram_size-1, j);
- log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, j);
+ if (prop.skip_hidden)
+ log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, j);
+ else
+ log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, j);
}
}
}
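
In the scoring functions above, the output word embedding layer now reads from whichever activation node comes last: first_hidden_activation_node when skip_hidden is set, second_hidden_activation_node otherwise. With normalization enabled, the log-probability of the observed word is its score minus the log of the summed exponentiated scores over the output vocabulary. A short sketch of that normalization step, with the log-sum-exp written out explicitly rather than calling nplm's logsum() helper:

    // Normalized scoring for a single column of scores.
    #include <Eigen/Dense>
    #include <cmath>
    #include <iostream>

    int main()
    {
        Eigen::VectorXd scores = Eigen::VectorXd::Random(10); // one score per output word
        const int output = 3;                                  // index of the observed word

        // Stable log-sum-exp over the vocabulary, then subtract from the score.
        const double max = scores.maxCoeff();
        const double logz = max + std::log((scores.array() - max).exp().sum());
        const double log_prob = scores(output) - logz;
        std::cout << log_prob << std::endl;
        return 0;
    }
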
diff --git a/src/propagator.h b/src/propagator.h
index df8a7c2..9f214de 100644
--- a/src/propagator.h
+++ b/src/propagator.h
@@ -23,6 +23,7 @@ public:
Node<Linear_layer> second_hidden_linear_node;
Node<Activation_function> second_hidden_activation_node;
Node<Output_word_embeddings> output_layer_node;
+ bool skip_hidden;
public:
propagator () : minibatch_size(0), pnn(0) { }
@@ -38,6 +39,7 @@ public:
output_layer_node(&nn.output_layer, minibatch_size),
minibatch_size(minibatch_size)
{
+ skip_hidden = (nn.num_hidden == 0);
}
// This must be called if the underlying model is resized.
@@ -83,12 +85,14 @@ public:
stop_timer(1);
+ if (!skip_hidden) {
start_timer(2);
second_hidden_linear_node.param->fProp(first_hidden_activation_node.fProp_matrix,
second_hidden_linear_node.fProp_matrix);
second_hidden_activation_node.param->fProp(second_hidden_linear_node.fProp_matrix,
second_hidden_activation_node.fProp_matrix);
stop_timer(2);
+ }
// The propagation stops here because the last layer is very expensive.
}
@@ -112,19 +116,20 @@ public:
stop_timer(7);
start_timer(8);
+ Node<Activation_function> & final_hidden_activation_node = skip_hidden ? first_hidden_activation_node : second_hidden_activation_node;
if (parameter_update == "SGD") {
- output_layer_node.param->computeGradient(second_hidden_activation_node.fProp_matrix,
+ output_layer_node.param->computeGradient(final_hidden_activation_node.fProp_matrix,
output,
learning_rate,
momentum);
} else if (parameter_update == "ADA") {
- output_layer_node.param->computeGradientAdagrad(second_hidden_activation_node.fProp_matrix,
+ output_layer_node.param->computeGradientAdagrad(final_hidden_activation_node.fProp_matrix,
output,
learning_rate);
} else if (parameter_update == "ADAD") {
//std::cerr<<"Adadelta gradient"<<endl;
- int current_minibatch_size = second_hidden_activation_node.fProp_matrix.cols();
- output_layer_node.param->computeGradientAdadelta(second_hidden_activation_node.fProp_matrix,
+ int current_minibatch_size = final_hidden_activation_node.fProp_matrix.cols();
+ output_layer_node.param->computeGradientAdadelta(final_hidden_activation_node.fProp_matrix,
output,
1.0/current_minibatch_size,
conditioning_constant,
@@ -166,21 +171,22 @@ public:
start_timer(8);
+ Node<Activation_function> & final_hidden_activation_node = skip_hidden ? first_hidden_activation_node : second_hidden_activation_node;
if (parameter_update == "SGD") {
- output_layer_node.param->computeGradient(second_hidden_activation_node.fProp_matrix,
+ output_layer_node.param->computeGradient(final_hidden_activation_node.fProp_matrix,
samples,
weights,
learning_rate,
momentum);
} else if (parameter_update == "ADA") {
- output_layer_node.param->computeGradientAdagrad(second_hidden_activation_node.fProp_matrix,
+ output_layer_node.param->computeGradientAdagrad(final_hidden_activation_node.fProp_matrix,
samples,
weights,
learning_rate);
} else if (parameter_update == "ADAD") {
- int current_minibatch_size = second_hidden_activation_node.fProp_matrix.cols();
+ int current_minibatch_size = final_hidden_activation_node.fProp_matrix.cols();
//std::cerr<<"Adadelta gradient"<<endl;
- output_layer_node.param->computeGradientAdadelta(second_hidden_activation_node.fProp_matrix,
+ output_layer_node.param->computeGradientAdadelta(final_hidden_activation_node.fProp_matrix,
samples,
weights,
1.0/current_minibatch_size,
@@ -217,7 +223,22 @@ private:
// functions are together
////////BACKPROP////////////
start_timer(9);
- second_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix,
+ if (skip_hidden)
+ {
+ start_timer(9);
+ first_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix,
+ first_hidden_activation_node.bProp_matrix,
+ first_hidden_linear_node.fProp_matrix,
+ first_hidden_activation_node.fProp_matrix);
+
+ first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix,
+ first_hidden_linear_node.bProp_matrix);
+ stop_timer(9);
+
+ }
+ else
+ {
+ second_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix,
second_hidden_activation_node.bProp_matrix,
second_hidden_linear_node.fProp_matrix,
second_hidden_activation_node.fProp_matrix);
@@ -233,13 +254,16 @@ private:
first_hidden_linear_node.fProp_matrix,
first_hidden_activation_node.fProp_matrix);
- first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix,
+ first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix,
first_hidden_linear_node.bProp_matrix);
stop_timer(11);
+ }
//std::cerr<<"First hidden layer node backprop matrix is"<<first_hidden_linear_node.bProp_matrix<<std::endl;
//std::getchar();
////COMPUTE GRADIENT/////////
if (parameter_update == "SGD") {
+ if (!skip_hidden)
+ {
start_timer(10);
second_hidden_linear_node.param->computeGradient(second_hidden_activation_node.bProp_matrix,
first_hidden_activation_node.fProp_matrix,
@@ -247,6 +271,7 @@ private:
momentum,
L2_reg);
stop_timer(10);
+ }
// First hidden layer
@@ -265,12 +290,15 @@ private:
learning_rate, momentum, L2_reg);
stop_timer(13);
} else if (parameter_update == "ADA") {
+ if (!skip_hidden)
+ {
start_timer(10);
second_hidden_linear_node.param->computeGradientAdagrad(second_hidden_activation_node.bProp_matrix,
first_hidden_activation_node.fProp_matrix,
learning_rate,
L2_reg);
stop_timer(10);
+ }
// First hidden layer
@@ -293,6 +321,8 @@ private:
} else if (parameter_update == "ADAD") {
int current_minibatch_size = first_hidden_activation_node.fProp_matrix.cols();
//std::cerr<<"Adadelta gradient"<<endl;
+ if (!skip_hidden)
+ {
start_timer(10);
second_hidden_linear_node.param->computeGradientAdadelta(second_hidden_activation_node.bProp_matrix,
first_hidden_activation_node.fProp_matrix,
@@ -301,6 +331,7 @@ private:
conditioning_constant,
decay);
stop_timer(10);
+ }
//std::cerr<<"Finished gradient for second hidden linear layer"<<std::endl;
// First hidden layer
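
The propagator changes mirror the forward shortcut in the backward pass: a final_hidden_activation_node reference is bound once to either the first or the second hidden activation node and the output-layer gradient always reads from it, the output layer's error is backpropagated directly into the first hidden layer when skip_hidden is set, and every update rule for the second hidden linear layer (SGD, Adagrad, Adadelta) is wrapped in if (!skip_hidden) so the 1x1 placeholder is never updated. A minimal sketch of the bind-once reference pattern; the Node struct and compute_output_gradient() are hypothetical stand-ins, not nplm's templates.

    #include <Eigen/Dense>

    struct Node {
        Eigen::MatrixXd fProp_matrix;
    };

    void compute_output_gradient(const Eigen::MatrixXd&) { /* placeholder for the update rules */ }

    int main()
    {
        Node first_hidden_activation_node;
        Node second_hidden_activation_node;
        first_hidden_activation_node.fProp_matrix  = Eigen::MatrixXd::Random(5, 8);
        second_hidden_activation_node.fProp_matrix = Eigen::MatrixXd::Random(5, 8);
        const bool skip_hidden = true;

        // Bound once; SGD, Adagrad and Adadelta all read the same node afterwards.
        Node& final_hidden_activation_node =
            skip_hidden ? first_hidden_activation_node : second_hidden_activation_node;
        compute_output_gradient(final_hidden_activation_node.fProp_matrix);
        return 0;
    }
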
diff --git a/src/testNeuralNetwork.cpp b/src/testNeuralNetwork.cpp
index 3e1640e..c60f9c5 100644
--- a/src/testNeuralNetwork.cpp
+++ b/src/testNeuralNetwork.cpp
@@ -101,7 +101,11 @@ int main (int argc, char *argv[])
prop.fProp(minibatch.topRows(myParam.ngram_size-1));
// Do full forward prop through output word embedding layer
- prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
+ if (prop.skip_hidden)
+ prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores);
+ else
+ prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
+
// And softmax and loss
double minibatch_log_likelihood;
diff --git a/src/trainNeuralNetwork.cpp b/src/trainNeuralNetwork.cpp
index 9a4b486..21a70d6 100644
--- a/src/trainNeuralNetwork.cpp
+++ b/src/trainNeuralNetwork.cpp
@@ -609,7 +609,11 @@ int main(int argc, char** argv)
// Final forward propagation step (sparse)
start_timer(4);
- prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix,
+ if (prop.skip_hidden)
+ prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix,
+ minibatch_samples, scores);
+ else
+ prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix,
minibatch_samples, scores);
stop_timer(4);
@@ -663,7 +667,10 @@ int main(int argc, char** argv)
{
///// Standard log-likelihood
start_timer(4);
- prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
+ if (prop.skip_hidden)
+ prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores);
+ else
+ prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
stop_timer(4);
double minibatch_log_likelihood;
@@ -743,7 +750,10 @@ int main(int argc, char** argv)
// Do full forward prop through output word embedding layer
start_timer(4);
- prop_validation.output_layer_node.param->fProp(prop_validation.second_hidden_activation_node.fProp_matrix, scores);
+ if (prop_validation.skip_hidden)
+ prop_validation.output_layer_node.param->fProp(prop_validation.first_hidden_activation_node.fProp_matrix, scores);
+ else
+ prop_validation.output_layer_node.param->fProp(prop_validation.second_hidden_activation_node.fProp_matrix, scores);
stop_timer(4);
// And softmax and loss. Be careful of short minibatch