From e34889509fa285cde357515469d9eb3c08c4c9f7 Mon Sep 17 00:00:00 2001
From: Rico Sennrich
Date: Mon, 17 Nov 2014 11:36:36 +0000
Subject: re-apply 64120 (support for networks with single hidden layer)

---
 src/model.cpp              | 27 +++++++++++++++++++-----
 src/neuralNetwork.h        | 20 ++++++++++++++----
 src/propagator.h           | 51 +++++++++++++++++++++++++++++++++++++---------
 src/testNeuralNetwork.cpp  |  6 +++++-
 src/trainNeuralNetwork.cpp | 16 ++++++++++++---
 5 files changed, 97 insertions(+), 23 deletions(-)

diff --git a/src/model.cpp b/src/model.cpp
index 262490f..3767f4b 100644
--- a/src/model.cpp
+++ b/src/model.cpp
@@ -20,10 +20,20 @@ void model::resize(int ngram_size,
     int output_embedding_dimension)
 {
     input_layer.resize(input_vocab_size, input_embedding_dimension, ngram_size-1);
-    first_hidden_linear.resize(num_hidden, input_embedding_dimension*(ngram_size-1));
-    first_hidden_activation.resize(num_hidden);
-    second_hidden_linear.resize(output_embedding_dimension, num_hidden);
-    second_hidden_activation.resize(output_embedding_dimension);
+    if (num_hidden == 0)
+    {
+        first_hidden_linear.resize(output_embedding_dimension, input_embedding_dimension*(ngram_size-1));
+        first_hidden_activation.resize(output_embedding_dimension);
+        second_hidden_linear.resize(1,1);
+        second_hidden_activation.resize(1);
+    }
+    else
+    {
+        first_hidden_linear.resize(num_hidden, input_embedding_dimension*(ngram_size-1));
+        first_hidden_activation.resize(num_hidden);
+        second_hidden_linear.resize(output_embedding_dimension, num_hidden);
+        second_hidden_activation.resize(output_embedding_dimension);
+    }
     output_layer.resize(output_vocab_size, output_embedding_dimension);
     this->ngram_size = ngram_size;
     this->input_vocab_size = input_vocab_size;
@@ -70,7 +80,14 @@ void model::premultiply()
     // we can multiply them into a single linear layer *if* we are not training
     int context_size = ngram_size-1;
     Matrix<double,Dynamic,Dynamic> U = first_hidden_linear.U;
-    first_hidden_linear.U.resize(num_hidden, input_vocab_size * context_size);
+    if (num_hidden == 0)
+    {
+        first_hidden_linear.U.resize(output_embedding_dimension, input_vocab_size * context_size);
+    }
+    else
+    {
+        first_hidden_linear.U.resize(num_hidden, input_vocab_size * context_size);
+    }
     for (int i=0; i<context_size; i++)
         first_hidden_linear.U.middleCols(i*input_vocab_size, input_vocab_size) = U.middleCols(i*input_embedding_dimension, input_embedding_dimension)*input_layer.W->transpose();
     input_layer.W->resize(1,1); // try to save some memory
diff --git a/src/neuralNetwork.h b/src/neuralNetwork.h
index 84b0a79..fd451f1 100644
--- a/src/neuralNetwork.h
+++ b/src/neuralNetwork.h
@@ -96,13 +96,19 @@ public:
       if (normalization)
       {
         Eigen::Matrix<double,Eigen::Dynamic,1> scores(m->output_vocab_size);
-        prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
+        if (prop.skip_hidden)
+          prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores);
+        else
+          prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
         double logz = logsum(scores.col(0));
         log_prob = weight * (scores(output, 0) - logz);
       }
       else
       {
-        log_prob = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, 0);
+        if (prop.skip_hidden)
+          log_prob = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, 0);
+        else
+          log_prob = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, 0);
       }
       stop_timer(3);
 
@@ -135,7 +141,10 @@ public:
       if (normalization)
       {
         Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> scores(m->output_vocab_size, ngram.cols());
-        prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
+        if (prop.skip_hidden)
+          prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores);
+        else
+          prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
 
         // And softmax and loss
         Matrix<double,Eigen::Dynamic,Eigen::Dynamic> output_probs(m->output_vocab_size, ngram.cols());
@@ -152,7 +161,10 @@ public:
         for (int j=0; j<ngram.cols(); j++)
         {
           int output = ngram(m->ngram_size-1, j);
-          log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, j);
+          if (prop.skip_hidden)
+            log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, j);
+          else
+            log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, j);
         }
       }
     }
diff --git a/src/propagator.h b/src/propagator.h
index df8a7c2..9f214de 100644
--- a/src/propagator.h
+++ b/src/propagator.h
@@ -23,6 +23,7 @@ public:
     Node<Linear_layer> second_hidden_linear_node;
     Node<Activation_function> second_hidden_activation_node;
     Node<Output_word_embeddings> output_layer_node;
+    bool skip_hidden;
 
 public:
     propagator () : minibatch_size(0), pnn(0) { }
@@ -38,6 +39,7 @@ public:
       output_layer_node(&nn.output_layer, minibatch_size),
       minibatch_size(minibatch_size)
     {
+        skip_hidden = (nn.num_hidden == 0);
     }
 
     // This must be called if the underlying model is resized.
@@ -83,12 +85,14 @@ public:
 
     stop_timer(1);
 
+    if (!skip_hidden) {
     start_timer(2);
     second_hidden_linear_node.param->fProp(first_hidden_activation_node.fProp_matrix,
                                            second_hidden_linear_node.fProp_matrix);
     second_hidden_activation_node.param->fProp(second_hidden_linear_node.fProp_matrix,
                                                second_hidden_activation_node.fProp_matrix);
     stop_timer(2);
+    }
 
     // The propagation stops here because the last layer is very expensive.
 }
@@ -112,19 +116,20 @@ public:
     stop_timer(7);
     start_timer(8);
+    Node<Activation_function> & final_hidden_activation_node = skip_hidden ? first_hidden_activation_node : second_hidden_activation_node;
     if (parameter_update == "SGD") {
-      output_layer_node.param->computeGradient(second_hidden_activation_node.fProp_matrix,
+      output_layer_node.param->computeGradient(final_hidden_activation_node.fProp_matrix,
                                                output,
                                                learning_rate,
                                                momentum);
     } else if (parameter_update == "ADA") {
-      output_layer_node.param->computeGradientAdagrad(second_hidden_activation_node.fProp_matrix,
+      output_layer_node.param->computeGradientAdagrad(final_hidden_activation_node.fProp_matrix,
                                                       output,
                                                       learning_rate);
     } else if (parameter_update == "ADAD") {
       //std::cerr<<"Adadelta gradient"<<std::endl;
-      int current_minibatch_size = second_hidden_activation_node.fProp_matrix.cols();
-      output_layer_node.param->computeGradientAdadelta(second_hidden_activation_node.fProp_matrix,
+      int current_minibatch_size = final_hidden_activation_node.fProp_matrix.cols();
+      output_layer_node.param->computeGradientAdadelta(final_hidden_activation_node.fProp_matrix,
                                                        output,
                                                        1.0/current_minibatch_size,
                                                        conditioning_constant,
                                                        decay);
@@ -166,21 +171,22 @@ public:
     start_timer(8);
+    Node<Activation_function> & final_hidden_activation_node = skip_hidden ? first_hidden_activation_node : second_hidden_activation_node;
     if (parameter_update == "SGD") {
-      output_layer_node.param->computeGradient(second_hidden_activation_node.fProp_matrix,
+      output_layer_node.param->computeGradient(final_hidden_activation_node.fProp_matrix,
                                                samples,
                                                weights,
                                                learning_rate,
                                                momentum);
     } else if (parameter_update == "ADA") {
-      output_layer_node.param->computeGradientAdagrad(second_hidden_activation_node.fProp_matrix,
+      output_layer_node.param->computeGradientAdagrad(final_hidden_activation_node.fProp_matrix,
                                                       samples,
                                                       weights,
                                                       learning_rate);
     } else if (parameter_update == "ADAD") {
-      int current_minibatch_size = second_hidden_activation_node.fProp_matrix.cols();
+      int current_minibatch_size = final_hidden_activation_node.fProp_matrix.cols();
       //std::cerr<<"Adadelta gradient"<<std::endl;
-      output_layer_node.param->computeGradientAdadelta(second_hidden_activation_node.fProp_matrix,
+      output_layer_node.param->computeGradientAdadelta(final_hidden_activation_node.fProp_matrix,
                                                        samples,
                                                        weights,
                                                        1.0/current_minibatch_size,
@@ -217,7 +223,22 @@ private:
     // functions are together
     ////////BACKPROP////////////
     start_timer(9);
-    second_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix,
+    if (skip_hidden)
+    {
+      start_timer(9);
+      first_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix,
+                                                first_hidden_activation_node.bProp_matrix,
+                                                first_hidden_linear_node.fProp_matrix,
+                                                first_hidden_activation_node.fProp_matrix);
+
+      first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix,
+                                            first_hidden_linear_node.bProp_matrix);
+      stop_timer(9);
+
+    }
+    else
+    {
+    second_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix,
                                                second_hidden_activation_node.bProp_matrix,
                                                second_hidden_linear_node.fProp_matrix,
                                                second_hidden_activation_node.fProp_matrix);
@@ -233,13 +254,16 @@ private:
                                              first_hidden_linear_node.fProp_matrix,
                                              first_hidden_activation_node.fProp_matrix);
 
-    first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix,
+      first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix,
                                            first_hidden_linear_node.bProp_matrix);
     stop_timer(11);
+    }
 
     //std::cerr<<"First hidden layer node backprop matrix is"<<first_hidden_linear_node.bProp_matrix<<std::endl;
 
     if (parameter_update == "SGD") {
+      if (!skip_hidden)
+      {
       start_timer(10);
       second_hidden_linear_node.param->computeGradient(second_hidden_activation_node.bProp_matrix,
                                                        first_hidden_activation_node.fProp_matrix,
                                                        learning_rate, momentum, L2_reg);
       stop_timer(10);
+      }
 
       // First hidden layer
@@ -265,12 +290,15 @@ private:
                                                    learning_rate, momentum, L2_reg);
       stop_timer(13);
     } else if (parameter_update == "ADA") {
+      if (!skip_hidden)
+      {
       start_timer(10);
       second_hidden_linear_node.param->computeGradientAdagrad(second_hidden_activation_node.bProp_matrix,
                                                               first_hidden_activation_node.fProp_matrix,
                                                               learning_rate, L2_reg);
       stop_timer(10);
+      }
 
       // First hidden layer
@@ -293,6 +321,8 @@
     } else if (parameter_update == "ADAD") {
       int current_minibatch_size = first_hidden_activation_node.fProp_matrix.cols();
       //std::cerr<<"Adadelta gradient"<<std::endl;
+      if (!skip_hidden)
+      {
       start_timer(10);
       second_hidden_linear_node.param->computeGradientAdadelta(second_hidden_activation_node.bProp_matrix,
                                                                first_hidden_activation_node.fProp_matrix,
                                                                1.0/current_minibatch_size,
                                                                conditioning_constant,
                                                                decay);
       stop_timer(10);
+      }
 
       //std::cerr<<"Finished gradient for second hidden linear layer"<