author     Rico Sennrich <rico.sennrich@gmx.ch>  2014-10-16 17:17:28 +0400
committer  Rico Sennrich <rico.sennrich@gmx.ch>  2014-10-16 17:22:34 +0400
commit     641207503691f5fda73492b2203ac51be2e05307 (patch)
tree       fb2bbf6edf17690116441aeea9541c18cc204ec1
parent     84b1e84e87626dfb1382f0bafcdbe55733f87922 (diff)
support for networks with a single hidden layer
(set num_hidden to 0; num_output_embeddings will be the size of the remaining layer)
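In effect, setting num_hidden to 0 makes the first linear layer map the concatenated context embeddings straight to a layer of output_embedding_dimension units, which then feeds the output word embeddings; the second hidden layer is shrunk to a 1x1 dummy and bypassed everywhere via the new skip_hidden flag. A minimal sketch of the two forward paths, assuming plain Eigen matrices and a rectifier activation (in nplm the activation function is configurable, and all names below are illustrative only):

#include <Eigen/Dense>
using Eigen::MatrixXd;
using Eigen::VectorXd;

// x  : concatenation of the (ngram_size-1) input embeddings
// W1 : plays the role of first_hidden_linear
// W2 : plays the role of second_hidden_linear
// The returned vector is what the output word embedding layer consumes.
VectorXd hidden_fprop(const VectorXd &x, const MatrixXd &W1,
                      const MatrixXd &W2, bool skip_hidden)
{
    VectorXd h1 = (W1 * x).cwiseMax(0.0);  // first hidden linear + activation
    if (skip_hidden)
        return h1;  // num_hidden == 0: h1 already has output_embedding_dimension rows
    return (W2 * h1).cwiseMax(0.0);        // second hidden linear + activation
}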
 src/model.cpp              | 23
 src/neuralLM.h             | 20
 src/propagator.h           | 91
 src/testNeuralNetwork.cpp  |  5
 src/trainNeuralNetwork.cpp | 16

5 files changed, 112 insertions(+), 43 deletions(-)
diff --git a/src/model.cpp b/src/model.cpp
index 3611975..589a52e 100644
--- a/src/model.cpp
+++ b/src/model.cpp
@@ -20,10 +20,18 @@ namespace nplm
                   int output_embedding_dimension)
 {
     input_layer.resize(input_vocab_size, input_embedding_dimension, ngram_size-1);
-    first_hidden_linear.resize(num_hidden, input_embedding_dimension*(ngram_size-1));
-    first_hidden_activation.resize(num_hidden);
-    second_hidden_linear.resize(output_embedding_dimension, num_hidden);
-    second_hidden_activation.resize(output_embedding_dimension);
+    if (num_hidden == 0) {
+        first_hidden_linear.resize(output_embedding_dimension, input_embedding_dimension*(ngram_size-1));
+        first_hidden_activation.resize(output_embedding_dimension);
+        second_hidden_linear.resize(1,1);
+        second_hidden_activation.resize(1);
+    }
+    else {
+        first_hidden_linear.resize(num_hidden, input_embedding_dimension*(ngram_size-1));
+        first_hidden_activation.resize(num_hidden);
+        second_hidden_linear.resize(output_embedding_dimension, num_hidden);
+        second_hidden_activation.resize(output_embedding_dimension);
+    }
     output_layer.resize(output_vocab_size, output_embedding_dimension);
     this->ngram_size = ngram_size;
     this->input_vocab_size = input_vocab_size;
@@ -48,7 +56,12 @@ void model::premultiply()
     // we can multiply them into a single linear layer *if* we are not training
     int context_size = ngram_size-1;
     Matrix<double,Dynamic,Dynamic> U = first_hidden_linear.U;
-    first_hidden_linear.U.resize(num_hidden, input_vocab_size * context_size);
+    if (num_hidden == 0) {
+        first_hidden_linear.U.resize(output_embedding_dimension, input_vocab_size * context_size);
+    }
+    else {
+        first_hidden_linear.U.resize(num_hidden, input_vocab_size * context_size);
+    }
     for (int i=0; i<context_size; i++)
         first_hidden_linear.U.middleCols(i*input_vocab_size, input_vocab_size) = U.middleCols(i*input_embedding_dimension, input_embedding_dimension) * input_layer.W->transpose();
     input_layer.W->resize(1,1); // try to save some memory
diff --git a/src/neuralLM.h b/src/neuralLM.h
index 0b8b72f..f451e8a 100644
--- a/src/neuralLM.h
+++ b/src/neuralLM.h
@@ -177,13 +177,19 @@ public:
         if (normalization)
         {
             Eigen::Matrix<double,Eigen::Dynamic,1> scores(shared->output_vocab.size());
-            prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
+            if (prop.skip_hidden)
+                prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores);
+            else
+                prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
             double logz = logsum(scores.col(0));
             log_prob = weight * (scores(output, 0) - logz);
         }
         else
         {
-            log_prob = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, 0);
+            if (prop.skip_hidden)
+                log_prob = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, 0);
+            else
+                log_prob = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, 0);
         }
 
         stop_timer(3);
@@ -216,7 +222,10 @@ public:
         if (normalization)
         {
             Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> scores(shared->output_vocab.size(), ngram.cols());
-            prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
+            if (prop.skip_hidden)
+                prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores);
+            else
+                prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
 
             // And softmax and loss
             Matrix<double,Dynamic,Dynamic> output_probs(shared->nn.output_vocab_size, ngram.cols());
@@ -233,7 +242,10 @@ public:
             for (int j=0; j<ngram.cols(); j++)
             {
                 int output = ngram(ngram_size-1, j);
-                log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, j);
+                if (prop.skip_hidden)
+                    log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, j);
+                else
+                    log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, j);
             }
         }
     }
diff --git a/src/propagator.h b/src/propagator.h
index 0619de6..c52a6a9 100644
--- a/src/propagator.h
+++ b/src/propagator.h
@@ -23,6 +23,7 @@ public:
     Node<Linear_layer> second_hidden_linear_node;
     Node<Activation_function> second_hidden_activation_node;
     Node<Output_word_embeddings> output_layer_node;
+    bool skip_hidden;
 
 public:
     propagator () : minibatch_size(0), pnn(0) { }
@@ -39,6 +40,7 @@ public:
       output_layer_node(const_cast<Output_word_embeddings*>(&nn.output_layer), minibatch_size),
       minibatch_size(minibatch_size)
     {
+        skip_hidden = (nn.num_hidden == 0);
     }
 
     // This must be called if the underlying model is resized.
@@ -82,12 +84,14 @@ public:
         stop_timer(1);
 
+        if (!skip_hidden) {
         start_timer(2);
         second_hidden_linear_node.param->fProp(first_hidden_activation_node.fProp_matrix,
                                                second_hidden_linear_node.fProp_matrix);
         second_hidden_activation_node.param->fProp(second_hidden_linear_node.fProp_matrix,
                                                    second_hidden_activation_node.fProp_matrix);
         stop_timer(2);
+        }
 
         // The propagation stops here because the last layer is very expensive.
     }
@@ -106,9 +110,16 @@ public:
         stop_timer(7);
 
         start_timer(8);
+        if (skip_hidden) {
+            output_layer_node.param->computeGradient(first_hidden_activation_node.fProp_matrix,
+                                                     output,
+                                                     learning_rate, momentum);
+        }
+        else {
         output_layer_node.param->computeGradient(second_hidden_activation_node.fProp_matrix,
                                                  output,
                                                  learning_rate, momentum);
+        }
         stop_timer(8);
 
         bPropRest(data, learning_rate, momentum, L2_reg);
@@ -130,9 +141,16 @@ public:
         start_timer(8);
-        output_layer_node.param->computeGradient(second_hidden_activation_node.fProp_matrix,
-                                                 samples, weights,
-                                                 learning_rate, momentum);
+        if (skip_hidden) {
+            output_layer_node.param->computeGradient(first_hidden_activation_node.fProp_matrix,
+                                                     samples, weights,
+                                                     learning_rate, momentum);
+        }
+        else {
+            output_layer_node.param->computeGradient(second_hidden_activation_node.fProp_matrix,
+                                                     samples, weights,
+                                                     learning_rate, momentum);
+        }
         stop_timer(8);
 
         bPropRest(data, learning_rate, momentum, L2_reg);
@@ -145,33 +163,46 @@ private:
     {
         // Second hidden layer
-        start_timer(9);
-        second_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix,
-                                                   second_hidden_activation_node.bProp_matrix,
-                                                   second_hidden_linear_node.fProp_matrix,
-                                                   second_hidden_activation_node.fProp_matrix);
-
-        second_hidden_linear_node.param->bProp(second_hidden_activation_node.bProp_matrix,
-                                               second_hidden_linear_node.bProp_matrix);
-        stop_timer(9);
-
-        start_timer(10);
-        second_hidden_linear_node.param->computeGradient(second_hidden_activation_node.bProp_matrix,
-                                                         first_hidden_activation_node.fProp_matrix,
-                                                         learning_rate, momentum, L2_reg);
-        stop_timer(10);
-
-        // First hidden layer
-
-        start_timer(11);
-        first_hidden_activation_node.param->bProp(second_hidden_linear_node.bProp_matrix,
-                                                  first_hidden_activation_node.bProp_matrix,
-                                                  first_hidden_linear_node.fProp_matrix,
-                                                  first_hidden_activation_node.fProp_matrix);
-
-        first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix,
-                                              first_hidden_linear_node.bProp_matrix);
-        stop_timer(11);
+        if (skip_hidden) {
+            start_timer(9);
+            first_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix,
+                                                      first_hidden_activation_node.bProp_matrix,
+                                                      first_hidden_linear_node.fProp_matrix,
+                                                      first_hidden_activation_node.fProp_matrix);
+
+            first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix,
+                                                  first_hidden_linear_node.bProp_matrix);
+            stop_timer(9);
+        }
+        else {
+            start_timer(9);
+            second_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix,
+                                                       second_hidden_activation_node.bProp_matrix,
+                                                       second_hidden_linear_node.fProp_matrix,
+                                                       second_hidden_activation_node.fProp_matrix);
+
+            second_hidden_linear_node.param->bProp(second_hidden_activation_node.bProp_matrix,
+                                                   second_hidden_linear_node.bProp_matrix);
+            stop_timer(9);
+
+            start_timer(10);
+            second_hidden_linear_node.param->computeGradient(second_hidden_activation_node.bProp_matrix,
+                                                             first_hidden_activation_node.fProp_matrix,
+                                                             learning_rate, momentum, L2_reg);
+            stop_timer(10);
+
+            // First hidden layer
+
+            start_timer(11);
+            first_hidden_activation_node.param->bProp(second_hidden_linear_node.bProp_matrix,
+                                                      first_hidden_activation_node.bProp_matrix,
+                                                      first_hidden_linear_node.fProp_matrix,
+                                                      first_hidden_activation_node.fProp_matrix);
+
+            first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix,
+                                                  first_hidden_linear_node.bProp_matrix);
+            stop_timer(11);
+        }
 
         start_timer(12);
         first_hidden_linear_node.param->computeGradient(first_hidden_activation_node.bProp_matrix,
diff --git a/src/testNeuralNetwork.cpp b/src/testNeuralNetwork.cpp
index f20fff9..1b4820e 100644
--- a/src/testNeuralNetwork.cpp
+++ b/src/testNeuralNetwork.cpp
@@ -101,7 +101,10 @@ int main (int argc, char *argv[])
         prop.fProp(minibatch.topRows(myParam.ngram_size-1));
 
         // Do full forward prop through output word embedding layer
-        prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
+        if (prop.skip_hidden)
+            prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores);
+        else
+            prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
 
         // And softmax and loss
         double minibatch_log_likelihood;
diff --git a/src/trainNeuralNetwork.cpp b/src/trainNeuralNetwork.cpp
index a088e16..57323d9 100644
--- a/src/trainNeuralNetwork.cpp
+++ b/src/trainNeuralNetwork.cpp
@@ -442,7 +442,11 @@ int main(int argc, char** argv)
             // Final forward propagation step (sparse)
             start_timer(4);
-            prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix,
+            if (prop.skip_hidden)
+                prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix,
+                                                    minibatch_samples, scores);
+            else
+                prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix,
                                                 minibatch_samples, scores);
             stop_timer(4);
@@ -491,7 +495,10 @@ int main(int argc, char** argv)
         {
             ///// Standard log-likelihood
             start_timer(4);
-            prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
+            if (prop.skip_hidden)
+                prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores);
+            else
+                prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
             stop_timer(4);
 
             double minibatch_log_likelihood;
@@ -566,7 +573,10 @@ int main(int argc, char** argv)
             // Do full forward prop through output word embedding layer
             start_timer(4);
-            prop_validation.output_layer_node.param->fProp(prop_validation.second_hidden_activation_node.fProp_matrix, scores);
+            if (prop_validation.skip_hidden)
+                prop_validation.output_layer_node.param->fProp(prop_validation.first_hidden_activation_node.fProp_matrix, scores);
+            else
+                prop_validation.output_layer_node.param->fProp(prop_validation.second_hidden_activation_node.fProp_matrix, scores);
             stop_timer(4);
 
             // And softmax and loss. Be careful of short minibatch
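Note that the same skip_hidden dispatch is repeated verbatim at every output-layer call site above. A possible follow-up, sketched here as a hypothetical helper that is not part of this commit (and assuming the nodes' fProp_matrix members share one dense Eigen matrix type, as they do elsewhere in the codebase), would hoist the choice into a single place:

// Hypothetical helper, not in the repository: pick whichever activation
// matrix actually feeds the output layer.
template <class Prop>
const Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> &
output_layer_input(const Prop &prop)
{
    return prop.skip_hidden ? prop.first_hidden_activation_node.fProp_matrix
                            : prop.second_hidden_activation_node.fProp_matrix;
}

Call sites would then collapse to a single line, e.g. prop.output_layer_node.param->fProp(output_layer_input(prop), scores);, removing the repeated branches.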