| author | graehl <graehl@gmail.com> | 2015-06-25 21:38:54 +0300 |
|---|---|---|
| committer | graehl <graehl@gmail.com> | 2015-06-25 22:13:53 +0300 |
| commit | 363c73cacf94d965a8759ae8b55f56d8c1c29bb1 (patch) | |
| tree | dd281ad350c508d766327988152a900acc851b95 | |
| parent | 5fbf9611d24b6fd80c5839fe547f1edb141fa162 (diff) | |
tab fix
| -rw-r--r-- | .gitignore | 1 |
| -rw-r--r-- | src/model.cpp | 480 |
| -rw-r--r-- | src/neuralClasses.h | 1624 |

3 files changed, 1053 insertions, 1052 deletions
@@ -9,3 +9,4 @@ src/testNeuralLM src/testNeuralNetwork src/trainNeuralNetwork .history +src/make.sh diff --git a/src/model.cpp b/src/model.cpp index 919e005..db7f006 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -13,295 +13,295 @@ namespace nplm { void model::resize(int ngram_size, - int input_vocab_size, - int output_vocab_size, - int input_embedding_dimension, - int num_hidden, - int output_embedding_dimension) + int input_vocab_size, + int output_vocab_size, + int input_embedding_dimension, + int num_hidden, + int output_embedding_dimension) { - input_layer.resize(input_vocab_size, input_embedding_dimension, ngram_size-1); - if (num_hidden == 0) - { - first_hidden_linear.resize(output_embedding_dimension, input_embedding_dimension*(ngram_size-1)); - first_hidden_activation.resize(output_embedding_dimension); - second_hidden_linear.resize(1,1); - second_hidden_activation.resize(1); - } - else - { - first_hidden_linear.resize(num_hidden, input_embedding_dimension*(ngram_size-1)); - first_hidden_activation.resize(num_hidden); - second_hidden_linear.resize(output_embedding_dimension, num_hidden); - second_hidden_activation.resize(output_embedding_dimension); - } - output_layer.resize(output_vocab_size, output_embedding_dimension); - this->ngram_size = ngram_size; - this->input_vocab_size = input_vocab_size; - this->output_vocab_size = output_vocab_size; - this->input_embedding_dimension = input_embedding_dimension; - this->num_hidden = num_hidden; - this->output_embedding_dimension = output_embedding_dimension; - premultiplied = false; + input_layer.resize(input_vocab_size, input_embedding_dimension, ngram_size-1); + if (num_hidden == 0) + { + first_hidden_linear.resize(output_embedding_dimension, input_embedding_dimension*(ngram_size-1)); + first_hidden_activation.resize(output_embedding_dimension); + second_hidden_linear.resize(1,1); + second_hidden_activation.resize(1); + } + else + { + first_hidden_linear.resize(num_hidden, input_embedding_dimension*(ngram_size-1)); + first_hidden_activation.resize(num_hidden); + second_hidden_linear.resize(output_embedding_dimension, num_hidden); + second_hidden_activation.resize(output_embedding_dimension); + } + output_layer.resize(output_vocab_size, output_embedding_dimension); + this->ngram_size = ngram_size; + this->input_vocab_size = input_vocab_size; + this->output_vocab_size = output_vocab_size; + this->input_embedding_dimension = input_embedding_dimension; + this->num_hidden = num_hidden; + this->output_embedding_dimension = output_embedding_dimension; + premultiplied = false; } - + void model::initialize(boost::random::mt19937 &init_engine, - bool init_normal, - double init_range, - double init_bias, - string ¶meter_update, - double adagrad_epsilon) + bool init_normal, + double init_range, + double init_bias, + string ¶meter_update, + double adagrad_epsilon) { - input_layer.initialize(init_engine, - init_normal, - init_range, - parameter_update, - adagrad_epsilon); - output_layer.initialize(init_engine, - init_normal, - init_range, - init_bias, - parameter_update, - adagrad_epsilon); - first_hidden_linear.initialize(init_engine, - init_normal, - init_range, - parameter_update, - adagrad_epsilon); - second_hidden_linear.initialize(init_engine, - init_normal, - init_range, - parameter_update, - adagrad_epsilon); + input_layer.initialize(init_engine, + init_normal, + init_range, + parameter_update, + adagrad_epsilon); + output_layer.initialize(init_engine, + init_normal, + init_range, + init_bias, + parameter_update, + adagrad_epsilon); + 
first_hidden_linear.initialize(init_engine, + init_normal, + init_range, + parameter_update, + adagrad_epsilon); + second_hidden_linear.initialize(init_engine, + init_normal, + init_range, + parameter_update, + adagrad_epsilon); } void model::premultiply() { - // Since input and first_hidden_linear are both linear, - // we can multiply them into a single linear layer *if* we are not training - int context_size = ngram_size-1; - Matrix<double,Dynamic,Dynamic> U = first_hidden_linear.U; - if (num_hidden == 0) - { - first_hidden_linear.U.resize(output_embedding_dimension, input_vocab_size * context_size); - } - else - { - first_hidden_linear.U.resize(num_hidden, input_vocab_size * context_size); - } - for (int i=0; i<context_size; i++) - first_hidden_linear.U.middleCols(i*input_vocab_size, input_vocab_size) = U.middleCols(i*input_embedding_dimension, input_embedding_dimension) * input_layer.W->transpose(); - input_layer.W->resize(1,1); // try to save some memory - premultiplied = true; + // Since input and first_hidden_linear are both linear, + // we can multiply them into a single linear layer *if* we are not training + int context_size = ngram_size-1; + Matrix<double,Dynamic,Dynamic> U = first_hidden_linear.U; + if (num_hidden == 0) + { + first_hidden_linear.U.resize(output_embedding_dimension, input_vocab_size * context_size); + } + else + { + first_hidden_linear.U.resize(num_hidden, input_vocab_size * context_size); + } + for (int i=0; i<context_size; i++) + first_hidden_linear.U.middleCols(i*input_vocab_size, input_vocab_size) = U.middleCols(i*input_embedding_dimension, input_embedding_dimension) * input_layer.W->transpose(); + input_layer.W->resize(1,1); // try to save some memory + premultiplied = true; } void model::readConfig(ifstream &config_file) { - string line; - vector<string> fields; - int ngram_size, vocab_size, input_embedding_dimension, num_hidden, output_embedding_dimension; - activation_function_type activation_function = this->activation_function; - while (getline(config_file, line) && line != "") + string line; + vector<string> fields; + int ngram_size, vocab_size, input_embedding_dimension, num_hidden, output_embedding_dimension; + activation_function_type activation_function = this->activation_function; + while (getline(config_file, line) && line != "") + { + splitBySpace(line, fields); + if (fields[0] == "ngram_size") + ngram_size = lexical_cast<int>(fields[1]); + else if (fields[0] == "vocab_size") + input_vocab_size = output_vocab_size = lexical_cast<int>(fields[1]); + else if (fields[0] == "input_vocab_size") + input_vocab_size = lexical_cast<int>(fields[1]); + else if (fields[0] == "output_vocab_size") + output_vocab_size = lexical_cast<int>(fields[1]); + else if (fields[0] == "input_embedding_dimension") + input_embedding_dimension = lexical_cast<int>(fields[1]); + else if (fields[0] == "num_hidden") + num_hidden = lexical_cast<int>(fields[1]); + else if (fields[0] == "output_embedding_dimension") + output_embedding_dimension = lexical_cast<int>(fields[1]); + else if (fields[0] == "activation_function") + activation_function = string_to_activation_function(fields[1]); + else if (fields[0] == "version") { - splitBySpace(line, fields); - if (fields[0] == "ngram_size") - ngram_size = lexical_cast<int>(fields[1]); - else if (fields[0] == "vocab_size") - input_vocab_size = output_vocab_size = lexical_cast<int>(fields[1]); - else if (fields[0] == "input_vocab_size") - input_vocab_size = lexical_cast<int>(fields[1]); - else if (fields[0] == "output_vocab_size") - 
output_vocab_size = lexical_cast<int>(fields[1]); - else if (fields[0] == "input_embedding_dimension") - input_embedding_dimension = lexical_cast<int>(fields[1]); - else if (fields[0] == "num_hidden") - num_hidden = lexical_cast<int>(fields[1]); - else if (fields[0] == "output_embedding_dimension") - output_embedding_dimension = lexical_cast<int>(fields[1]); - else if (fields[0] == "activation_function") - activation_function = string_to_activation_function(fields[1]); - else if (fields[0] == "version") - { - int version = lexical_cast<int>(fields[1]); - if (version != 1) - { - cerr << "error: file format mismatch (expected 1, found " << version << ")" << endl; - exit(1); - } - } - else - cerr << "warning: unrecognized field in config: " << fields[0] << endl; + int version = lexical_cast<int>(fields[1]); + if (version != 1) + { + cerr << "error: file format mismatch (expected 1, found " << version << ")" << endl; + exit(1); + } } - resize(ngram_size, - input_vocab_size, - output_vocab_size, - input_embedding_dimension, - num_hidden, - output_embedding_dimension); - set_activation_function(activation_function); + else + cerr << "warning: unrecognized field in config: " << fields[0] << endl; + } + resize(ngram_size, + input_vocab_size, + output_vocab_size, + input_embedding_dimension, + num_hidden, + output_embedding_dimension); + set_activation_function(activation_function); } void model::readConfig(const string &filename) { - ifstream config_file(filename.c_str()); - if (!config_file) - { - cerr << "error: could not open config file " << filename << endl; - exit(1); - } - readConfig(config_file); - config_file.close(); + ifstream config_file(filename.c_str()); + if (!config_file) + { + cerr << "error: could not open config file " << filename << endl; + exit(1); + } + readConfig(config_file); + config_file.close(); } - + void model::read(const string &filename) { - vector<string> input_words; - vector<string> output_words; - read(filename, input_words, output_words); + vector<string> input_words; + vector<string> output_words; + read(filename, input_words, output_words); } void model::read(const string &filename, vector<string> &words) { - vector<string> output_words; - read(filename, words, output_words); + vector<string> output_words; + read(filename, words, output_words); } void model::read(const string &filename, vector<string> &input_words, vector<string> &output_words) { - ifstream file(filename.c_str()); - if (!file) throw runtime_error("Could not open file " + filename); - - param myParam; - string line; - - while (getline(file, line)) + ifstream file(filename.c_str()); + if (!file) throw runtime_error("Could not open file " + filename); + + param myParam; + string line; + + while (getline(file, line)) + { + if (line == "\\config") + { + readConfig(file); + } + + else if (line == "\\vocab") + { + input_words.clear(); + readWordsFile(file, input_words); + output_words = input_words; + } + + else if (line == "\\input_vocab") { - if (line == "\\config") - { - readConfig(file); - } - - else if (line == "\\vocab") - { - input_words.clear(); - readWordsFile(file, input_words); - output_words = input_words; - } - - else if (line == "\\input_vocab") - { - input_words.clear(); - readWordsFile(file, input_words); - } - - else if (line == "\\output_vocab") - { - output_words.clear(); - readWordsFile(file, output_words); - } - - else if (line == "\\input_embeddings") - input_layer.read(file); - else if (line == "\\hidden_weights 1") - first_hidden_linear.read_weights(file); - else if (line == 
"\\hidden_biases 1") - first_hidden_linear.read_biases (file); - else if (line == "\\hidden_weights 2") - second_hidden_linear.read_weights(file); - else if (line == "\\hidden_biases 2") - second_hidden_linear.read_biases (file); - else if (line == "\\output_weights") - output_layer.read_weights(file); - else if (line == "\\output_biases") - output_layer.read_biases(file); - else if (line == "\\end") - break; - else if (line == "") - continue; - else - { - cerr << "warning: unrecognized section: " << line << endl; - // skip over section - while (getline(file, line) && line != "") { } - } + input_words.clear(); + readWordsFile(file, input_words); } - file.close(); + + else if (line == "\\output_vocab") + { + output_words.clear(); + readWordsFile(file, output_words); + } + + else if (line == "\\input_embeddings") + input_layer.read(file); + else if (line == "\\hidden_weights 1") + first_hidden_linear.read_weights(file); + else if (line == "\\hidden_biases 1") + first_hidden_linear.read_biases (file); + else if (line == "\\hidden_weights 2") + second_hidden_linear.read_weights(file); + else if (line == "\\hidden_biases 2") + second_hidden_linear.read_biases (file); + else if (line == "\\output_weights") + output_layer.read_weights(file); + else if (line == "\\output_biases") + output_layer.read_biases(file); + else if (line == "\\end") + break; + else if (line == "") + continue; + else + { + cerr << "warning: unrecognized section: " << line << endl; + // skip over section + while (getline(file, line) && line != "") { } + } + } + file.close(); } void model::write(const string &filename, const vector<string> &input_words, const vector<string> &output_words) -{ - write(filename, &input_words, &output_words); +{ + write(filename, &input_words, &output_words); } void model::write(const string &filename, const vector<string> &words) -{ - write(filename, &words, NULL); +{ + write(filename, &words, NULL); } -void model::write(const string &filename) -{ - write(filename, NULL, NULL); +void model::write(const string &filename) +{ + write(filename, NULL, NULL); } void model::write(const string &filename, const vector<string> *input_pwords, const vector<string> *output_pwords) { - ofstream file(filename.c_str()); - if (!file) throw runtime_error("Could not open file " + filename); - - file << "\\config" << endl; - file << "version 1" << endl; - file << "ngram_size " << ngram_size << endl; - file << "input_vocab_size " << input_vocab_size << endl; - file << "output_vocab_size " << output_vocab_size << endl; - file << "input_embedding_dimension " << input_embedding_dimension << endl; - file << "num_hidden " << num_hidden << endl; - file << "output_embedding_dimension " << output_embedding_dimension << endl; - file << "activation_function " << activation_function_to_string(activation_function) << endl; - file << endl; - - if (input_pwords) - { - file << "\\input_vocab" << endl; - writeWordsFile(*input_pwords, file); - file << endl; - } + ofstream file(filename.c_str()); + if (!file) throw runtime_error("Could not open file " + filename); - if (output_pwords) - { - file << "\\output_vocab" << endl; - writeWordsFile(*output_pwords, file); - file << endl; - } + file << "\\config" << endl; + file << "version 1" << endl; + file << "ngram_size " << ngram_size << endl; + file << "input_vocab_size " << input_vocab_size << endl; + file << "output_vocab_size " << output_vocab_size << endl; + file << "input_embedding_dimension " << input_embedding_dimension << endl; + file << "num_hidden " << num_hidden << endl; + 
file << "output_embedding_dimension " << output_embedding_dimension << endl; + file << "activation_function " << activation_function_to_string(activation_function) << endl; + file << endl; - file << "\\input_embeddings" << endl; - input_layer.write(file); - file << endl; - - file << "\\hidden_weights 1" << endl; - first_hidden_linear.write_weights(file); + if (input_pwords) + { + file << "\\input_vocab" << endl; + writeWordsFile(*input_pwords, file); file << endl; + } - file << "\\hidden_biases 1" << endl; - first_hidden_linear.write_biases(file); - file <<endl; - - file << "\\hidden_weights 2" << endl; - second_hidden_linear.write_weights(file); + if (output_pwords) + { + file << "\\output_vocab" << endl; + writeWordsFile(*output_pwords, file); file << endl; + } - file << "\\hidden_biases 2" << endl; - second_hidden_linear.write_biases(file); - file << endl; - - file << "\\output_weights" << endl; - output_layer.write_weights(file); - file << endl; - - file << "\\output_biases" << endl; - output_layer.write_biases(file); - file << endl; - - file << "\\end" << endl; - file.close(); + file << "\\input_embeddings" << endl; + input_layer.write(file); + file << endl; + + file << "\\hidden_weights 1" << endl; + first_hidden_linear.write_weights(file); + file << endl; + + file << "\\hidden_biases 1" << endl; + first_hidden_linear.write_biases(file); + file <<endl; + + file << "\\hidden_weights 2" << endl; + second_hidden_linear.write_weights(file); + file << endl; + + file << "\\hidden_biases 2" << endl; + second_hidden_linear.write_biases(file); + file << endl; + + file << "\\output_weights" << endl; + output_layer.write_weights(file); + file << endl; + + file << "\\output_biases" << endl; + output_layer.write_biases(file); + file << endl; + + file << "\\end" << endl; + file.close(); } diff --git a/src/neuralClasses.h b/src/neuralClasses.h index ee7c3f0..7c86694 100644 --- a/src/neuralClasses.h +++ b/src/neuralClasses.h @@ -43,36 +43,36 @@ struct Clipper{ class Linear_layer { - private: - Matrix<double,Dynamic,Dynamic> U; - Matrix<double,Dynamic,Dynamic> U_gradient; - Matrix<double,Dynamic,Dynamic> U_velocity; - Matrix<double,Dynamic,Dynamic> U_running_gradient; - Matrix<double,Dynamic,Dynamic> U_running_parameter_update; - // Biases - Matrix<double,Dynamic,1> b; - Matrix<double,Dynamic,1> b_velocity; - Matrix<double,Dynamic,1> b_running_gradient; - Matrix<double,Dynamic,1> b_running_parameter_update; - Matrix<double,Dynamic,1> b_gradient; - - friend class model; - - public: + private: + Matrix<double,Dynamic,Dynamic> U; + Matrix<double,Dynamic,Dynamic> U_gradient; + Matrix<double,Dynamic,Dynamic> U_velocity; + Matrix<double,Dynamic,Dynamic> U_running_gradient; + Matrix<double,Dynamic,Dynamic> U_running_parameter_update; + // Biases + Matrix<double,Dynamic,1> b; + Matrix<double,Dynamic,1> b_velocity; + Matrix<double,Dynamic,1> b_running_gradient; + Matrix<double,Dynamic,1> b_running_parameter_update; + Matrix<double,Dynamic,1> b_gradient; + + friend class model; + + public: Linear_layer() { } - Linear_layer(int rows, int cols) { resize(rows, cols); } + Linear_layer(int rows, int cols) { resize(rows, cols); } void resize(int rows, int cols) { - U.setZero(rows, cols); - U_gradient.setZero(rows, cols); - //U_running_gradient.setZero(rows, cols); - //U_running_parameter_updates.setZero(rows, cols); - //U_velocity.setZero(rows, cols); - b.resize(rows); - b_gradient.setZero(rows); - //b_running_gradient.resize(rows); - //b_velocity.resize(rows); + U.setZero(rows, cols); + U_gradient.setZero(rows, 
cols); + //U_running_gradient.setZero(rows, cols); + //U_running_parameter_updates.setZero(rows, cols); + //U_velocity.setZero(rows, cols); + b.resize(rows); + b_gradient.setZero(rows); + //b_running_gradient.resize(rows); + //b_velocity.resize(rows); } void read_weights(std::ifstream &U_file) { readMatrix(U_file, U); } @@ -83,24 +83,24 @@ class Linear_layer template <typename Engine> void initialize(Engine &engine, - bool init_normal, - double init_range, - string ¶meter_update, - double adagrad_epsilon) - { - if (parameter_update == "ADA") { - U_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(U.rows(),U.cols())*adagrad_epsilon; - b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon; - } - if (parameter_update == "ADAD") { - U_running_gradient.setZero(U.rows(),U.cols()); - b_running_gradient.setZero(b.size()); - U_running_parameter_update.setZero(U.rows(),U.cols()); - b_running_parameter_update.setZero(b.size()); - } + bool init_normal, + double init_range, + string ¶meter_update, + double adagrad_epsilon) + { + if (parameter_update == "ADA") { + U_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(U.rows(),U.cols())*adagrad_epsilon; + b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon; + } + if (parameter_update == "ADAD") { + U_running_gradient.setZero(U.rows(),U.cols()); + b_running_gradient.setZero(b.size()); + U_running_parameter_update.setZero(U.rows(),U.cols()); + b_running_parameter_update.setZero(b.size()); + } - initMatrix(engine, U, init_normal, init_range); - initBias(engine, b, init_normal, init_range); + initMatrix(engine, U, init_normal, init_range); + initBias(engine, b, init_normal, init_range); } int n_inputs () const { return U.cols(); } @@ -108,655 +108,655 @@ class Linear_layer template <typename DerivedIn, typename DerivedOut> void fProp(const MatrixBase<DerivedIn> &input, - const MatrixBase<DerivedOut> &output) const + const MatrixBase<DerivedOut> &output) const { - UNCONST(DerivedOut, output, my_output); - my_output.leftCols(input.cols()).noalias() = U*input; - int num_examples = input.cols(); - for (int example = 0;example < num_examples;example++) - { - my_output.leftCols(input.cols()).col(example) += b; - } + UNCONST(DerivedOut, output, my_output); + my_output.leftCols(input.cols()).noalias() = U*input; + int num_examples = input.cols(); + for (int example = 0;example < num_examples;example++) + { + my_output.leftCols(input.cols()).col(example) += b; + } } // Sparse input template <typename ScalarIn, typename DerivedOut> void fProp(const USCMatrix<ScalarIn> &input, - const MatrixBase<DerivedOut> &output_const) const - { - UNCONST(DerivedOut, output_const, output); - output.setZero(); - uscgemm(1.0, U, input, output.leftCols(input.cols())); - // Each column corresponds to a training example. We - // parallelize the adding of biases per dimension. - int num_examples = input.cols(); - for (int example = 0;example < num_examples;example++) - { - output.leftCols(input.cols()).col(example) += b; - } + const MatrixBase<DerivedOut> &output_const) const + { + UNCONST(DerivedOut, output_const, output); + output.setZero(); + uscgemm(1.0, U, input, output.leftCols(input.cols())); + // Each column corresponds to a training example. We + // parallelize the adding of biases per dimension. 
+ int num_examples = input.cols(); + for (int example = 0;example < num_examples;example++) + { + output.leftCols(input.cols()).col(example) += b; + } } template <typename DerivedGOut, typename DerivedGIn> void bProp(const MatrixBase<DerivedGOut> &input, - MatrixBase<DerivedGIn> &output) const + MatrixBase<DerivedGIn> &output) const { - UNCONST(DerivedGIn, output, my_output); - my_output.noalias() = U.transpose()*input; + UNCONST(DerivedGIn, output, my_output); + my_output.noalias() = U.transpose()*input; } template <typename DerivedGOut, typename DerivedIn> void computeGradient( const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &fProp_input, - double learning_rate, double momentum, double L2_reg) + const MatrixBase<DerivedIn> &fProp_input, + double learning_rate, double momentum, double L2_reg) { - U_gradient.noalias() = bProp_input*fProp_input.transpose(); + U_gradient.noalias() = bProp_input*fProp_input.transpose(); - // get the bias gradient for all dimensions in parallel - int size = b.size(); - b_gradient = bProp_input.rowwise().sum(); - // This used to be multithreaded, but there was no measureable difference - if (L2_reg > 0.0) - { - U_gradient -= 2*L2_reg*U; - b_gradient -= 2*L2_reg*b; - } - if (momentum > 0.0) - { - U_velocity = momentum*U_velocity + U_gradient; - U += learning_rate * U_velocity; - b_velocity = momentum*b_velocity + b_gradient; - b += learning_rate * b_velocity; - } - else - { - U += learning_rate * U_gradient; - b += learning_rate * b_gradient; - /* - //UPDATE CLIPPING - U += (learning_rate*U_gradient).array().unaryExpr(Clipper()).matrix(); - b += (learning_rate*b_gradient).array().unaryExpr(Clipper()).matrix(); - //GRADIENT CLIPPING - //U += learning_rate*(U_gradient.array().unaryExpr(Clipper())).matrix(); - //b += learning_rate*(b_gradient.array().unaryExpr(Clipper())).matrix(); - */ - } + // get the bias gradient for all dimensions in parallel + int size = b.size(); + b_gradient = bProp_input.rowwise().sum(); + // This used to be multithreaded, but there was no measureable difference + if (L2_reg > 0.0) + { + U_gradient -= 2*L2_reg*U; + b_gradient -= 2*L2_reg*b; + } + if (momentum > 0.0) + { + U_velocity = momentum*U_velocity + U_gradient; + U += learning_rate * U_velocity; + b_velocity = momentum*b_velocity + b_gradient; + b += learning_rate * b_velocity; + } + else + { + U += learning_rate * U_gradient; + b += learning_rate * b_gradient; + /* + //UPDATE CLIPPING + U += (learning_rate*U_gradient).array().unaryExpr(Clipper()).matrix(); + b += (learning_rate*b_gradient).array().unaryExpr(Clipper()).matrix(); + //GRADIENT CLIPPING + //U += learning_rate*(U_gradient.array().unaryExpr(Clipper())).matrix(); + //b += learning_rate*(b_gradient.array().unaryExpr(Clipper())).matrix(); + */ + } } template <typename DerivedGOut, typename DerivedIn> void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &fProp_input, - double learning_rate, - double L2_reg) + const MatrixBase<DerivedIn> &fProp_input, + double learning_rate, + double L2_reg) { - U_gradient.noalias() = bProp_input*fProp_input.transpose(); + U_gradient.noalias() = bProp_input*fProp_input.transpose(); - // get the bias gradient for all dimensions in parallel - int size = b.size(); - b_gradient.noalias() = bProp_input.rowwise().sum(); + // get the bias gradient for all dimensions in parallel + int size = b.size(); + b_gradient.noalias() = bProp_input.rowwise().sum(); - if (L2_reg != 0) - { - U_gradient -= 2*L2_reg*U; - b_gradient -= 
2*L2_reg*b; - } + if (L2_reg != 0) + { + U_gradient -= 2*L2_reg*U; + b_gradient -= 2*L2_reg*b; + } - // ignore momentum? - #pragma omp parallel for - for (int col=0; col<U.cols(); col++) { - U_running_gradient.col(col) += U_gradient.col(col).array().square().matrix(); - U.col(col) += learning_rate * (U_gradient.col(col).array() / - U_running_gradient.col(col).array().sqrt()).matrix(); - /* - //UPDATE CLIPPING - U.col(col) += (learning_rate * (U_gradient.col(col).array() / U_running_gradient.col(col).array().sqrt())). - unaryExpr(Clipper()).matrix(); - */ - } - b_running_gradient += b_gradient.array().square().matrix(); - b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix(); + // ignore momentum? +#pragma omp parallel for + for (int col=0; col<U.cols(); col++) { + U_running_gradient.col(col) += U_gradient.col(col).array().square().matrix(); + U.col(col) += learning_rate * (U_gradient.col(col).array() / + U_running_gradient.col(col).array().sqrt()).matrix(); /* //UPDATE CLIPPING - b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); + U.col(col) += (learning_rate * (U_gradient.col(col).array() / U_running_gradient.col(col).array().sqrt())). + unaryExpr(Clipper()).matrix(); */ + } + b_running_gradient += b_gradient.array().square().matrix(); + b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix(); + /* + //UPDATE CLIPPING + b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); + */ } template <typename DerivedGOut, typename DerivedIn> void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &fProp_input, - double learning_rate, - double L2_reg, - double conditioning_constant, - double decay) + const MatrixBase<DerivedIn> &fProp_input, + double learning_rate, + double L2_reg, + double conditioning_constant, + double decay) { - //cerr<<"decay is "<<decay<<" and conditioning constant is "<<conditioning_constant<<endl; - U_gradient.noalias() = bProp_input*fProp_input.transpose(); + //cerr<<"decay is "<<decay<<" and conditioning constant is "<<conditioning_constant<<endl; + U_gradient.noalias() = bProp_input*fProp_input.transpose(); - Array<double,Dynamic,1> b_current_parameter_update; + Array<double,Dynamic,1> b_current_parameter_update; - // get the bias gradient for all dimensions in parallel - int size = b.size(); - b_gradient.noalias() = bProp_input.rowwise().sum(); + // get the bias gradient for all dimensions in parallel + int size = b.size(); + b_gradient.noalias() = bProp_input.rowwise().sum(); - if (L2_reg != 0) - { - U_gradient -= 2*L2_reg*U; - b_gradient -= 2*L2_reg*b; - } + if (L2_reg != 0) + { + U_gradient -= 2*L2_reg*U; + b_gradient -= 2*L2_reg*b; + } - // ignore momentum? 
- #pragma omp parallel for - //cerr<<"U gradient is "<<U_gradient<<endl; - for (int col=0; col<U.cols(); col++) { - Array<double,Dynamic,1> U_current_parameter_update; - U_running_gradient.col(col) = decay*U_running_gradient.col(col) + - (1-decay)*U_gradient.col(col).array().square().matrix(); - //cerr<<"U running gradient is "<<U_running_gradient.col(col)<<endl; - //getchar(); - U_current_parameter_update = ((U_running_parameter_update.col(col).array()+conditioning_constant).sqrt()/ - (U_running_gradient.col(col).array()+conditioning_constant).sqrt()) * - U_gradient.col(col).array(); - //cerr<<"U current parameter update is "<<U_current_parameter_update<<endl; - //getchar(); - //update the running parameter update - U_running_parameter_update.col(col) = decay*U_running_parameter_update.col(col) + - (1.-decay)*U_current_parameter_update.square().matrix(); - U.col(col) += learning_rate*U_current_parameter_update.matrix(); - } - b_running_gradient = decay*b_running_gradient + - (1.-decay)*b_gradient.array().square().matrix(); - b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/ - (b_running_gradient.array()+conditioning_constant).sqrt()) * - b_gradient.array(); - b_running_parameter_update = decay*(b_running_parameter_update) + - (1.-decay)*b_current_parameter_update.square().matrix(); - b += learning_rate*b_current_parameter_update.matrix(); + // ignore momentum? +#pragma omp parallel for + //cerr<<"U gradient is "<<U_gradient<<endl; + for (int col=0; col<U.cols(); col++) { + Array<double,Dynamic,1> U_current_parameter_update; + U_running_gradient.col(col) = decay*U_running_gradient.col(col) + + (1-decay)*U_gradient.col(col).array().square().matrix(); + //cerr<<"U running gradient is "<<U_running_gradient.col(col)<<endl; + //getchar(); + U_current_parameter_update = ((U_running_parameter_update.col(col).array()+conditioning_constant).sqrt()/ + (U_running_gradient.col(col).array()+conditioning_constant).sqrt()) * + U_gradient.col(col).array(); + //cerr<<"U current parameter update is "<<U_current_parameter_update<<endl; + //getchar(); + //update the running parameter update + U_running_parameter_update.col(col) = decay*U_running_parameter_update.col(col) + + (1.-decay)*U_current_parameter_update.square().matrix(); + U.col(col) += learning_rate*U_current_parameter_update.matrix(); + } + b_running_gradient = decay*b_running_gradient + + (1.-decay)*b_gradient.array().square().matrix(); + b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/ + (b_running_gradient.array()+conditioning_constant).sqrt()) * + b_gradient.array(); + b_running_parameter_update = decay*(b_running_parameter_update) + + (1.-decay)*b_current_parameter_update.square().matrix(); + b += learning_rate*b_current_parameter_update.matrix(); } template <typename DerivedGOut, typename DerivedIn, typename DerivedGW> void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &fProp_input, - const MatrixBase<DerivedGW> &gradient) const + const MatrixBase<DerivedIn> &fProp_input, + const MatrixBase<DerivedGW> &gradient) const { - UNCONST(DerivedGW, gradient, my_gradient); - my_gradient.noalias() = bProp_input*fProp_input.transpose(); + UNCONST(DerivedGW, gradient, my_gradient); + my_gradient.noalias() = bProp_input*fProp_input.transpose(); } }; class Output_word_embeddings { - private: - // row-major is better for uscgemm - //Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W; - // Having W be a pointer to a 
matrix allows ease of sharing - // input and output word embeddings - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W; - std::vector<double> W_data; - Matrix<double,Dynamic,1> b; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update; - Matrix<double,Dynamic,1> b_running_gradient; - Matrix<double,Dynamic,1> b_gradient; - Matrix<double,Dynamic,1> b_running_parameter_update; - - public: - Output_word_embeddings() { } - Output_word_embeddings(int rows, int cols) { resize(rows, cols); } - - void resize(int rows, int cols) - { - W->setZero(rows, cols); - b.setZero(rows); - } - void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) { - W = input_W; - } - void read_weights(std::ifstream &W_file) { readMatrix(W_file, *W); } - void write_weights(std::ofstream &W_file) { writeMatrix(*W, W_file); } - void read_biases(std::ifstream &b_file) { readMatrix(b_file, b); } - void write_biases(std::ofstream &b_file) { writeMatrix(b, b_file); } - - template <typename Engine> - void initialize(Engine &engine, - bool init_normal, - double init_range, - double init_bias, - string ¶meter_update, - double adagrad_epsilon) - { + private: + // row-major is better for uscgemm + //Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W; + // Having W be a pointer to a matrix allows ease of sharing + // input and output word embeddings + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W; + std::vector<double> W_data; + Matrix<double,Dynamic,1> b; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update; + Matrix<double,Dynamic,1> b_running_gradient; + Matrix<double,Dynamic,1> b_gradient; + Matrix<double,Dynamic,1> b_running_parameter_update; + + public: + Output_word_embeddings() { } + Output_word_embeddings(int rows, int cols) { resize(rows, cols); } - W_gradient.setZero(W->rows(),W->cols()); - b_gradient.setZero(b.size()); - if (parameter_update == "ADA") { - W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon; - b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon; - //W_gradient.setZero(W->rows(),W->cols()); - //b_gradient.setZero(b.size()); - } - if (parameter_update == "ADAD") { - W_running_gradient.setZero(W->rows(),W->cols()); - b_running_gradient.setZero(b.size()); - W_gradient.setZero(W->rows(),W->cols()); - //b_gradient.setZero(b.size()); - //W_running_parameter_update.setZero(W->rows(),W->cols()); - b_running_parameter_update.setZero(b.size()); - } - - initMatrix(engine, *W, init_normal, init_range); - b.fill(init_bias); - } - - int n_inputs () const { return W->cols(); } - int n_outputs () const { return W->rows(); } - - template <typename DerivedIn, typename DerivedOut> - void fProp(const MatrixBase<DerivedIn> &input, - const MatrixBase<DerivedOut> &output) const - { - UNCONST(DerivedOut, output, my_output); - my_output = ((*W) * input).colwise() + b; + void resize(int rows, int cols) + { + W->setZero(rows, cols); + b.setZero(rows); + } + void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) { + W = input_W; + } + void read_weights(std::ifstream &W_file) { readMatrix(W_file, *W); } + void write_weights(std::ofstream &W_file) { writeMatrix(*W, W_file); } + void read_biases(std::ifstream &b_file) { readMatrix(b_file, b); } + 
void write_biases(std::ofstream &b_file) { writeMatrix(b, b_file); } + + template <typename Engine> + void initialize(Engine &engine, + bool init_normal, + double init_range, + double init_bias, + string ¶meter_update, + double adagrad_epsilon) + { + + W_gradient.setZero(W->rows(),W->cols()); + b_gradient.setZero(b.size()); + if (parameter_update == "ADA") { + W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon; + b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon; + //W_gradient.setZero(W->rows(),W->cols()); + //b_gradient.setZero(b.size()); + } + if (parameter_update == "ADAD") { + W_running_gradient.setZero(W->rows(),W->cols()); + b_running_gradient.setZero(b.size()); + W_gradient.setZero(W->rows(),W->cols()); + //b_gradient.setZero(b.size()); + //W_running_parameter_update.setZero(W->rows(),W->cols()); + b_running_parameter_update.setZero(b.size()); } + initMatrix(engine, *W, init_normal, init_range); + b.fill(init_bias); + } + + int n_inputs () const { return W->cols(); } + int n_outputs () const { return W->rows(); } + + template <typename DerivedIn, typename DerivedOut> + void fProp(const MatrixBase<DerivedIn> &input, + const MatrixBase<DerivedOut> &output) const + { + UNCONST(DerivedOut, output, my_output); + my_output = ((*W) * input).colwise() + b; + } + // Sparse output version - template <typename DerivedIn, typename DerivedOutI, typename DerivedOutV> - void fProp(const MatrixBase<DerivedIn> &input, - const MatrixBase<DerivedOutI> &samples, - const MatrixBase<DerivedOutV> &output) const - { - UNCONST(DerivedOutV, output, my_output); - #pragma omp parallel for - for (int instance_id = 0; instance_id < samples.cols(); instance_id++) - { - for (int sample_id = 0; sample_id < samples.rows(); sample_id++) - { - my_output(sample_id, instance_id) = b(samples(sample_id, instance_id)); - } - } - USCMatrix<double> sparse_output(W->rows(), samples, my_output); - uscgemm_masked(1.0, *W, input, sparse_output); - my_output = sparse_output.values; // too bad, so much copying - } - - // Return single element of output matrix - template <typename DerivedIn> - double fProp(const MatrixBase<DerivedIn> &input, - int word, - int instance) const + template <typename DerivedIn, typename DerivedOutI, typename DerivedOutV> + void fProp(const MatrixBase<DerivedIn> &input, + const MatrixBase<DerivedOutI> &samples, + const MatrixBase<DerivedOutV> &output) const + { + UNCONST(DerivedOutV, output, my_output); +#pragma omp parallel for + for (int instance_id = 0; instance_id < samples.cols(); instance_id++) { - return W->row(word).dot(input.col(instance)) + b(word); + for (int sample_id = 0; sample_id < samples.rows(); sample_id++) + { + my_output(sample_id, instance_id) = b(samples(sample_id, instance_id)); + } } + USCMatrix<double> sparse_output(W->rows(), samples, my_output); + uscgemm_masked(1.0, *W, input, sparse_output); + my_output = sparse_output.values; // too bad, so much copying + } - // Dense versions (for log-likelihood loss) + // Return single element of output matrix + template <typename DerivedIn> + double fProp(const MatrixBase<DerivedIn> &input, + int word, + int instance) const + { + return W->row(word).dot(input.col(instance)) + b(word); + } - template <typename DerivedGOut, typename DerivedGIn> - void bProp(const MatrixBase<DerivedGOut> &input_bProp_matrix, - const MatrixBase<DerivedGIn> &bProp_matrix) const - { - // W is vocab_size x output_embedding_dimension - // input_bProp_matrix is vocab_size x minibatch_size - 
// bProp_matrix is output_embedding_dimension x minibatch_size - UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix); - my_bProp_matrix.leftCols(input_bProp_matrix.cols()).noalias() = + // Dense versions (for log-likelihood loss) + + template <typename DerivedGOut, typename DerivedGIn> + void bProp(const MatrixBase<DerivedGOut> &input_bProp_matrix, + const MatrixBase<DerivedGIn> &bProp_matrix) const + { + // W is vocab_size x output_embedding_dimension + // input_bProp_matrix is vocab_size x minibatch_size + // bProp_matrix is output_embedding_dimension x minibatch_size + UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix); + my_bProp_matrix.leftCols(input_bProp_matrix.cols()).noalias() = W->transpose() * input_bProp_matrix; - } + } - template <typename DerivedIn, typename DerivedGOut> - void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOut> &bProp_input, - double learning_rate, - double momentum) //not sure if we want to use momentum here - { - // W is vocab_size x output_embedding_dimension - // b is vocab_size x 1 - // predicted_embeddings is output_embedding_dimension x minibatch_size - // bProp_input is vocab_size x minibatch_size - W->noalias() += learning_rate * bProp_input * predicted_embeddings.transpose(); - b += learning_rate * bProp_input.rowwise().sum(); - - /* - //GRADIENT CLIPPING - W->noalias() += learning_rate * - ((bProp_input * predicted_embeddings.transpose()).array().unaryExpr(Clipper())).matrix(); - b += learning_rate * (bProp_input.rowwise().sum().array().unaryExpr(Clipper())).matrix(); - //UPDATE CLIPPING - W->noalias() += (learning_rate * - (bProp_input * predicted_embeddings.transpose())).array().unaryExpr(Clipper()).matrix(); - b += (learning_rate * (bProp_input.rowwise().sum())).array().unaryExpr(Clipper()).matrix(); - */ - } - - template <typename DerivedIn, typename DerivedGOut> - void computeGradientAdagrad( - const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOut> &bProp_input, - double learning_rate) //not sure if we want to use momentum here - { - // W is vocab_size x output_embedding_dimension - // b is vocab_size x 1 - // predicted_embeddings is output_embedding_dimension x minibatch_size - // bProp_input is vocab_size x minibatch_sizea - W_gradient.setZero(W->rows(), W->cols()); - b_gradient.setZero(b.size()); - W_gradient.noalias() = bProp_input * predicted_embeddings.transpose(); - b_gradient.noalias() = bProp_input.rowwise().sum(); - W_running_gradient += W_gradient.array().square().matrix(); - b_running_gradient += b_gradient.array().square().matrix(); - W->noalias() += learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt()).matrix(); - b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix(); - /* - //UPDATE CLIPPING - *W += (learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); - b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); - */ - } - - template <typename DerivedIn, typename DerivedGOut> - void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOut> &bProp_input, - double learning_rate, - double conditioning_constant, - double decay) //not sure if we want to use momentum here - { - // W is vocab_size x output_embedding_dimension - // b is vocab_size x 1 - // predicted_embeddings is output_embedding_dimension x minibatch_size - // bProp_input is vocab_size x minibatch_size 
- Array<double,Dynamic,Dynamic> W_current_parameter_update; - Array<double,Dynamic,1> b_current_parameter_update; - W_gradient.setZero(W->rows(), W->cols()); - b_gradient.setZero(b.size()); - W_gradient.noalias() = bProp_input * predicted_embeddings.transpose(); - b_gradient.noalias() = bProp_input.rowwise().sum(); - W_running_gradient = decay*W_running_gradient + - (1.-decay)*W_gradient.array().square().matrix(); - b_running_gradient = decay*b_running_gradient+ - (1.-decay)*b_gradient.array().square().matrix(); - W_current_parameter_update = ((W_running_parameter_update.array()+conditioning_constant).sqrt()/ - (W_running_gradient.array()+conditioning_constant).sqrt())* - W_gradient.array(); - b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/ - (b_running_gradient.array()+conditioning_constant).sqrt())* - b_gradient.array(); - W_running_parameter_update = decay*W_running_parameter_update + - (1.-decay)*W_current_parameter_update.square().matrix(); - b_running_parameter_update = decay*b_running_parameter_update + - (1.-decay)*b_current_parameter_update.square().matrix(); - - *W += learning_rate*W_current_parameter_update.matrix(); - b += learning_rate*b_current_parameter_update.matrix(); - } - - // Sparse versions - - template <typename DerivedGOutI, typename DerivedGOutV, typename DerivedGIn> - void bProp(const MatrixBase<DerivedGOutI> &samples, - const MatrixBase<DerivedGOutV> &weights, - const MatrixBase<DerivedGIn> &bProp_matrix) const - { - UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix); - my_bProp_matrix.setZero(); - uscgemm(1.0, + template <typename DerivedIn, typename DerivedGOut> + void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOut> &bProp_input, + double learning_rate, + double momentum) //not sure if we want to use momentum here + { + // W is vocab_size x output_embedding_dimension + // b is vocab_size x 1 + // predicted_embeddings is output_embedding_dimension x minibatch_size + // bProp_input is vocab_size x minibatch_size + W->noalias() += learning_rate * bProp_input * predicted_embeddings.transpose(); + b += learning_rate * bProp_input.rowwise().sum(); + + /* + //GRADIENT CLIPPING + W->noalias() += learning_rate * + ((bProp_input * predicted_embeddings.transpose()).array().unaryExpr(Clipper())).matrix(); + b += learning_rate * (bProp_input.rowwise().sum().array().unaryExpr(Clipper())).matrix(); + //UPDATE CLIPPING + W->noalias() += (learning_rate * + (bProp_input * predicted_embeddings.transpose())).array().unaryExpr(Clipper()).matrix(); + b += (learning_rate * (bProp_input.rowwise().sum())).array().unaryExpr(Clipper()).matrix(); + */ + } + + template <typename DerivedIn, typename DerivedGOut> + void computeGradientAdagrad( + const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOut> &bProp_input, + double learning_rate) //not sure if we want to use momentum here + { + // W is vocab_size x output_embedding_dimension + // b is vocab_size x 1 + // predicted_embeddings is output_embedding_dimension x minibatch_size + // bProp_input is vocab_size x minibatch_sizea + W_gradient.setZero(W->rows(), W->cols()); + b_gradient.setZero(b.size()); + W_gradient.noalias() = bProp_input * predicted_embeddings.transpose(); + b_gradient.noalias() = bProp_input.rowwise().sum(); + W_running_gradient += W_gradient.array().square().matrix(); + b_running_gradient += b_gradient.array().square().matrix(); + W->noalias() += learning_rate * 
(W_gradient.array()/W_running_gradient.array().sqrt()).matrix(); + b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix(); + /* + //UPDATE CLIPPING + *W += (learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); + b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix(); + */ + } + + template <typename DerivedIn, typename DerivedGOut> + void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOut> &bProp_input, + double learning_rate, + double conditioning_constant, + double decay) //not sure if we want to use momentum here + { + // W is vocab_size x output_embedding_dimension + // b is vocab_size x 1 + // predicted_embeddings is output_embedding_dimension x minibatch_size + // bProp_input is vocab_size x minibatch_size + Array<double,Dynamic,Dynamic> W_current_parameter_update; + Array<double,Dynamic,1> b_current_parameter_update; + W_gradient.setZero(W->rows(), W->cols()); + b_gradient.setZero(b.size()); + W_gradient.noalias() = bProp_input * predicted_embeddings.transpose(); + b_gradient.noalias() = bProp_input.rowwise().sum(); + W_running_gradient = decay*W_running_gradient + + (1.-decay)*W_gradient.array().square().matrix(); + b_running_gradient = decay*b_running_gradient+ + (1.-decay)*b_gradient.array().square().matrix(); + W_current_parameter_update = ((W_running_parameter_update.array()+conditioning_constant).sqrt()/ + (W_running_gradient.array()+conditioning_constant).sqrt())* + W_gradient.array(); + b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/ + (b_running_gradient.array()+conditioning_constant).sqrt())* + b_gradient.array(); + W_running_parameter_update = decay*W_running_parameter_update + + (1.-decay)*W_current_parameter_update.square().matrix(); + b_running_parameter_update = decay*b_running_parameter_update + + (1.-decay)*b_current_parameter_update.square().matrix(); + + *W += learning_rate*W_current_parameter_update.matrix(); + b += learning_rate*b_current_parameter_update.matrix(); + } + + // Sparse versions + + template <typename DerivedGOutI, typename DerivedGOutV, typename DerivedGIn> + void bProp(const MatrixBase<DerivedGOutI> &samples, + const MatrixBase<DerivedGOutV> &weights, + const MatrixBase<DerivedGIn> &bProp_matrix) const + { + UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix); + my_bProp_matrix.setZero(); + uscgemm(1.0, W->transpose(), USCMatrix<double>(W->rows(), samples, weights), my_bProp_matrix.leftCols(samples.cols())); // narrow bProp_matrix for possible short minibatch - } + } template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV> - void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOutI> &samples, - const MatrixBase<DerivedGOutV> &weights, - double learning_rate, double momentum) //not sure if we want to use momentum here + void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOutI> &samples, + const MatrixBase<DerivedGOutV> &weights, + double learning_rate, double momentum) //not sure if we want to use momentum here { - //cerr<<"in gradient"<<endl; - USCMatrix<double> gradient_output(W->rows(), samples, weights); - uscgemm(learning_rate, - gradient_output, - predicted_embeddings.leftCols(gradient_output.cols()).transpose(), - *W); // narrow predicted_embeddings for possible short minibatch - 
uscgemv(learning_rate, - gradient_output, - Matrix<double,Dynamic,1>::Ones(gradient_output.cols()), - b); - /* - //IN ORDER TO IMPLEMENT CLIPPING, WE HAVE TO COMPUTE THE GRADIENT - //FIRST - USCMatrix<double> gradient_output(W->rows(), samples, weights); - uscgemm(1.0, - gradient_output, - predicted_embeddings.leftCols(samples.cols()).transpose(), - W_gradient); - uscgemv(1.0, - gradient_output, - Matrix<double,Dynamic,1>::Ones(weights.cols()), - b_gradient); - - int_map update_map; //stores all the parameters that have been updated - for (int sample_id=0; sample_id<samples.rows(); sample_id++) - for (int train_id=0; train_id<samples.cols(); train_id++) - update_map[samples(sample_id, train_id)] = 1; - - // Convert to std::vector for parallelization - std::vector<int> update_items; - for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) - update_items.push_back(it->first); - int num_items = update_items.size(); - - //#pragma omp parallel for - for (int item_id=0; item_id<num_items; item_id++) - { - int update_item = update_items[item_id]; - //W->row(update_item) += learning_rate * W_gradient.row(update_item); - //b(update_item) += learning_rate * b_gradient(update_item); - //UPDATE CLIPPING - W->row(update_item) += (learning_rate * W_gradient.row(update_item)).array().unaryExpr(Clipper()).matrix(); - double update = learning_rate * b_gradient(update_item); - b(update_item) += std::min(0.5, std::max(update,-0.5)); - //GRADIENT CLIPPING - W_gradient.row(update_item).setZero(); - b_gradient(update_item) = 0.; - } - */ - //cerr<<"Finished gradient"<<endl; + //cerr<<"in gradient"<<endl; + USCMatrix<double> gradient_output(W->rows(), samples, weights); + uscgemm(learning_rate, + gradient_output, + predicted_embeddings.leftCols(gradient_output.cols()).transpose(), + *W); // narrow predicted_embeddings for possible short minibatch + uscgemv(learning_rate, + gradient_output, + Matrix<double,Dynamic,1>::Ones(gradient_output.cols()), + b); + /* + //IN ORDER TO IMPLEMENT CLIPPING, WE HAVE TO COMPUTE THE GRADIENT + //FIRST + USCMatrix<double> gradient_output(W->rows(), samples, weights); + uscgemm(1.0, + gradient_output, + predicted_embeddings.leftCols(samples.cols()).transpose(), + W_gradient); + uscgemv(1.0, + gradient_output, + Matrix<double,Dynamic,1>::Ones(weights.cols()), + b_gradient); + + int_map update_map; //stores all the parameters that have been updated + for (int sample_id=0; sample_id<samples.rows(); sample_id++) + for (int train_id=0; train_id<samples.cols(); train_id++) + update_map[samples(sample_id, train_id)] = 1; + + // Convert to std::vector for parallelization + std::vector<int> update_items; + for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) + update_items.push_back(it->first); + int num_items = update_items.size(); + + //#pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) + { + int update_item = update_items[item_id]; + //W->row(update_item) += learning_rate * W_gradient.row(update_item); + //b(update_item) += learning_rate * b_gradient(update_item); + //UPDATE CLIPPING + W->row(update_item) += (learning_rate * W_gradient.row(update_item)).array().unaryExpr(Clipper()).matrix(); + double update = learning_rate * b_gradient(update_item); + b(update_item) += std::min(0.5, std::max(update,-0.5)); + //GRADIENT CLIPPING + W_gradient.row(update_item).setZero(); + b_gradient(update_item) = 0.; + } + */ + //cerr<<"Finished gradient"<<endl; } template <typename DerivedIn, typename DerivedGOutI, typename 
DerivedGOutV> - void computeGradientAdagrad(const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOutI> &samples, - const MatrixBase<DerivedGOutV> &weights, - double learning_rate) //not sure if we want to use momentum here - { - //W_gradient.setZero(W->rows(), W->cols()); - //b_gradient.setZero(b.size()); - //FOR CLIPPING, WE DO NOT MULTIPLY THE GRADIENT WITH THE LEARNING RATE - USCMatrix<double> gradient_output(W->rows(), samples, weights); - uscgemm(1.0, - gradient_output, - predicted_embeddings.leftCols(samples.cols()).transpose(), - W_gradient); - uscgemv(1.0, - gradient_output, - Matrix<double,Dynamic,1>::Ones(weights.cols()), - b_gradient); - - int_map update_map; //stores all the parameters that have been updated - for (int sample_id=0; sample_id<samples.rows(); sample_id++) - for (int train_id=0; train_id<samples.cols(); train_id++) - update_map[samples(sample_id, train_id)] = 1; - - // Convert to std::vector for parallelization - std::vector<int> update_items; - for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) - update_items.push_back(it->first); - int num_items = update_items.size(); - - //#pragma omp parallel for - for (int item_id=0; item_id<num_items; item_id++) - { - int update_item = update_items[item_id]; - W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix(); - b_running_gradient(update_item) += b_gradient(update_item) * b_gradient(update_item); - W->row(update_item) += learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix(); - b(update_item) += learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item)); - /* - //UPDATE CLIPPING - W->row(update_item) += (learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt())).unaryExpr(Clipper()).matrix(); - double update = learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item)); - b(update_item) += Clipper(update);//std::min(0.5, std::max(update,-0.5)); - */ - W_gradient.row(update_item).setZero(); - b_gradient(update_item) = 0.; - } - } + void computeGradientAdagrad(const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOutI> &samples, + const MatrixBase<DerivedGOutV> &weights, + double learning_rate) //not sure if we want to use momentum here + { + //W_gradient.setZero(W->rows(), W->cols()); + //b_gradient.setZero(b.size()); + //FOR CLIPPING, WE DO NOT MULTIPLY THE GRADIENT WITH THE LEARNING RATE + USCMatrix<double> gradient_output(W->rows(), samples, weights); + uscgemm(1.0, + gradient_output, + predicted_embeddings.leftCols(samples.cols()).transpose(), + W_gradient); + uscgemv(1.0, + gradient_output, + Matrix<double,Dynamic,1>::Ones(weights.cols()), + b_gradient); + + int_map update_map; //stores all the parameters that have been updated + for (int sample_id=0; sample_id<samples.rows(); sample_id++) + for (int train_id=0; train_id<samples.cols(); train_id++) + update_map[samples(sample_id, train_id)] = 1; + + // Convert to std::vector for parallelization + std::vector<int> update_items; + for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) + update_items.push_back(it->first); + int num_items = update_items.size(); + + //#pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) + { + int update_item = update_items[item_id]; + W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix(); + 
b_running_gradient(update_item) += b_gradient(update_item) * b_gradient(update_item); + W->row(update_item) += learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix(); + b(update_item) += learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item)); + /* + //UPDATE CLIPPING + W->row(update_item) += (learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt())).unaryExpr(Clipper()).matrix(); + double update = learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item)); + b(update_item) += Clipper(update);//std::min(0.5, std::max(update,-0.5)); + */ + W_gradient.row(update_item).setZero(); + b_gradient(update_item) = 0.; + } + } template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV> - void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOutI> &samples, - const MatrixBase<DerivedGOutV> &weights, - double learning_rate, - double conditioning_constant, - double decay) //not sure if we want to use momentum here - { - //cerr<<"decay is "<<decay<<" and constant is "<<conditioning_constant<<endl; - //W_gradient.setZero(W->rows(), W->cols()); - //b_gradient.setZero(b.size()); - - USCMatrix<double> gradient_output(W->rows(), samples, weights); - uscgemm(1.0, - gradient_output, - predicted_embeddings.leftCols(samples.cols()).transpose(), - W_gradient); - uscgemv(1.0, - gradient_output, - Matrix<double,Dynamic,1>::Ones(weights.cols()), - b_gradient); - - int_map update_map; //stores all the parameters that have been updated - for (int sample_id=0; sample_id<samples.rows(); sample_id++) - for (int train_id=0; train_id<samples.cols(); train_id++) - update_map[samples(sample_id, train_id)] = 1; - - // Convert to std::vector for parallelization - std::vector<int> update_items; - for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) - update_items.push_back(it->first); - int num_items = update_items.size(); - - #pragma omp parallel for - for (int item_id=0; item_id<num_items; item_id++) - { - Array<double,1,Dynamic> W_current_parameter_update; - double b_current_parameter_update; - - int update_item = update_items[item_id]; - W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+ - (1.-decay)*W_gradient.row(update_item).array().square().matrix(); - b_running_gradient(update_item) = decay*b_running_gradient(update_item)+ - (1.-decay)*b_gradient(update_item)*b_gradient(update_item); - //cerr<<"Output: W gradient is "<<W_gradient.row(update_item)<<endl; - //getchar(); - - //cerr<<"Output: W running gradient is "<<W_running_gradient.row(update_item)<<endl; - //getchar(); - W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/ - (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())* - W_gradient.row(update_item).array(); - b_current_parameter_update = (sqrt(b_running_parameter_update(update_item)+conditioning_constant)/ - sqrt(b_running_gradient(update_item)+conditioning_constant))* - b_gradient(update_item); - //cerr<<"Output: W current parameter update is "<<W_current_parameter_update<<endl; - //getchar(); - //cerr<<"Output: W running parameter update before is "<<W_running_parameter_update.row(update_item)<<endl; - //getchar(); - //cerr<<"the second term is "<<(1.-decay)*W_current_parameter_update.square().matrix()<<endl; - 
W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+ - (1.-decay)*(W_current_parameter_update.square().matrix()); - b_running_parameter_update(update_item) = decay*b_running_parameter_update(update_item)+ - (1.-decay)*b_current_parameter_update*b_current_parameter_update; - //cerr<<"Output: W running parameter update is "<<W_running_parameter_update.row(update_item)<<endl; - //getchar(); - W->row(update_item) += learning_rate*W_current_parameter_update.matrix(); - b(update_item) += learning_rate*b_current_parameter_update; - W_gradient.row(update_item).setZero(); - b_gradient(update_item) = 0.; - } - } + void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOutI> &samples, + const MatrixBase<DerivedGOutV> &weights, + double learning_rate, + double conditioning_constant, + double decay) //not sure if we want to use momentum here + { + //cerr<<"decay is "<<decay<<" and constant is "<<conditioning_constant<<endl; + //W_gradient.setZero(W->rows(), W->cols()); + //b_gradient.setZero(b.size()); + + USCMatrix<double> gradient_output(W->rows(), samples, weights); + uscgemm(1.0, + gradient_output, + predicted_embeddings.leftCols(samples.cols()).transpose(), + W_gradient); + uscgemv(1.0, + gradient_output, + Matrix<double,Dynamic,1>::Ones(weights.cols()), + b_gradient); + + int_map update_map; //stores all the parameters that have been updated + for (int sample_id=0; sample_id<samples.rows(); sample_id++) + for (int train_id=0; train_id<samples.cols(); train_id++) + update_map[samples(sample_id, train_id)] = 1; + + // Convert to std::vector for parallelization + std::vector<int> update_items; + for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) + update_items.push_back(it->first); + int num_items = update_items.size(); + +#pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) + { + Array<double,1,Dynamic> W_current_parameter_update; + double b_current_parameter_update; + + int update_item = update_items[item_id]; + W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+ + (1.-decay)*W_gradient.row(update_item).array().square().matrix(); + b_running_gradient(update_item) = decay*b_running_gradient(update_item)+ + (1.-decay)*b_gradient(update_item)*b_gradient(update_item); + //cerr<<"Output: W gradient is "<<W_gradient.row(update_item)<<endl; + //getchar(); + + //cerr<<"Output: W running gradient is "<<W_running_gradient.row(update_item)<<endl; + //getchar(); + W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/ + (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())* + W_gradient.row(update_item).array(); + b_current_parameter_update = (sqrt(b_running_parameter_update(update_item)+conditioning_constant)/ + sqrt(b_running_gradient(update_item)+conditioning_constant))* + b_gradient(update_item); + //cerr<<"Output: W current parameter update is "<<W_current_parameter_update<<endl; + //getchar(); + //cerr<<"Output: W running parameter update before is "<<W_running_parameter_update.row(update_item)<<endl; + //getchar(); + //cerr<<"the second term is "<<(1.-decay)*W_current_parameter_update.square().matrix()<<endl; + W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+ + (1.-decay)*(W_current_parameter_update.square().matrix()); + b_running_parameter_update(update_item) = 
decay*b_running_parameter_update(update_item)+ + (1.-decay)*b_current_parameter_update*b_current_parameter_update; + //cerr<<"Output: W running parameter update is "<<W_running_parameter_update.row(update_item)<<endl; + //getchar(); + W->row(update_item) += learning_rate*W_current_parameter_update.matrix(); + b(update_item) += learning_rate*b_current_parameter_update; + W_gradient.row(update_item).setZero(); + b_gradient(update_item) = 0.; + } + } template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV, typename DerivedGW, typename DerivedGb> - void computeGradientCheck(const MatrixBase<DerivedIn> &predicted_embeddings, - const MatrixBase<DerivedGOutI> &samples, - const MatrixBase<DerivedGOutV> &weights, - const MatrixBase<DerivedGW> &gradient_W, - const MatrixBase<DerivedGb> &gradient_b) const - { - UNCONST(DerivedGW, gradient_W, my_gradient_W); - UNCONST(DerivedGb, gradient_b, my_gradient_b); - my_gradient_W.setZero(); - my_gradient_b.setZero(); - USCMatrix<double> gradient_output(W->rows(), samples, weights); - uscgemm(1.0, - gradient_output, - predicted_embeddings.leftCols(samples.cols()).transpose(), - my_gradient_W); - uscgemv(1.0, gradient_output, - Matrix<double,Dynamic,1>::Ones(weights.cols()), my_gradient_b); + void computeGradientCheck(const MatrixBase<DerivedIn> &predicted_embeddings, + const MatrixBase<DerivedGOutI> &samples, + const MatrixBase<DerivedGOutV> &weights, + const MatrixBase<DerivedGW> &gradient_W, + const MatrixBase<DerivedGb> &gradient_b) const + { + UNCONST(DerivedGW, gradient_W, my_gradient_W); + UNCONST(DerivedGb, gradient_b, my_gradient_b); + my_gradient_W.setZero(); + my_gradient_b.setZero(); + USCMatrix<double> gradient_output(W->rows(), samples, weights); + uscgemm(1.0, + gradient_output, + predicted_embeddings.leftCols(samples.cols()).transpose(), + my_gradient_W); + uscgemv(1.0, gradient_output, + Matrix<double,Dynamic,1>::Ones(weights.cols()), my_gradient_b); } }; class Input_word_embeddings { - private: - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W; - int context_size, vocab_size; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update; - Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient; + private: + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W; + int context_size, vocab_size; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update; + Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient; friend class model; - public: - Input_word_embeddings() : context_size(0), vocab_size(0) { } - Input_word_embeddings(int rows, int cols, int context) { resize(rows, cols, context); } + public: + Input_word_embeddings() : context_size(0), vocab_size(0) { } + Input_word_embeddings(int rows, int cols, int context) { resize(rows, cols, context); } - void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) { - W = input_W; - } + void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) { + W = input_W; + } - void resize(int rows, int cols, int context) - { - context_size = context; - vocab_size = rows; - W->setZero(rows, cols); - } + void resize(int rows, int cols, int context) + { + context_size = context; + vocab_size = rows; + W->setZero(rows, cols); + } - void read(std::ifstream &W_file) { readMatrix(W_file, *W); } - void write(std::ofstream &W_file) { writeMatrix(*W, W_file); } + void read(std::ifstream &W_file) { 
readMatrix(W_file, *W); } + void write(std::ofstream &W_file) { writeMatrix(*W, W_file); } - template <typename Engine> - void initialize(Engine &engine, - bool init_normal, - double init_range, - string ¶meter_update, - double adagrad_epsilon) - { - W_gradient.setZero(W->rows(),W->cols()); - - if (parameter_update == "ADA") { - W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon; - //W_gradient.setZero(W->rows(),W->cols()); - } - if (parameter_update == "ADAD") { - W_running_gradient.setZero(W->rows(),W->cols()); - //W_gradient.setZero(W->rows(),W->cols()); - W_running_parameter_update.setZero(W->rows(),W->cols()); - } - initMatrix(engine, - *W, - init_normal, - init_range); - } + template <typename Engine> + void initialize(Engine &engine, + bool init_normal, + double init_range, + string ¶meter_update, + double adagrad_epsilon) + { + W_gradient.setZero(W->rows(),W->cols()); + + if (parameter_update == "ADA") { + W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon; + //W_gradient.setZero(W->rows(),W->cols()); + } + if (parameter_update == "ADAD") { + W_running_gradient.setZero(W->rows(),W->cols()); + //W_gradient.setZero(W->rows(),W->cols()); + W_running_parameter_update.setZero(W->rows(),W->cols()); + } + initMatrix(engine, + *W, + init_normal, + init_range); + } int n_inputs() const { return -1; } int n_outputs() const { return W->cols() * context_size; } @@ -765,40 +765,40 @@ class Input_word_embeddings template <typename Dist> void average(const Dist &dist, int output_id) { - W->row(output_id).setZero(); - for (int i=0; i < W->rows(); i++) - if (i != output_id) + W->row(output_id).setZero(); + for (int i=0; i < W->rows(); i++) + if (i != output_id) W->row(output_id) += dist.prob(i) * W->row(i); } template <typename DerivedIn, typename DerivedOut> - void fProp(const MatrixBase<DerivedIn> &input, - const MatrixBase<DerivedOut> &output) const - { - int embedding_dimension = W->cols(); + void fProp(const MatrixBase<DerivedIn> &input, + const MatrixBase<DerivedOut> &output) const + { + int embedding_dimension = W->cols(); - // W is vocab_size x embedding_dimension - // input is ngram_size*vocab_size x minibatch_size - // output is ngram_size*embedding_dimension x minibatch_size + // W is vocab_size x embedding_dimension + // input is ngram_size*vocab_size x minibatch_size + // output is ngram_size*embedding_dimension x minibatch_size - /* - // Dense version: - for (int ngram=0; ngram<context_size; ngram++) - output.middleRows(ngram*embedding_dimension, embedding_dimension) = W.transpose() * input.middleRows(ngram*vocab_size, vocab_size); - */ + /* + // Dense version: + for (int ngram=0; ngram<context_size; ngram++) + output.middleRows(ngram*embedding_dimension, embedding_dimension) = W.transpose() * input.middleRows(ngram*vocab_size, vocab_size); + */ - UNCONST(DerivedOut, output, my_output); - my_output.setZero(); - for (int ngram=0; ngram<context_size; ngram++) - { - // input might be narrower than expected due to a short minibatch, - // so narrow output to match - uscgemm(1.0, - W->transpose(), - USCMatrix<double>(W->rows(),input.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input.cols())), - my_output.block(ngram*embedding_dimension, 0, embedding_dimension, input.cols())); - } - } + UNCONST(DerivedOut, output, my_output); + my_output.setZero(); + for (int ngram=0; ngram<context_size; ngram++) + { + // input might be narrower than expected due to a short minibatch, + // so narrow output to match 
+ uscgemm(1.0, + W->transpose(), + USCMatrix<double>(W->rows(),input.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input.cols())), + my_output.block(ngram*embedding_dimension, 0, embedding_dimension, input.cols())); + } + } // When model is premultiplied, this layer doesn't get used, // but this method is used to get the input into a sparse matrix. @@ -814,206 +814,206 @@ class Input_word_embeddings template <typename DerivedGOut, typename DerivedIn> void computeGradient(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &input_words, - double learning_rate, double momentum, double L2_reg) + const MatrixBase<DerivedIn> &input_words, + double learning_rate, double momentum, double L2_reg) { - int embedding_dimension = W->cols(); + int embedding_dimension = W->cols(); - // W is vocab_size x embedding_dimension - // input is ngram_size*vocab_size x minibatch_size - // bProp_input is ngram_size*embedding_dimension x minibatch_size + // W is vocab_size x embedding_dimension + // input is ngram_size*vocab_size x minibatch_size + // bProp_input is ngram_size*embedding_dimension x minibatch_size - /* - // Dense version: - for (int ngram=0; ngram<context_size; ngram++) - W += learning_rate * input_words.middleRows(ngram*vocab_size, vocab_size) * bProp_input.middleRows(ngram*embedding_dimension, embedding_dimension).transpose() - */ + /* + // Dense version: + for (int ngram=0; ngram<context_size; ngram++) + W += learning_rate * input_words.middleRows(ngram*vocab_size, vocab_size) * bProp_input.middleRows(ngram*embedding_dimension, embedding_dimension).transpose() + */ - for (int ngram=0; ngram<context_size; ngram++) - { - uscgemm(learning_rate, - USCMatrix<double>(W->rows(), input_words.middleRows(ngram, 1), Matrix<double,1,Dynamic>::Ones(input_words.cols())), - bProp_input.block(ngram*embedding_dimension,0,embedding_dimension,input_words.cols()).transpose(), - *W); - } + for (int ngram=0; ngram<context_size; ngram++) + { + uscgemm(learning_rate, + USCMatrix<double>(W->rows(), input_words.middleRows(ngram, 1), Matrix<double,1,Dynamic>::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension,0,embedding_dimension,input_words.cols()).transpose(), + *W); + } - /* - //IF WE WANT TO DO GRADIENT CLIPPING, THEN WE FIRST COMPUTE THE GRADIENT AND THEN - //PERFORM CLIPPING WHILE UPDATING + /* + //IF WE WANT TO DO GRADIENT CLIPPING, THEN WE FIRST COMPUTE THE GRADIENT AND THEN + //PERFORM CLIPPING WHILE UPDATING - for (int ngram=0; ngram<context_size; ngram++) - { - uscgemm(1.0, - USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), - bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), - W_gradient); - } - int_map update_map; //stores all the parameters that have been updated - for (int ngram=0; ngram<context_size; ngram++) - { - for (int train_id=0; train_id<input_words.cols(); train_id++) - { - update_map[input_words(ngram,train_id)] = 1; - } - } + for (int ngram=0; ngram<context_size; ngram++) + { + uscgemm(1.0, + USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), + W_gradient); + } + int_map update_map; //stores all the parameters that have been updated + for (int ngram=0; ngram<context_size; ngram++) + { + for (int train_id=0; train_id<input_words.cols(); train_id++) + { + 
update_map[input_words(ngram,train_id)] = 1; + } + } - // Convert to std::vector for parallelization - std::vector<int> update_items; - for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) - { - update_items.push_back(it->first); - } - int num_items = update_items.size(); - - #pragma omp parallel for - for (int item_id=0; item_id<num_items; item_id++) - { - int update_item = update_items[item_id]; - //UPDATE CLIPPING - W->row(update_item) += (learning_rate* - W_gradient.row(update_item).array().unaryExpr(Clipper())).matrix(); - //GRADIENT CLIPPING - //W->row(update_item) += learning_rate* - // W_gradient.row(update_item).array().unaryExpr(Clipper()).matrix(); - //SETTING THE GRADIENT TO ZERO - W_gradient.row(update_item).setZero(); - } - */ - } + // Convert to std::vector for parallelization + std::vector<int> update_items; + for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) + { + update_items.push_back(it->first); + } + int num_items = update_items.size(); - template <typename DerivedGOut, typename DerivedIn> - void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &input_words, - double learning_rate, - double L2_reg) + #pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) { - int embedding_dimension = W->cols(); - //W_gradient.setZero(W->rows(), W->cols()); - /* + int update_item = update_items[item_id]; + //UPDATE CLIPPING + W->row(update_item) += (learning_rate* + W_gradient.row(update_item).array().unaryExpr(Clipper())).matrix(); + //GRADIENT CLIPPING + //W->row(update_item) += learning_rate* + // W_gradient.row(update_item).array().unaryExpr(Clipper()).matrix(); + //SETTING THE GRADIENT TO ZERO + W_gradient.row(update_item).setZero(); + } + */ + } + + template <typename DerivedGOut, typename DerivedIn> + void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &input_words, + double learning_rate, + double L2_reg) + { + int embedding_dimension = W->cols(); + //W_gradient.setZero(W->rows(), W->cols()); + /* if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols()) - W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon; - */ - for (int ngram=0; ngram<context_size; ngram++) - { - uscgemm(1.0, - USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), - bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), - W_gradient); - } - int_map update_map; //stores all the parameters that have been updated - for (int ngram=0; ngram<context_size; ngram++) + W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon; + */ + for (int ngram=0; ngram<context_size; ngram++) + { + uscgemm(1.0, + USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), + W_gradient); + } + int_map update_map; //stores all the parameters that have been updated + for (int ngram=0; ngram<context_size; ngram++) + { + for (int train_id=0; train_id<input_words.cols(); train_id++) { - for (int train_id=0; train_id<input_words.cols(); train_id++) - { - update_map[input_words(ngram,train_id)] = 1; - } + update_map[input_words(ngram,train_id)] = 1; } + } - // Convert to std::vector for parallelization - std::vector<int> update_items; - for (int_map::iterator it 
= update_map.begin(); it != update_map.end(); ++it) - { - update_items.push_back(it->first); - } - int num_items = update_items.size(); - - #pragma omp parallel for - for (int item_id=0; item_id<num_items; item_id++) - { - int update_item = update_items[item_id]; - W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix(); - W->row(update_item) += learning_rate * - (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix(); - /* - //UPDATE CLIPPING - W->row(update_item) += (learning_rate * - (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt())) - .unaryExpr(Clipper()).matrix(); - */ - W_gradient.row(update_item).setZero(); - } - } - - template <typename DerivedGOut, typename DerivedIn> - void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &input_words, - double learning_rate, - double L2_reg, - double conditioning_constant, - double decay) + // Convert to std::vector for parallelization + std::vector<int> update_items; + for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) { - int embedding_dimension = W->cols(); + update_items.push_back(it->first); + } + int num_items = update_items.size(); - //W_gradient.setZero(W->rows(), W->cols()); +#pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) + { + int update_item = update_items[item_id]; + W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix(); + W->row(update_item) += learning_rate * + (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix(); /* - if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols()) - W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon; + //UPDATE CLIPPING + W->row(update_item) += (learning_rate * + (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt())) + .unaryExpr(Clipper()).matrix(); */ - for (int ngram=0; ngram<context_size; ngram++) - { - uscgemm(1.0, - USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), - bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), - W_gradient); - } - int_map update_map; //stores all the parameters that have been updated - for (int ngram=0; ngram<context_size; ngram++) + W_gradient.row(update_item).setZero(); + } + } + + template <typename DerivedGOut, typename DerivedIn> + void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input, + const MatrixBase<DerivedIn> &input_words, + double learning_rate, + double L2_reg, + double conditioning_constant, + double decay) + { + int embedding_dimension = W->cols(); + + //W_gradient.setZero(W->rows(), W->cols()); + /* + if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols()) + W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon; + */ + for (int ngram=0; ngram<context_size; ngram++) + { + uscgemm(1.0, + USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), + bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), + W_gradient); + } + int_map update_map; //stores all the parameters that have been updated + for (int ngram=0; ngram<context_size; ngram++) + { + for (int train_id=0; train_id<input_words.cols(); 
train_id++) { - for (int train_id=0; train_id<input_words.cols(); train_id++) - { - update_map[input_words(ngram,train_id)] = 1; - } + update_map[input_words(ngram,train_id)] = 1; } + } - // Convert to std::vector for parallelization - std::vector<int> update_items; - for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) - { - update_items.push_back(it->first); - } - int num_items = update_items.size(); - - #pragma omp parallel for - for (int item_id=0; item_id<num_items; item_id++) - { - - Array<double,1,Dynamic> W_current_parameter_update; - int update_item = update_items[item_id]; - W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+ - (1.-decay)*W_gradient.row(update_item).array().square().matrix(); - - W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/ - (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())* - W_gradient.row(update_item).array(); - - //cerr<<"Input: W current parameter update is "<<W_current_parameter_update<<endl; - //getchar(); - W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+ - (1.-decay)*W_current_parameter_update.square().matrix(); - - W->row(update_item) += learning_rate*W_current_parameter_update.matrix(); - //cerr<<"Input: After update, W is "<<W->row(update_item)<<endl; - //getchar(); - W_gradient.row(update_item).setZero(); - } - - } - - template <typename DerivedGOut, typename DerivedIn, typename DerivedGW> - void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input, - const MatrixBase<DerivedIn> &input_words, - int x, int minibatch_size, - const MatrixBase<DerivedGW> &gradient) const //not sure if we want to use momentum here + // Convert to std::vector for parallelization + std::vector<int> update_items; + for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it) { - UNCONST(DerivedGW, gradient, my_gradient); - int embedding_dimension = W->cols(); - my_gradient.setZero(); - for (int ngram=0; ngram<context_size; ngram++) - uscgemm(1.0, - USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())), - bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(), - my_gradient); + update_items.push_back(it->first); } + int num_items = update_items.size(); + +#pragma omp parallel for + for (int item_id=0; item_id<num_items; item_id++) + { + + Array<double,1,Dynamic> W_current_parameter_update; + int update_item = update_items[item_id]; + W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+ + (1.-decay)*W_gradient.row(update_item).array().square().matrix(); + + W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/ + (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())* + W_gradient.row(update_item).array(); + + //cerr<<"Input: W current parameter update is "<<W_current_parameter_update<<endl; + //getchar(); + W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+ + (1.-decay)*W_current_parameter_update.square().matrix(); + + W->row(update_item) += learning_rate*W_current_parameter_update.matrix(); + //cerr<<"Input: After update, W is "<<W->row(update_item)<<endl; + //getchar(); + W_gradient.row(update_item).setZero(); + } + + } + + template <typename DerivedGOut, typename DerivedIn, typename DerivedGW> + 
void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input,
+                              const MatrixBase<DerivedIn> &input_words,
+                              int x, int minibatch_size,
+                              const MatrixBase<DerivedGW> &gradient) const //not sure if we want to use momentum here
+    {
+        UNCONST(DerivedGW, gradient, my_gradient);
+        int embedding_dimension = W->cols();
+        my_gradient.setZero();
+        for (int ngram=0; ngram<context_size; ngram++)
+            uscgemm(1.0,
+                    USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
+                    bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
+                    my_gradient);
+    }
 };

 } // namespace nplm
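Note on the hunks above: this commit only re-indents the AdaGrad and Adadelta update code (tabs replaced by spaces); the arithmetic is unchanged. As a reading aid, the following is a minimal standalone sketch of the per-row updates that computeGradientAdagrad and computeGradientAdadelta apply to each touched row, written with plain Eigen arrays. The function names (adagrad_row, adadelta_row), the free-standing row arguments, and the name eps are illustrative only and are not part of neuralClasses.h.

    // Sketch only: per-row AdaGrad and Adadelta steps as performed inside the
    // sparse update loops above. (The real code keeps these rows inside W,
    // W_running_gradient, W_running_parameter_update, and W_gradient.)
    #include <Eigen/Dense>

    using Eigen::ArrayXd;

    // AdaGrad: accumulate squared gradients; divide the step by their square root.
    // Note the "+=": nplm follows the gradient uphill (the log-likelihood is maximized).
    void adagrad_row(ArrayXd &w, ArrayXd &running_grad,
                     const ArrayXd &grad, double learning_rate)
    {
        running_grad += grad.square();                    // G <- G + g.^2
        w += learning_rate * grad / running_grad.sqrt();  // w <- w + lr * g ./ sqrt(G)
    }

    // Adadelta: decayed running averages of squared gradients and of squared updates;
    // the ratio of their square roots rescales the raw gradient
    // (eps plays the role of conditioning_constant in the methods above).
    void adadelta_row(ArrayXd &w, ArrayXd &running_grad, ArrayXd &running_update,
                      const ArrayXd &grad, double learning_rate,
                      double eps, double decay)
    {
        running_grad   = decay * running_grad   + (1.0 - decay) * grad.square();
        ArrayXd step   = ((running_update + eps).sqrt()
                          / (running_grad + eps).sqrt()) * grad;
        running_update = decay * running_update + (1.0 - decay) * step.square();
        w += learning_rate * step;                        // nplm also scales the Adadelta step by lr
    }

After either step, the class zeroes W_gradient.row(update_item) and b_gradient(update_item) so the same gradient buffers can be reused for the next minibatch.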