Diffstat (limited to 'src/neuralClasses.h')
-rw-r--r-- | src/neuralClasses.h | 766 |
1 file changed, 633 insertions, 133 deletions
diff --git a/src/neuralClasses.h b/src/neuralClasses.h
index 1b57763..949e445 100644
--- a/src/neuralClasses.h
+++ b/src/neuralClasses.h
@@ -7,7 +7,7 @@
 #include <vector>
 #include <boost/unordered_map.hpp>
 
-#include "../3rdparty/Eigen/Dense"
+#include <Eigen/Dense>
 
 #include "maybe_omp.h"
 #include "util.h"
@@ -21,16 +21,26 @@
 //#define EIGEN_DONT_PARALLELIZE
 //#define EIGEN_DEFAULT_TO_ROW_MAJOR
 
+using namespace std;
 namespace nplm
 {
 
 // is this cheating?
 using Eigen::Matrix;
+using Eigen::Array;
 using Eigen::MatrixBase;
 using Eigen::Dynamic;
 
 typedef boost::unordered_map<int,bool> int_map;
 
+struct Clipper{
+  double operator() (double x) const {
+    return std::min(0.5, std::max(x,-0.5));
+    //return(x);
+  }
+};
+
+
 class Linear_layer
 {
     private:
@@ -38,6 +48,13 @@ class Linear_layer
         Matrix<double,Dynamic,Dynamic> U_gradient;
         Matrix<double,Dynamic,Dynamic> U_velocity;
         Matrix<double,Dynamic,Dynamic> U_running_gradient;
+        Matrix<double,Dynamic,Dynamic> U_running_parameter_update;
+        // Biases
+        Matrix<double,Dynamic,1> b;
+        Matrix<double,Dynamic,1> b_velocity;
+        Matrix<double,Dynamic,1> b_running_gradient;
+        Matrix<double,Dynamic,1> b_running_parameter_update;
+        Matrix<double,Dynamic,1> b_gradient;
 
     friend class model;
 
@@ -49,94 +66,222 @@ class Linear_layer
     {
         U.setZero(rows, cols);
        U_gradient.setZero(rows, cols);
-        U_running_gradient.setZero(rows, cols);
-        U_velocity.setZero(rows, cols);
+        //U_running_gradient.setZero(rows, cols);
+        //U_running_parameter_updates.setZero(rows, cols);
+        //U_velocity.setZero(rows, cols);
+        b.resize(rows);
+        b_gradient.setZero(rows);
+        //b_running_gradient.resize(rows);
+        //b_velocity.resize(rows);
     }
 
-    void read(std::ifstream &U_file) { readMatrix(U_file, U); }
-    void write(std::ofstream &U_file) { writeMatrix(U, U_file); }
+    void read_weights(std::ifstream &U_file) { readMatrix(U_file, U); }
+    void write_weights(std::ofstream &U_file) { writeMatrix(U, U_file); }
+    void read_biases(std::ifstream &b_file) { readMatrix(b_file, b); }
+    void write_biases(std::ofstream &b_file) { writeMatrix(b, b_file); }
 
     template <typename Engine>
-    void initialize(Engine &engine, bool init_normal, double init_range)
+    void initialize(Engine &engine,
+                    bool init_normal,
+                    double init_range,
+                    string &parameter_update,
+                    double adagrad_epsilon)
     {
+        if (parameter_update == "ADA") {
+            U_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(U.rows(),U.cols())*adagrad_epsilon;
+            b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon;
+        }
+        if (parameter_update == "ADAD") {
+            U_running_gradient.setZero(U.rows(),U.cols());
+            b_running_gradient.setZero(b.size());
+            U_running_parameter_update.setZero(U.rows(),U.cols());
+            b_running_parameter_update.setZero(b.size());
+        }
+
         initMatrix(engine, U, init_normal, init_range);
+        initBias(engine, b, init_normal, init_range);
     }
 
     int n_inputs () const { return U.cols(); }
     int n_outputs () const { return U.rows(); }
 
-    template <typename DerivedIn, typename DerivedOut>
-    void fProp(const MatrixBase<DerivedIn> &input, const MatrixBase<DerivedOut> &output) const
-    {
-        UNCONST(DerivedOut, output, my_output);
-        my_output.leftCols(input.cols()).noalias() = U*input;
-    }
+    template <typename DerivedIn, typename DerivedOut>
+    void fProp(const MatrixBase<DerivedIn> &input,
+               const MatrixBase<DerivedOut> &output) const
+    {
+        UNCONST(DerivedOut, output, my_output);
+        my_output.leftCols(input.cols()).noalias() = U*input;
+        int num_examples = input.cols();
+        for (int example = 0;example < num_examples;example++)
+        {
+            my_output.leftCols(input.cols()).col(example) += b;
+        }
+    }
 
     // Sparse input
     template <typename ScalarIn, typename DerivedOut>
-    void fProp(const USCMatrix<ScalarIn> &input, const MatrixBase<DerivedOut> &output_const) const
+    void fProp(const USCMatrix<ScalarIn> &input,
+               const MatrixBase<DerivedOut> &output_const) const
     {
         UNCONST(DerivedOut, output_const, output);
         output.setZero();
         uscgemm(1.0, U, input, output.leftCols(input.cols()));
+        // Each column corresponds to a training example. We
+        // parallelize the adding of biases per dimension.
+        int num_examples = input.cols();
+        for (int example = 0;example < num_examples;example++)
+        {
+            output.leftCols(input.cols()).col(example) += b;
+        }
     }
 
-    template <typename DerivedGOut, typename DerivedGIn>
-    void bProp(const MatrixBase<DerivedGOut> &input, MatrixBase<DerivedGIn> &output) const
-    {
+    template <typename DerivedGOut, typename DerivedGIn>
+    void bProp(const MatrixBase<DerivedGOut> &input,
+               MatrixBase<DerivedGIn> &output) const
+    {
         UNCONST(DerivedGIn, output, my_output);
         my_output.noalias() = U.transpose()*input;
     }
 
-    template <typename DerivedGOut, typename DerivedIn>
-    void computeGradient(const MatrixBase<DerivedGOut> &bProp_input,
-                         const MatrixBase<DerivedIn> &fProp_input,
-                         double learning_rate, double momentum, double L2_reg)
+    template <typename DerivedGOut, typename DerivedIn>
+    void computeGradient( const MatrixBase<DerivedGOut> &bProp_input,
+                          const MatrixBase<DerivedIn> &fProp_input,
+                          double learning_rate, double momentum, double L2_reg)
+    {
+        U_gradient.noalias() = bProp_input*fProp_input.transpose();
+
+        // get the bias gradient for all dimensions in parallel
+        int size = b.size();
+        b_gradient = bProp_input.rowwise().sum();
+        // This used to be multithreaded, but there was no measureable difference
+        if (L2_reg > 0.0)
         {
-            U_gradient.noalias() = bProp_input*fProp_input.transpose();
-
-            // This used to be multithreaded, but there was no measureable difference
-            if (L2_reg > 0.0)
-            {
-                U_gradient *= 1 - 2*L2_reg;
-            }
-            if (momentum > 0.0)
-            {
-                U_velocity = momentum*U_velocity + U_gradient;
-                U += learning_rate * U_velocity;
-            }
-            else
-            {
-                U += learning_rate * U_gradient;
-            }
+            U_gradient -= 2*L2_reg*U;
+            b_gradient -= 2*L2_reg*b;
+        }
+        if (momentum > 0.0)
+        {
+            U_velocity = momentum*U_velocity + U_gradient;
+            U += learning_rate * U_velocity;
+            b_velocity = momentum*b_velocity + b_gradient;
+            b += learning_rate * b_velocity;
+        }
+        else
+        {
+            U += learning_rate * U_gradient;
+            b += learning_rate * b_gradient;
+            /*
+            //UPDATE CLIPPING
+            U += (learning_rate*U_gradient).array().unaryExpr(Clipper()).matrix();
+            b += (learning_rate*b_gradient).array().unaryExpr(Clipper()).matrix();
+            //GRADIENT CLIPPING
+            //U += learning_rate*(U_gradient.array().unaryExpr(Clipper())).matrix();
+            //b += learning_rate*(b_gradient.array().unaryExpr(Clipper())).matrix();
+            */
+        }
     }
 
-    template <typename DerivedGOut, typename DerivedIn>
-    void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input,
-                                const MatrixBase<DerivedIn> &fProp_input,
-                                double learning_rate, double momentum, double L2_reg)
-    {
-        U_gradient.noalias() = bProp_input*fProp_input.transpose();
+    template <typename DerivedGOut, typename DerivedIn>
+    void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input,
+                                const MatrixBase<DerivedIn> &fProp_input,
+                                double learning_rate,
+                                double L2_reg)
+    {
+        U_gradient.noalias() = bProp_input*fProp_input.transpose();
 
-        if (L2_reg != 0)
-        {
-            U_gradient *= 1 - 2*L2_reg;
-        }
+
+        // get the bias gradient for all dimensions in parallel
+        int size = b.size();
+        b_gradient.noalias() = bProp_input.rowwise().sum();
 
-        // ignore momentum?
+        if (L2_reg != 0)
+        {
+            U_gradient -= 2*L2_reg*U;
+            b_gradient -= 2*L2_reg*b;
+        }
+
+        // ignore momentum?
+        #pragma omp parallel for
+        for (int col=0; col<U.cols(); col++) {
+            U_running_gradient.col(col) += U_gradient.col(col).array().square().matrix();
+            U.col(col) += learning_rate * (U_gradient.col(col).array() /
+                U_running_gradient.col(col).array().sqrt()).matrix();
+            /*
+            //UPDATE CLIPPING
+            U.col(col) += (learning_rate * (U_gradient.col(col).array() / U_running_gradient.col(col).array().sqrt())).
+                unaryExpr(Clipper()).matrix();
+            */
+        }
+        b_running_gradient += b_gradient.array().square().matrix();
+        b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix();
+        /*
+        //UPDATE CLIPPING
+        b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix();
+        */
+    }
 
-        U_running_gradient.array() += U_gradient.array().square();
-        U.array() += learning_rate * U_gradient.array() / U_running_gradient.array().sqrt();
-    }
+    template <typename DerivedGOut, typename DerivedIn>
+    void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input,
+                                 const MatrixBase<DerivedIn> &fProp_input,
+                                 double learning_rate,
+                                 double L2_reg,
+                                 double conditioning_constant,
+                                 double decay)
+    {
+        //cerr<<"decay is "<<decay<<" and conditioning constant is "<<conditioning_constant<<endl;
+        U_gradient.noalias() = bProp_input*fProp_input.transpose();
 
-    template <typename DerivedGOut, typename DerivedIn, typename DerivedGW>
-    void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input,
-                              const MatrixBase<DerivedIn> &fProp_input,
-                              const MatrixBase<DerivedGW> &gradient) const
-    {
-        UNCONST(DerivedGW, gradient, my_gradient);
-        my_gradient.noalias() = bProp_input*fProp_input.transpose();
-    }
+        Array<double,Dynamic,1> b_current_parameter_update;
+
+        // get the bias gradient for all dimensions in parallel
+        int size = b.size();
+        b_gradient.noalias() = bProp_input.rowwise().sum();
+
+        if (L2_reg != 0)
+        {
+            U_gradient -= 2*L2_reg*U;
+            b_gradient -= 2*L2_reg*b;
+        }
+
+        // ignore momentum?
+        #pragma omp parallel for
+        //cerr<<"U gradient is "<<U_gradient<<endl;
+        for (int col=0; col<U.cols(); col++) {
+            Array<double,Dynamic,1> U_current_parameter_update;
+            U_running_gradient.col(col) = decay*U_running_gradient.col(col) +
+                (1-decay)*U_gradient.col(col).array().square().matrix();
+            //cerr<<"U running gradient is "<<U_running_gradient.col(col)<<endl;
+            //getchar();
+            U_current_parameter_update = ((U_running_parameter_update.col(col).array()+conditioning_constant).sqrt()/
+                (U_running_gradient.col(col).array()+conditioning_constant).sqrt()) *
+                U_gradient.col(col).array();
+            //cerr<<"U current parameter update is "<<U_current_parameter_update<<endl;
+            //getchar();
+            //update the running parameter update
+            U_running_parameter_update.col(col) = decay*U_running_parameter_update.col(col) +
+                (1.-decay)*U_current_parameter_update.square().matrix();
+            U.col(col) += learning_rate*U_current_parameter_update.matrix();
+        }
+        b_running_gradient = decay*b_running_gradient +
+            (1.-decay)*b_gradient.array().square().matrix();
+        b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/
+            (b_running_gradient.array()+conditioning_constant).sqrt()) *
+            b_gradient.array();
+        b_running_parameter_update = decay*(b_running_parameter_update) +
+            (1.-decay)*b_current_parameter_update.square().matrix();
+        b += learning_rate*b_current_parameter_update.matrix();
+    }
+
+
+    template <typename DerivedGOut, typename DerivedIn, typename DerivedGW>
+    void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input,
+                              const MatrixBase<DerivedIn> &fProp_input,
+                              const MatrixBase<DerivedGW> &gradient) const
+    {
+        UNCONST(DerivedGW, gradient, my_gradient);
+        my_gradient.noalias() = bProp_input*fProp_input.transpose();
+    }
 };
 
 class Output_word_embeddings
@@ -149,10 +294,12 @@
         Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W;
         std::vector<double> W_data;
         Matrix<double,Dynamic,1> b;
-        Matrix<double,Dynamic,Dynamic> W_running_gradient;
-        Matrix<double,Dynamic,Dynamic> W_gradient;
+        Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient;
+        Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient;
+        Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update;
        Matrix<double,Dynamic,1> b_running_gradient;
        Matrix<double,Dynamic,1> b_gradient;
+        Matrix<double,Dynamic,1> b_running_parameter_update;
 
     public:
         Output_word_embeddings() { }
@@ -160,8 +307,8 @@
 
     void resize(int rows, int cols)
     {
-        W->setZero(rows, cols);
-        b.setZero(rows);
+      W->setZero(rows, cols);
+      b.setZero(rows);
     }
     void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) {
         W = input_W;
@@ -172,8 +319,31 @@
     void write_biases(std::ofstream &b_file) { writeMatrix(b, b_file); }
 
     template <typename Engine>
-    void initialize(Engine &engine, bool init_normal, double init_range, double init_bias)
+    void initialize(Engine &engine,
+                    bool init_normal,
+                    double init_range,
+                    double init_bias,
+                    string &parameter_update,
+                    double adagrad_epsilon)
     {
+
+        W_gradient.setZero(W->rows(),W->cols());
+        b_gradient.setZero(b.size());
+        if (parameter_update == "ADA") {
+            W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon;
+            b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon;
+            //W_gradient.setZero(W->rows(),W->cols());
+            //b_gradient.setZero(b.size());
+        }
+        if (parameter_update == "ADAD") {
+            W_running_gradient.setZero(W->rows(),W->cols());
+            b_running_gradient.setZero(b.size());
+            W_gradient.setZero(W->rows(),W->cols());
+            //b_gradient.setZero(b.size());
+            //W_running_parameter_update.setZero(W->rows(),W->cols());
+            b_running_parameter_update.setZero(b.size());
+        }
+
         initMatrix(engine, *W, init_normal, init_range);
         b.fill(init_bias);
     }
@@ -198,8 +368,12 @@
         UNCONST(DerivedOutV, output, my_output);
         #pragma omp parallel for
         for (int instance_id = 0; instance_id < samples.cols(); instance_id++)
-            for (int sample_id = 0; sample_id < samples.rows(); sample_id++)
-                my_output(sample_id, instance_id) = b(samples(sample_id, instance_id));
+        {
+            for (int sample_id = 0; sample_id < samples.rows(); sample_id++)
+            {
+                my_output(sample_id, instance_id) = b(samples(sample_id, instance_id));
+            }
+        }
         USCMatrix<double> sparse_output(W->rows(), samples, my_output);
         uscgemm_masked(1.0, *W, input, sparse_output);
         my_output = sparse_output.values; // too bad, so much copying
@@ -232,15 +406,86 @@
     void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings,
                          const MatrixBase<DerivedGOut> &bProp_input,
                          double learning_rate,
-                         double momentum) //not sure if we want to use momentum here
+                         double momentum) //not sure if we want to use momentum here
     {
        // W is vocab_size x output_embedding_dimension
        // b is vocab_size x 1
        // predicted_embeddings is output_embedding_dimension x minibatch_size
        // bProp_input is vocab_size x minibatch_size
-        W->noalias() += learning_rate * bProp_input * predicted_embeddings.transpose();
        b += learning_rate * bProp_input.rowwise().sum();
+
+        /*
+        //GRADIENT CLIPPING
+        W->noalias() += learning_rate *
+            ((bProp_input * predicted_embeddings.transpose()).array().unaryExpr(Clipper())).matrix();
+        b += learning_rate * (bProp_input.rowwise().sum().array().unaryExpr(Clipper())).matrix();
+        //UPDATE CLIPPING
+        W->noalias() += (learning_rate *
+            (bProp_input * predicted_embeddings.transpose())).array().unaryExpr(Clipper()).matrix();
+        b += (learning_rate * (bProp_input.rowwise().sum())).array().unaryExpr(Clipper()).matrix();
+        */
+    }
+
+    template <typename DerivedIn, typename DerivedGOut>
+    void computeGradientAdagrad(
+        const MatrixBase<DerivedIn> &predicted_embeddings,
+        const MatrixBase<DerivedGOut> &bProp_input,
+        double learning_rate) //not sure if we want to use momentum here
+    {
+        // W is vocab_size x output_embedding_dimension
+        // b is vocab_size x 1
+        // predicted_embeddings is output_embedding_dimension x minibatch_size
+        // bProp_input is vocab_size x minibatch_sizea
+        W_gradient.setZero(W->rows(), W->cols());
+        b_gradient.setZero(b.size());
+        W_gradient.noalias() = bProp_input * predicted_embeddings.transpose();
+        b_gradient.noalias() = bProp_input.rowwise().sum();
+        W_running_gradient += W_gradient.array().square().matrix();
+        b_running_gradient += b_gradient.array().square().matrix();
+        W->noalias() += learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt()).matrix();
+        b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix();
+        /*
+        //UPDATE CLIPPING
+        *W += (learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix();
+        b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix();
+        */
+    }
+
+    template <typename DerivedIn, typename DerivedGOut>
+    void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings,
+                                 const MatrixBase<DerivedGOut> &bProp_input,
+                                 double learning_rate,
+                                 double conditioning_constant,
+                                 double decay) //not sure if we want to use momentum here
+    {
+        // W is vocab_size x output_embedding_dimension
+        // b is vocab_size x 1
+        // predicted_embeddings is output_embedding_dimension x minibatch_size
+        // bProp_input is vocab_size x minibatch_size
+        Array<double,Dynamic,Dynamic> W_current_parameter_update;
+        Array<double,Dynamic,1> b_current_parameter_update;
+        W_gradient.setZero(W->rows(), W->cols());
+        b_gradient.setZero(b.size());
+        W_gradient.noalias() = bProp_input * predicted_embeddings.transpose();
+        b_gradient.noalias() = bProp_input.rowwise().sum();
+        W_running_gradient = decay*W_running_gradient +
+            (1.-decay)*W_gradient.array().square().matrix();
+        b_running_gradient = decay*b_running_gradient+
+            (1.-decay)*b_gradient.array().square().matrix();
+        W_current_parameter_update = ((W_running_parameter_update.array()+conditioning_constant).sqrt()/
+            (W_running_gradient.array()+conditioning_constant).sqrt())*
+            W_gradient.array();
+        b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/
+            (b_running_gradient.array()+conditioning_constant).sqrt())*
+            b_gradient.array();
+        W_running_parameter_update = decay*W_running_parameter_update +
+            (1.-decay)*W_current_parameter_update.square().matrix();
+        b_running_parameter_update = decay*b_running_parameter_update +
+            (1.-decay)*b_current_parameter_update.square().matrix();
+
+        *W += learning_rate*W_current_parameter_update.matrix();
+        b += learning_rate*b_current_parameter_update.matrix();
     }
 
     // Sparse versions
@@ -264,6 +509,7 @@
                          const MatrixBase<DerivedGOutV> &weights,
                          double learning_rate, double momentum) //not sure if we want to use momentum here
     {
+        //cerr<<"in gradient"<<endl;
         USCMatrix<double> gradient_output(W->rows(), samples, weights);
         uscgemm(learning_rate,
                 gradient_output,
@@ -273,27 +519,64 @@
                 gradient_output,
                 Matrix<double,Dynamic,1>::Ones(gradient_output.cols()),
                 b);
+        /*
+        //IN ORDER TO IMPLEMENT CLIPPING, WE HAVE TO COMPUTE THE GRADIENT
+        //FIRST
+        USCMatrix<double> gradient_output(W->rows(), samples, weights);
+        uscgemm(1.0,
+                gradient_output,
+                predicted_embeddings.leftCols(samples.cols()).transpose(),
+                W_gradient);
+        uscgemv(1.0,
+                gradient_output,
+                Matrix<double,Dynamic,1>::Ones(weights.cols()),
+                b_gradient);
+
+        int_map update_map; //stores all the parameters that have been updated
+        for (int sample_id=0; sample_id<samples.rows(); sample_id++)
+            for (int train_id=0; train_id<samples.cols(); train_id++)
+                update_map[samples(sample_id, train_id)] = 1;
+
+        // Convert to std::vector for parallelization
+        std::vector<int> update_items;
+        for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
+            update_items.push_back(it->first);
+        int num_items = update_items.size();
+
+        //#pragma omp parallel for
+        for (int item_id=0; item_id<num_items; item_id++)
+        {
+            int update_item = update_items[item_id];
+            //W->row(update_item) += learning_rate * W_gradient.row(update_item);
+            //b(update_item) += learning_rate * b_gradient(update_item);
+            //UPDATE CLIPPING
+            W->row(update_item) += (learning_rate * W_gradient.row(update_item)).array().unaryExpr(Clipper()).matrix();
+            double update = learning_rate * b_gradient(update_item);
+            b(update_item) += std::min(0.5, std::max(update,-0.5));
+            //GRADIENT CLIPPING
+            W_gradient.row(update_item).setZero();
+            b_gradient(update_item) = 0.;
+        }
+        */
+        //cerr<<"Finished gradient"<<endl;
     }
 
     template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV>
     void computeGradientAdagrad(const MatrixBase<DerivedIn> &predicted_embeddings,
                                 const MatrixBase<DerivedGOutI> &samples,
                                 const MatrixBase<DerivedGOutV> &weights,
-                                double learning_rate, double momentum) //not sure if we want to use momentum here
+                                double learning_rate) //not sure if we want to use momentum here
     {
-        W_gradient.setZero(W->rows(), W->cols());
-        b_gradient.setZero(b.size());
-        if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols())
-            W_running_gradient.setZero(W->rows(), W->cols());
-        if (b_running_gradient.size() != b.size())
-            b_running_gradient.setZero(b.size());
-
+        //W_gradient.setZero(W->rows(), W->cols());
+        //b_gradient.setZero(b.size());
+        //FOR CLIPPING, WE DO NOT MULTIPLY THE GRADIENT WITH THE LEARNING RATE
         USCMatrix<double> gradient_output(W->rows(), samples, weights);
-        uscgemm(learning_rate,
+        uscgemm(1.0,
                gradient_output,
                predicted_embeddings.leftCols(samples.cols()).transpose(),
                W_gradient);
-        uscgemv(learning_rate, gradient_output,
+        uscgemv(1.0,
+                gradient_output,
                Matrix<double,Dynamic,1>::Ones(weights.cols()),
                b_gradient);
@@ -308,16 +591,98 @@
            update_items.push_back(it->first);
        int num_items = update_items.size();
 
-        #pragma omp parallel for
+        //#pragma omp parallel for
        for (int item_id=0; item_id<num_items; item_id++)
        {
            int update_item = update_items[item_id];
-            W_running_gradient.row(update_item).array() += W_gradient.row(update_item).array().square();
+            W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix();
            b_running_gradient(update_item) += b_gradient(update_item) * b_gradient(update_item);
-            W->row(update_item).array() += learning_rate * W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt();
+            W->row(update_item) += learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix();
            b(update_item) += learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item));
+            /*
+            //UPDATE CLIPPING
+            W->row(update_item) += (learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt())).unaryExpr(Clipper()).matrix();
+            double update = learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item));
+            b(update_item) += Clipper(update);//std::min(0.5, std::max(update,-0.5));
+            */
+            W_gradient.row(update_item).setZero();
+            b_gradient(update_item) = 0.;
        }
+    }
+
+    template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV>
+    void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings,
+                                 const MatrixBase<DerivedGOutI> &samples,
+                                 const MatrixBase<DerivedGOutV> &weights,
+                                 double learning_rate,
+                                 double conditioning_constant,
+                                 double decay) //not sure if we want to use momentum here
+    {
+        //cerr<<"decay is "<<decay<<" and constant is "<<conditioning_constant<<endl;
+        //W_gradient.setZero(W->rows(), W->cols());
+        //b_gradient.setZero(b.size());
+
+        USCMatrix<double> gradient_output(W->rows(), samples, weights);
+        uscgemm(1.0,
+                gradient_output,
+                predicted_embeddings.leftCols(samples.cols()).transpose(),
+                W_gradient);
+        uscgemv(1.0,
+                gradient_output,
+                Matrix<double,Dynamic,1>::Ones(weights.cols()),
+                b_gradient);
+
+        int_map update_map; //stores all the parameters that have been updated
+        for (int sample_id=0; sample_id<samples.rows(); sample_id++)
+            for (int train_id=0; train_id<samples.cols(); train_id++)
+                update_map[samples(sample_id, train_id)] = 1;
+
+        // Convert to std::vector for parallelization
+        std::vector<int> update_items;
+        for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
+            update_items.push_back(it->first);
+        int num_items = update_items.size();
+
+        #pragma omp parallel for
+        for (int item_id=0; item_id<num_items; item_id++)
+        {
+            Array<double,1,Dynamic> W_current_parameter_update;
+            double b_current_parameter_update;
+
+            int update_item = update_items[item_id];
+            W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+
+                (1.-decay)*W_gradient.row(update_item).array().square().matrix();
+            b_running_gradient(update_item) = decay*b_running_gradient(update_item)+
+                (1.-decay)*b_gradient(update_item)*b_gradient(update_item);
+            //cerr<<"Output: W gradient is "<<W_gradient.row(update_item)<<endl;
+            //getchar();
+
+            //cerr<<"Output: W running gradient is "<<W_running_gradient.row(update_item)<<endl;
+            //getchar();
+            W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/
                (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())*
+                W_gradient.row(update_item).array();
+            b_current_parameter_update = (sqrt(b_running_parameter_update(update_item)+conditioning_constant)/
+                sqrt(b_running_gradient(update_item)+conditioning_constant))*
+                b_gradient(update_item);
+            //cerr<<"Output: W current parameter update is "<<W_current_parameter_update<<endl;
+            //getchar();
+            //cerr<<"Output: W running parameter update before is "<<W_running_parameter_update.row(update_item)<<endl;
+            //getchar();
+            //cerr<<"the second term is "<<(1.-decay)*W_current_parameter_update.square().matrix()<<endl;
+            W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+
+                (1.-decay)*(W_current_parameter_update.square().matrix());
+            b_running_parameter_update(update_item) = decay*b_running_parameter_update(update_item)+
+                (1.-decay)*b_current_parameter_update*b_current_parameter_update;
+            //cerr<<"Output: W running parameter update is "<<W_running_parameter_update.row(update_item)<<endl;
+            //getchar();
+            W->row(update_item) += learning_rate*W_current_parameter_update.matrix();
+            b(update_item) += learning_rate*b_current_parameter_update;
+            W_gradient.row(update_item).setZero();
+            b_gradient(update_item) = 0.;
        }
+    }
+
     template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV, typename DerivedGW, typename DerivedGb>
     void computeGradientCheck(const MatrixBase<DerivedIn> &predicted_embeddings,
@@ -345,8 +710,9 @@
     private:
         Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W;
         int context_size, vocab_size;
-        Matrix<double,Dynamic,Dynamic> W_running_gradient;
-        Matrix<double,Dynamic,Dynamic> W_gradient;
+        Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient;
+        Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update;
+        Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient;
 
     friend class model;
 
@@ -354,29 +720,44 @@
     Input_word_embeddings() : context_size(0), vocab_size(0) { }
     Input_word_embeddings(int rows, int cols, int context) { resize(rows, cols, context); }
 
-    void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) {
-        W = input_W;
-    }
+    void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) {
+      W = input_W;
+    }
 
-    void resize(int rows, int cols, int context)
-    {
-        context_size = context;
-        vocab_size = rows;
-        W->setZero(rows, cols);
-    }
+    void resize(int rows, int cols, int context)
+    {
+      context_size = context;
+      vocab_size = rows;
+      W->setZero(rows, cols);
+    }
 
     void read(std::ifstream &W_file) { readMatrix(W_file, *W); }
     void write(std::ofstream &W_file) { writeMatrix(*W, W_file); }
 
-    template <typename Engine>
-    void initialize(Engine &engine, bool init_normal, double init_range)
-    {
-        initMatrix(engine,
-                   *W,
-                   init_normal,
-                   init_range);
+    template <typename Engine>
+    void initialize(Engine &engine,
+                    bool init_normal,
+                    double init_range,
+                    string &parameter_update,
+                    double adagrad_epsilon)
+    {
+        W_gradient.setZero(W->rows(),W->cols());
+
+        if (parameter_update == "ADA") {
+            W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon;
+            //W_gradient.setZero(W->rows(),W->cols());
+        }
+        if (parameter_update == "ADAD") {
+            W_running_gradient.setZero(W->rows(),W->cols());
+            //W_gradient.setZero(W->rows(),W->cols());
+            W_running_parameter_update.setZero(W->rows(),W->cols());
        }
-
+        initMatrix(engine,
+                   *W,
+                   init_normal,
+                   init_range);
+    }
+
     int n_inputs() const { return -1; }
     int n_outputs() const { return W->cols() * context_size; }
@@ -436,7 +817,7 @@
                          const MatrixBase<DerivedIn> &input_words,
                          double learning_rate, double momentum, double L2_reg)
     {
-        int embedding_dimension = W->cols();
+        int embedding_dimension = W->cols();
 
        // W is vocab_size x embedding_dimension
        // input is ngram_size*vocab_size x minibatch_size
@@ -453,59 +834,177 @@
            uscgemm(learning_rate,
                    USCMatrix<double>(W->rows(), input_words.middleRows(ngram, 1), Matrix<double,1,Dynamic>::Ones(input_words.cols())),
                    bProp_input.block(ngram*embedding_dimension,0,embedding_dimension,input_words.cols()).transpose(),
-                    *W);
+                    *W);
+        }
+
+        /*
+        //IF WE WANT TO DO GRADIENT CLIPPING, THEN WE FIRST COMPUTE THE GRADIENT AND THEN
+        //PERFORM CLIPPING WHILE UPDATING
+
+        for (int ngram=0; ngram<context_size; ngram++)
+        {
+            uscgemm(1.0,
+                    USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
+                    bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
+                    W_gradient);
        }
+        int_map update_map; //stores all the parameters that have been updated
+        for (int ngram=0; ngram<context_size; ngram++)
+        {
+            for (int train_id=0; train_id<input_words.cols(); train_id++)
+            {
+                update_map[input_words(ngram,train_id)] = 1;
+            }
+        }
+
+        // Convert to std::vector for parallelization
+        std::vector<int> update_items;
+        for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
+        {
+            update_items.push_back(it->first);
+        }
+        int num_items = update_items.size();
+
+        #pragma omp parallel for
+        for (int item_id=0; item_id<num_items; item_id++)
+        {
+            int update_item = update_items[item_id];
+            //UPDATE CLIPPING
+            W->row(update_item) += (learning_rate*
+                W_gradient.row(update_item).array().unaryExpr(Clipper())).matrix();
+            //GRADIENT CLIPPING
+            //W->row(update_item) += learning_rate*
+            //    W_gradient.row(update_item).array().unaryExpr(Clipper()).matrix();
+            //SETTING THE GRADIENT TO ZERO
+            W_gradient.row(update_item).setZero();
+        }
+        */
     }
 
     template <typename DerivedGOut, typename DerivedIn>
     void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input,
                                 const MatrixBase<DerivedIn> &input_words,
-                                double learning_rate, double momentum, double L2_reg)
+                                double learning_rate,
+                                double L2_reg)
     {
        int embedding_dimension = W->cols();
-
-        W_gradient.setZero(W->rows(), W->cols());
-        if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols())
-            W_running_gradient.setZero(W->rows(), W->cols());
-
+        //W_gradient.setZero(W->rows(), W->cols());
+        /*
+        if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols())
+            W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon;
+        */
        for (int ngram=0; ngram<context_size; ngram++)
        {
-            uscgemm(learning_rate,
+            uscgemm(1.0,
                    USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
                    bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
                    W_gradient);
        }
+        int_map update_map; //stores all the parameters that have been updated
+        for (int ngram=0; ngram<context_size; ngram++)
+        {
+            for (int train_id=0; train_id<input_words.cols(); train_id++)
+            {
+                update_map[input_words(ngram,train_id)] = 1;
+            }
+        }
+
+        // Convert to std::vector for parallelization
+        std::vector<int> update_items;
+        for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
+        {
+            update_items.push_back(it->first);
+        }
+        int num_items = update_items.size();
+
+        #pragma omp parallel for
+        for (int item_id=0; item_id<num_items; item_id++)
+        {
+            int update_item = update_items[item_id];
+            W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix();
+            W->row(update_item) += learning_rate *
+                (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix();
+            /*
+            //UPDATE CLIPPING
+            W->row(update_item) += (learning_rate *
+                (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()))
+                .unaryExpr(Clipper()).matrix();
+            */
+            W_gradient.row(update_item).setZero();
+        }
+    }
 
-        int_map update_map; //stores all the parameters that have been updated
+    template <typename DerivedGOut, typename DerivedIn>
+    void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input,
+                                 const MatrixBase<DerivedIn> &input_words,
+                                 double learning_rate,
+                                 double L2_reg,
+                                 double conditioning_constant,
+                                 double decay)
+    {
+        int embedding_dimension = W->cols();
 
-        for (int train_id=0; train_id<input_words.cols(); train_id++)
-        {
-            update_map[input_words(train_id)] = 1;
-        }
+        //W_gradient.setZero(W->rows(), W->cols());
+        /*
+        if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols())
+            W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon;
+        */
+        for (int ngram=0; ngram<context_size; ngram++)
+        {
+            uscgemm(1.0,
+                    USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
+                    bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
+                    W_gradient);
+        }
+        int_map update_map; //stores all the parameters that have been updated
+        for (int ngram=0; ngram<context_size; ngram++)
+        {
+            for (int train_id=0; train_id<input_words.cols(); train_id++)
+            {
+                update_map[input_words(ngram,train_id)] = 1;
+            }
+        }
 
        // Convert to std::vector for parallelization
-        std::vector<int> update_items;
-        for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
-        {
-            update_items.push_back(it->first);
-        }
-        int num_items = update_items.size();
-
-        #pragma omp parallel for
-        for (int item_id=0; item_id<num_items; item_id++)
-        {
-            int update_item = update_items[item_id];
-            W_running_gradient.row(update_item).array() += W_gradient.row(update_item).array().square();
-            W->row(update_item).array() += learning_rate * W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt();
-        }
+        std::vector<int> update_items;
+        for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
+        {
+            update_items.push_back(it->first);
        }
        int num_items = update_items.size();
 
-    template <typename DerivedGOut, typename DerivedIn, typename DerivedGW>
-    void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input,
-                              const MatrixBase<DerivedIn> &input_words,
-                              int x, int minibatch_size,
-                              const MatrixBase<DerivedGW> &gradient) const //not sure if we want to use momentum here
+        #pragma omp parallel for
+        for (int item_id=0; item_id<num_items; item_id++)
        {
+
+            Array<double,1,Dynamic> W_current_parameter_update;
+            int update_item = update_items[item_id];
+            W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+
+                (1.-decay)*W_gradient.row(update_item).array().square().matrix();
+
+            W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/
                (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())*
+                W_gradient.row(update_item).array();
+
+            //cerr<<"Input: W current parameter update is "<<W_current_parameter_update<<endl;
+            //getchar();
+            W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+
+                (1.-decay)*W_current_parameter_update.square().matrix();
+
+            W->row(update_item) += learning_rate*W_current_parameter_update.matrix();
+            //cerr<<"Input: After update, W is "<<W->row(update_item)<<endl;
+            //getchar();
+            W_gradient.row(update_item).setZero();
+        }
+
+    }
+
+    template <typename DerivedGOut, typename DerivedIn, typename DerivedGW>
+    void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input,
+                              const MatrixBase<DerivedIn> &input_words,
+                              int x, int minibatch_size,
+                              const MatrixBase<DerivedGW> &gradient) const //not sure if we want to use momentum here
+    {
        UNCONST(DerivedGW, gradient, my_gradient);
        int embedding_dimension = W->cols();
        my_gradient.setZero();
@@ -514,7 +1013,8 @@
                    USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
                    bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
                    my_gradient);
-        }
+        }
 };
 
 } // namespace nplm
+
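Editorial note: the computeGradientAdagrad methods added in this commit all apply the same AdaGrad rule, namely accumulate squared gradients and scale each component of the step by the inverse square root of that running sum (under the "ADA" setting the running sums are seeded with adagrad_epsilon to avoid dividing by zero). The following is a minimal, self-contained sketch of that rule only; the function and variable names are illustrative assumptions, not part of the nplm API:

// adagrad_sketch.cpp -- illustrative only, not project code.
#include <Eigen/Dense>

void adagrad_update(Eigen::MatrixXd &W,              // parameters
                    Eigen::MatrixXd &running_grad,   // accumulated squared gradients, seeded with epsilon
                    const Eigen::MatrixXd &grad,     // current gradient (ascent direction, as in the diff)
                    double learning_rate)
{
    // Accumulate squared gradients elementwise.
    running_grad.array() += grad.array().square();
    // Scale each component by the inverse root of its accumulated history.
    W.array() += learning_rate * grad.array() / running_grad.array().sqrt();
}

The per-column and per-row loops in the diff apply exactly this, restricted to the columns of U or the rows of W that were actually touched by the minibatch.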
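Similarly, the computeGradientAdadelta methods keep two decaying averages, one of squared gradients and one of squared parameter updates, and scale each step by the ratio of their RMS values, with conditioning_constant playing the role of epsilon. A rough sketch under the same assumptions (illustrative names; note that standard AdaDelta uses a fixed step of 1, whereas the diff additionally multiplies by learning_rate):

// adadelta_sketch.cpp -- illustrative only, not project code.
#include <Eigen/Dense>

void adadelta_update(Eigen::MatrixXd &W,
                     Eigen::MatrixXd &running_grad,    // decaying average of squared gradients
                     Eigen::MatrixXd &running_update,  // decaying average of squared updates
                     const Eigen::MatrixXd &grad,
                     double learning_rate, double decay, double eps)
{
    // Decaying average of squared gradients.
    running_grad = decay * running_grad + (1. - decay) * grad.array().square().matrix();
    // Step = RMS(previous updates) / RMS(gradients), applied elementwise to the gradient.
    Eigen::ArrayXXd update = ((running_update.array() + eps).sqrt() /
                              (running_grad.array() + eps).sqrt()) * grad.array();
    // Decaying average of squared parameter updates.
    running_update = decay * running_update + (1. - decay) * update.square().matrix();
    W += learning_rate * update.matrix();
}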
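Finally, the new Clipper functor and the commented-out "UPDATE CLIPPING" blocks sketch update clipping: each element of the proposed parameter update is limited to [-0.5, 0.5] before being applied, bounding the effect of any single minibatch. Roughly, and again only as an illustrative sketch:

// clipping_sketch.cpp -- illustrative only, not project code.
#include <algorithm>
#include <Eigen/Dense>

struct Clipper {
    double operator()(double x) const { return std::min(0.5, std::max(x, -0.5)); }
};

void clipped_update(Eigen::MatrixXd &W, const Eigen::MatrixXd &grad, double learning_rate)
{
    // Clip the scaled update elementwise, then apply it.
    W += (learning_rate * grad).array().unaryExpr(Clipper()).matrix();
}

The "GRADIENT CLIPPING" variants in the diff instead clip the raw gradient before scaling by the learning rate; both paths are left commented out in this commit.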