github.com/kpu/kenlm.git
author    Kenneth Heafield <github@kheafield.com>  2016-02-29 01:24:45 +0300
committer Kenneth Heafield <github@kheafield.com>  2016-02-29 01:24:45 +0300
commit    727ef9d6da5f3cb1a4bb03dfc85fd1058feaa785 (patch)
tree      d3db6f02c7242b7d3e556fb50d1483976e9dc965
parent    d7d122076b68c6c54e373444e43e267ba95b2213 (diff)
Attempt at SGD. (branch: sgd)
Seems to be faster, but convergence needs more testing. Does not randomize the order of examples. Apportions the summed correct term over all examples instead of using the one due to each example (this might be a bug or a feature).
-rw-r--r--  lm/interpolate/tune_derivatives.cc | 171
-rw-r--r--  lm/interpolate/tune_derivatives.hh |   2
-rw-r--r--  lm/interpolate/tune_weights.cc     |  12
3 files changed, 71 insertions(+), 114 deletions(-)
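In outline, the patch below replaces the exact gradient/Hessian computation with a minibatch SGD update: each batch of 100 tuning instances receives its proportional share of the summed correct-probability term, the per-instance cross terms are accumulated into the batch gradient, and the weights are moved by a fixed step against that gradient. A minimal sketch of that control flow follows; the batch size (100), step size (0.01), and the names in, weights, and InstanceIndex come from the diff, while accumulate_instance_gradient() is a hypothetical stand-in for the per-instance work inside the loop, not a function in the commit.

// Sketch only: one pass of the batched update introduced by this commit.
for (InstanceIndex n = 0; n < in.NumInstances(); ) {
  InstanceIndex batch_end = std::min<InstanceIndex>(n + 100, in.NumInstances());
  InstanceIndex batch_size = batch_end - n;
  // Apportion the summed correct term: this batch is charged batch_size / N of it.
  Vector gradient = in.CorrectGradientTerm()
      * (static_cast<float>(batch_size) / static_cast<float>(in.NumInstances()));
  for (; n < batch_end; ++n) {
    accumulate_instance_gradient(in, n, weights, gradient);  // hypothetical helper
  }
  // Fixed step size; the commit message notes convergence still needs testing.
  weights -= 0.01 * gradient / batch_size;
}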
diff --git a/lm/interpolate/tune_derivatives.cc b/lm/interpolate/tune_derivatives.cc
index 1e6ac12..56b7cf0 100644
--- a/lm/interpolate/tune_derivatives.cc
+++ b/lm/interpolate/tune_derivatives.cc
@@ -9,32 +9,7 @@
namespace lm { namespace interpolate {
-Accum Derivatives(Instances &in, const Vector &weights, Vector &gradient, Matrix &hessian) {
- gradient = in.CorrectGradientTerm();
- hessian = Matrix::Zero(weights.rows(), weights.rows());
-
- // TODO: loop instead to force low-memory evaluation?
- // Compute p_I(x).
- Vector interp_uni((in.LNUnigrams() * weights).array().exp());
- // Even -inf doesn't work for <s> because weights can be negative. Manually set it to zero.
- interp_uni(in.BOS()) = 0.0;
- Accum Z_epsilon = interp_uni.sum();
- interp_uni /= Z_epsilon;
- // unigram_cross(i) = \sum_{all x} p_I(x) ln p_i(x)
- Vector unigram_cross(in.LNUnigrams().transpose() * interp_uni);
-
- Accum sum_B_I = 0.0;
- Accum sum_ln_Z_context = 0.0;
-
- // Temporaries used each cycle of the loop.
- Matrix convolve;
- Vector full_cross;
- Matrix hessian_missing_Z_context;
- // Backed off ln p_i(x)B_i(context)
- Vector ln_p_i_backed;
- // Full ln p_i(x | context)
- Vector ln_p_i_full;
-
+void Derivatives(Instances &in, Vector &weights) {
// TODO make configurable memory size.
util::stream::Chain chain(util::stream::ChainConfig(in.ReadExtensionsEntrySize(), 2, 64 << 20));
chain.ActivateProgress();
@@ -42,87 +17,77 @@ Accum Derivatives(Instances &in, const Vector &weights, Vector &gradient, Matrix &hessian) {
util::stream::TypedStream<Extension> extensions(chain.Add());
chain >> util::stream::kRecycle;
- // Loop over instances (words in the tuning data).
- for (InstanceIndex n = 0; n < in.NumInstances(); ++n) {
- assert(extensions);
- Accum weighted_backoffs = exp(in.LNBackoffs(n).dot(weights));
-
- // Compute \sum_{x: model does not back off to unigram} p_I(x)
- Accum sum_x_p_I = 0.0;
- // Compute \sum_{x: model does not back off to unigram} p_I(x | context)Z(context)
- Accum unnormalized_sum_x_p_I_full = 0.0;
-
- // This should be divided by Z_context then added to the Hessian.
- hessian_missing_Z_context = Matrix::Zero(weights.rows(), weights.rows());
-
- full_cross = Vector::Zero(weights.rows());
-
- // Loop over words within an instance for which extension exists. An extension happens when any model matches more than a unigram in the tuning instance.
- while (extensions && extensions->instance == n) {
- const WordIndex word = extensions->word;
- sum_x_p_I += interp_uni(word);
-
- ln_p_i_backed = in.LNUnigrams().row(word) + in.LNBackoffs(n);
-
- // Calculate ln_p_i_full(i) = ln p_i(word | context) by filling in unigrams then overwriting with extensions.
- ln_p_i_full = ln_p_i_backed;
- // Loop over all models that have an extension for the same word namely p_i(word | context) matches at least a bigram.
- for (; extensions && extensions->word == word && extensions->instance == n; ++extensions) {
- ln_p_i_full(extensions->model) = extensions->ln_prob;
+ // Temporaries used each cycle of the loop.
+ Vector interp_uni;
+ Vector unigram_cross;
+ Vector full_cross;
+ // Full ln p_i(x | context)
+ Vector ln_p_i_full;
+ Vector gradient;
+
+ for (InstanceIndex n = 0; n < in.NumInstances(); ) {
+ InstanceIndex batch_end = std::min<InstanceIndex>(n + 100, in.NumInstances());
+ InstanceIndex batch_size = batch_end - n;
+ // Compute p_I(x).
+ interp_uni = (in.LNUnigrams() * weights).array().exp();
+ // Even -inf doesn't work for <s> because weights can be negative. Manually set it to zero.
+ interp_uni(in.BOS()) = 0.0;
+ Accum Z_epsilon = interp_uni.sum();
+ interp_uni /= Z_epsilon;
+ // unigram_cross(i) = \sum_{all x} p_I(x) ln p_i(x)
+ unigram_cross = in.LNUnigrams().transpose() * interp_uni;
+
+ gradient = in.CorrectGradientTerm() * (static_cast<float>(batch_size) / static_cast<float>(in.NumInstances()));
+
+ for (; n < batch_end; ++n) {
+ Accum weighted_backoffs = exp(in.LNBackoffs(n).dot(weights));
+ // Compute \sum_{x: model does not back off to unigram} p_I(x)
+ Accum sum_x_p_I = 0.0;
+ // Compute \sum_{x: model does not back off to unigram} p_I(x | context)Z(context)
+ Accum unnormalized_sum_x_p_I_full = 0.0;
+ full_cross = Vector::Zero(weights.rows());
+
+ // Loop over words within an instance for which extension exists. An extension happens when any model matches more than a unigram in the tuning instance.
+ while (extensions && extensions->instance == n) {
+ const WordIndex word = extensions->word;
+ sum_x_p_I += interp_uni(word);
+
+ ln_p_i_full = in.LNUnigrams().row(word) + in.LNBackoffs(n);
+
+ // Calculate ln_p_i_full(i) = ln p_i(word | context) by filling in unigrams then overwriting with extensions.
+ // Loop over all models that have an extension for the same word namely p_i(word | context) matches at least a bigram.
+ for (; extensions && extensions->word == word && extensions->instance == n; ++extensions) {
+ ln_p_i_full(extensions->model) = extensions->ln_prob;
+ }
+
+ // This is the weighted product of probabilities. In other words, p_I(word | context) * Z(context) = exp(\sum_i w_i * p_i(word | context)).
+ Accum weighted = exp(ln_p_i_full.dot(weights));
+ unnormalized_sum_x_p_I_full += weighted;
+
+ // These aren't normalized by Z_context (happens later)
+ full_cross.noalias() +=
+ weighted * ln_p_i_full
+ - interp_uni(word) * Z_epsilon * weighted_backoffs /* we'll divide by Z_context later to form B_I */ * in.LNUnigrams().row(word).transpose();
}
- // This is the weighted product of probabilities. In other words, p_I(word | context) * Z(context) = exp(\sum_i w_i * p_i(word | context)).
- Accum weighted = exp(ln_p_i_full.dot(weights));
- unnormalized_sum_x_p_I_full += weighted;
-
- // These aren't normalized by Z_context (happens later)
- full_cross.noalias() +=
- weighted * ln_p_i_full
- - interp_uni(word) * Z_epsilon * weighted_backoffs /* we'll divide by Z_context later to form B_I */ * in.LNUnigrams().row(word).transpose();
-
- // This will get multiplied by Z_context then added to the Hessian.
- hessian_missing_Z_context.noalias() +=
- // Replacement terms.
- weighted * ln_p_i_full * ln_p_i_full.transpose()
- // Presumed unigrams. Z_epsilon * weighted_backoffs will turn into B_I once all of this is divided by Z_context.
- - interp_uni(word) * Z_epsilon * weighted_backoffs * ln_p_i_backed * ln_p_i_backed.transpose();
+ Accum Z_context =
+ weighted_backoffs * Z_epsilon * (1.0 - sum_x_p_I) // Back off and unnormalize the unigrams for which there is no extension.
+ + unnormalized_sum_x_p_I_full; // Add the extensions.
+ Accum B_I = Z_epsilon / Z_context * weighted_backoffs;
+
+ // This is the gradient term for this instance except for -log p_i(w_n | w_1^{n-1}) which was accounted for as part of neg_correct_sum_.
+ // full_cross(i) is \sum_{all x} p_I(x | context) log p_i(x | context)
+ // Prior terms excluded dividing by Z_context because it wasn't known at the time.
+ gradient.noalias() +=
+ full_cross / Z_context
+ // Uncorrected term
+ + B_I * (in.LNBackoffs(n).transpose() + unigram_cross)
+ // Subtract values that should not have been charged.
+ - sum_x_p_I * B_I * in.LNBackoffs(n).transpose();
}
-
- Accum Z_context =
- weighted_backoffs * Z_epsilon * (1.0 - sum_x_p_I) // Back off and unnormalize the unigrams for which there is no extension.
- + unnormalized_sum_x_p_I_full; // Add the extensions.
- sum_ln_Z_context += log(Z_context);
- Accum B_I = Z_epsilon / Z_context * weighted_backoffs;
- sum_B_I += B_I;
-
- // This is the gradient term for this instance except for -log p_i(w_n | w_1^{n-1}) which was accounted for as part of neg_correct_sum_.
- // full_cross(i) is \sum_{all x} p_I(x | context) log p_i(x | context)
- // Prior terms excluded dividing by Z_context because it wasn't known at the time.
- full_cross /= Z_context;
- full_cross +=
- // Uncorrected term
- B_I * (in.LNBackoffs(n).transpose() + unigram_cross)
- // Subtract values that should not have been charged.
- - sum_x_p_I * B_I * in.LNBackoffs(n).transpose();
- gradient += full_cross;
-
- convolve = unigram_cross * in.LNBackoffs(n);
- // There's one missing term here, which is independent of context and done at the end.
- hessian.noalias() +=
- // First term of Hessian, assuming all models back off to unigram.
- B_I * (convolve + convolve.transpose() + in.LNBackoffs(n).transpose() * in.LNBackoffs(n))
- // Error in the first term, correcting from unigram to full probabilities.
- + hessian_missing_Z_context / Z_context
- // Second term of Hessian, with correct full probabilities.
- - full_cross * full_cross.transpose();
- }
-
- for (Matrix::Index x = 0; x < interp_uni.rows(); ++x) {
- // \sum_{contexts} B_I(context) \sum_x p_I(x) log p_i(x) log p_j(x)
- // TODO can this be optimized? It's summing over the entire vocab which should be a matrix operation.
- hessian.noalias() += sum_B_I * interp_uni(x) * in.LNUnigrams().row(x).transpose() * in.LNUnigrams().row(x);
+ weights -= 0.01 * gradient / batch_size;
+ std::cerr << "Batch gradient is " << gradient.transpose() << " so new weights are " << weights.transpose() << std::endl;
}
- return exp((in.CorrectGradientTerm().dot(weights) + sum_ln_Z_context) / static_cast<double>(in.NumInstances()));
}
}} // namespaces
diff --git a/lm/interpolate/tune_derivatives.hh b/lm/interpolate/tune_derivatives.hh
index e3bedd3..27b5d8d 100644
--- a/lm/interpolate/tune_derivatives.hh
+++ b/lm/interpolate/tune_derivatives.hh
@@ -12,7 +12,7 @@ class Instances;
// Given tuning instances and model weights, computes the objective function (log probability), gradient, and Hessian.
// Returns log probability / number of instances.
-Accum Derivatives(Instances &instances /* Doesn't modify but ReadExtensions is lazy */, const Vector &weights, Vector &gradient, Matrix &hessian);
+void Derivatives(Instances &instances /* Doesn't modify but ReadExtensions is lazy */, Vector &weights);
}} // namespaces
diff --git a/lm/interpolate/tune_weights.cc b/lm/interpolate/tune_weights.cc
index 72c93da..e0c5986 100644
--- a/lm/interpolate/tune_weights.cc
+++ b/lm/interpolate/tune_weights.cc
@@ -16,17 +16,9 @@ namespace lm { namespace interpolate {
void TuneWeights(int tune_file, const std::vector<StringPiece> &model_names, const InstancesConfig &config, std::vector<float> &weights_out) {
Instances instances(tune_file, model_names, config);
Vector weights = Vector::Constant(model_names.size(), 1.0 / model_names.size());
- Vector gradient;
- Matrix hessian;
for (std::size_t iteration = 0; iteration < 10 /*TODO fancy stopping criteria */; ++iteration) {
- std::cerr << "Iteration " << iteration << ": weights =";
- for (Vector::Index i = 0; i < weights.rows(); ++i) {
- std::cerr << ' ' << weights(i);
- }
- std::cerr << std::endl;
- std::cerr << "Perplexity = " << Derivatives(instances, weights, gradient, hessian) << std::endl;
- // TODO: 1.0 step size was too big and it kept getting unstable. More math.
- weights -= 0.7 * hessian.inverse() * gradient;
+ std::cerr << "Iteration " << iteration << ": weights = " << weights.transpose() << std::endl;
+ Derivatives(instances, weights);
}
weights_out.assign(weights.data(), weights.data() + weights.size());
}
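For comparison: the deleted driver code above took one damped Newton step per outer iteration, weights -= 0.7 * hessian.inverse() * gradient, while each call to the new Derivatives applies weights -= 0.01 * gradient / batch_size once per 100-instance batch, so a single outer iteration now performs roughly NumInstances() / 100 small updates. Both the 0.01 step size and the fixed 10 outer iterations remain provisional; the commit message says convergence still needs testing, and the TODO about a real stopping criterion is unchanged.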