Welcome to mirror list, hosted at ThFree Co, Russian Federation.

line_search_direction.cc « ceres « internal « ceres « extern - git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 98e335a80295813b73cb35d09385a1c986c8cb02 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: sameeragarwal@google.com (Sameer Agarwal)

#include "ceres/line_search_direction.h"

#include <memory>

#include "ceres/internal/eigen.h"
#include "ceres/internal/export.h"
#include "ceres/line_search_minimizer.h"
#include "ceres/low_rank_inverse_hessian.h"
#include "glog/logging.h"

namespace ceres {
namespace internal {

// Steepest (gradient) descent: the search direction is simply the negated
// gradient at the current iterate. Never fails.
class CERES_NO_EXPORT SteepestDescent final : public LineSearchDirection {
 public:
  // The previous state is ignored; only current.gradient is used.
  bool NextDirection(const LineSearchMinimizer::State& previous,
                     const LineSearchMinimizer::State& current,
                     Vector* search_direction) override {
    *search_direction = current.gradient * -1.0;
    return true;
  }
};

// Nonlinear conjugate gradient direction:
//
//   d_{k} = -g_{k} + beta * d_{k-1}
//
// where beta is computed by one of the Fletcher-Reeves, Polak-Ribiere or
// Hestenes-Stiefel formulas. If the resulting direction is not a sufficient
// descent direction, the method restarts with plain steepest descent.
class CERES_NO_EXPORT NonlinearConjugateGradient final
    : public LineSearchDirection {
 public:
  NonlinearConjugateGradient(const NonlinearConjugateGradientType type,
                             const double function_tolerance)
      : type_(type), function_tolerance_(function_tolerance) {}

  bool NextDirection(const LineSearchMinimizer::State& previous,
                     const LineSearchMinimizer::State& current,
                     Vector* search_direction) override {
    // Conjugation coefficient for the configured formula.
    double beta = 0.0;
    if (type_ == FLETCHER_REEVES) {
      beta = current.gradient_squared_norm / previous.gradient_squared_norm;
    } else if (type_ == POLAK_RIBIERE) {
      const Vector delta_gradient = current.gradient - previous.gradient;
      beta = current.gradient.dot(delta_gradient) /
             previous.gradient_squared_norm;
    } else if (type_ == HESTENES_STIEFEL) {
      const Vector delta_gradient = current.gradient - previous.gradient;
      beta = current.gradient.dot(delta_gradient) /
             previous.search_direction.dot(delta_gradient);
    } else {
      LOG(FATAL) << "Unknown nonlinear conjugate gradient type: " << type_;
    }

    *search_direction = -current.gradient + beta * previous.search_direction;

    // Restart with steepest descent when the conjugate direction is not a
    // sufficient descent direction.
    const double gradient_dot_direction =
        current.gradient.dot(*search_direction);
    if (gradient_dot_direction > -function_tolerance_) {
      LOG(WARNING) << "Restarting non-linear conjugate gradients: "
                   << gradient_dot_direction;
      *search_direction = -current.gradient;
    }

    return true;
  }

 private:
  const NonlinearConjugateGradientType type_;
  const double function_tolerance_;
};

// Limited-memory BFGS line search direction: instead of a dense inverse
// Hessian approximation, maintains a low-rank approximation built from the
// most recent correction pairs (see LowRankInverseHessian), making it
// suitable for problems with many parameters.
class CERES_NO_EXPORT LBFGS final : public LineSearchDirection {
 public:
  // num_parameters: dimension of the optimization problem.
  // max_lbfgs_rank: maximum number of correction pairs retained.
  // use_approximate_eigenvalue_bfgs_scaling: forwarded to
  //   LowRankInverseHessian to rescale its initial approximation.
  LBFGS(const int num_parameters,
        const int max_lbfgs_rank,
        const bool use_approximate_eigenvalue_bfgs_scaling)
      : low_rank_inverse_hessian_(num_parameters,
                                  max_lbfgs_rank,
                                  use_approximate_eigenvalue_bfgs_scaling),
        is_positive_definite_(true) {}

  // Folds the latest (step, gradient-change) pair into the low-rank inverse
  // Hessian approximation H, then computes:
  //
  //   search_direction = -H * current.gradient
  //
  // Returns false — and permanently marks this object unusable — if the
  // computed direction is not a descent direction, which indicates the
  // approximation has become indefinite.
  bool NextDirection(const LineSearchMinimizer::State& previous,
                     const LineSearchMinimizer::State& current,
                     Vector* search_direction) override {
    CHECK(is_positive_definite_)
        << "Ceres bug: NextDirection() called on L-BFGS after inverse Hessian "
        << "approximation has become indefinite, please contact the "
        << "developers!";

    // s_k = x_{k+1} - x_k = step_size * d_k;  y_k = grad_{k+1} - grad_k.
    low_rank_inverse_hessian_.Update(
        previous.search_direction * previous.step_size,
        current.gradient - previous.gradient);

    // RightMultiply accumulates into its output, so zero it first.
    search_direction->setZero();
    low_rank_inverse_hessian_.RightMultiply(current.gradient.data(),
                                            search_direction->data());
    *search_direction *= -1.0;

    // For a positive definite H, g' * (-H * g) < 0 must hold.
    if (search_direction->dot(current.gradient) >= 0.0) {
      LOG(WARNING) << "Numerical failure in L-BFGS update: inverse Hessian "
                   << "approximation is not positive definite, and thus "
                   << "initial gradient for search direction is positive: "
                   << search_direction->dot(current.gradient);
      is_positive_definite_ = false;
      return false;
    }

    return true;
  }

 private:
  LowRankInverseHessian low_rank_inverse_hessian_;
  // Latched to false on the first indefinite update; NextDirection() must
  // never be called again afterwards.
  bool is_positive_definite_;
};

// Dense BFGS line search direction: maintains a full, dense approximation to
// the inverse Hessian which is updated after every accepted step. Storage is
// O(num_parameters^2); only the lower triangle of the approximation is
// maintained (see NextDirection). Prefer L-BFGS for large problems.
class CERES_NO_EXPORT BFGS final : public LineSearchDirection {
 public:
  // num_parameters: dimension of the optimization problem.
  // use_approximate_eigenvalue_scaling: if true, the initial (identity)
  //   inverse Hessian approximation is rescaled once, on the first update,
  //   by an approximate eigenvalue of the true inverse Hessian (see below).
  BFGS(const int num_parameters, const bool use_approximate_eigenvalue_scaling)
      : num_parameters_(num_parameters),
        use_approximate_eigenvalue_scaling_(use_approximate_eigenvalue_scaling),
        initialized_(false),
        is_positive_definite_(true) {
    if (num_parameters_ >= 1000) {
      LOG(WARNING) << "BFGS line search being created with: " << num_parameters_
                   << " parameters, this will allocate a dense approximate "
                   << "inverse Hessian of size: " << num_parameters_ << " x "
                   << num_parameters_
                   << ", consider using the L-BFGS memory-efficient line "
                   << "search direction instead.";
    }
    // Construct inverse_hessian_ after logging warning about size s.t. if the
    // allocation crashes us, the log will highlight what the issue likely was.
    inverse_hessian_ = Matrix::Identity(num_parameters, num_parameters);
  }

  // Updates the dense inverse Hessian approximation H with the latest step
  // (when the secant condition permits) and computes:
  //
  //   search_direction = -H * current.gradient
  //
  // Returns false — and permanently marks this object unusable — if the
  // computed direction is not a descent direction, which indicates H has
  // become indefinite.
  bool NextDirection(const LineSearchMinimizer::State& previous,
                     const LineSearchMinimizer::State& current,
                     Vector* search_direction) override {
    CHECK(is_positive_definite_)
        << "Ceres bug: NextDirection() called on BFGS after inverse Hessian "
        << "approximation has become indefinite, please contact the "
        << "developers!";

    // s_k = x_{k+1} - x_k;  y_k = grad_{k+1} - grad_k.
    const Vector delta_x = previous.search_direction * previous.step_size;
    const Vector delta_gradient = current.gradient - previous.gradient;
    const double delta_x_dot_delta_gradient = delta_x.dot(delta_gradient);

    // The (L)BFGS algorithm explicitly requires that the secant equation:
    //
    //   B_{k+1} * s_k = y_k
    //
    // Is satisfied at each iteration, where B_{k+1} is the approximated
    // Hessian at the k+1-th iteration, s_k = (x_{k+1} - x_{k}) and
    // y_k = (grad_{k+1} - grad_{k}). As the approximated Hessian must be
    // positive definite, this is equivalent to the condition:
    //
    //   s_k^T * y_k > 0     [s_k^T * B_{k+1} * s_k = s_k^T * y_k > 0]
    //
    // This condition would always be satisfied if the function was strictly
    // convex, alternatively, it is always satisfied provided that a Wolfe line
    // search is used (even if the function is not strictly convex).  See [1]
    // (p138) for a proof.
    //
    // Although Ceres will always use a Wolfe line search when using (L)BFGS,
    // practical implementation considerations mean that the line search
    // may return a point that satisfies only the Armijo condition, and thus
    // could violate the Secant equation.  As such, we will only use a step
    // to update the Hessian approximation if:
    //
    //   s_k^T * y_k > tolerance
    //
    // It is important that tolerance is very small (and >=0), as otherwise we
    // might skip the update too often and fail to capture important curvature
    // information in the Hessian.  For example going from 1e-10 -> 1e-14
    // improves the NIST benchmark score from 43/54 to 53/54.
    //
    // [1] Nocedal J, Wright S, Numerical Optimization, 2nd Ed. Springer, 1999.
    //
    // TODO(alexs.mac): Consider using Damped BFGS update instead of
    // skipping update.
    const double kBFGSSecantConditionHessianUpdateTolerance = 1e-14;
    if (delta_x_dot_delta_gradient <=
        kBFGSSecantConditionHessianUpdateTolerance) {
      VLOG(2) << "Skipping BFGS Update, delta_x_dot_delta_gradient too "
              << "small: " << delta_x_dot_delta_gradient
              << ", tolerance: " << kBFGSSecantConditionHessianUpdateTolerance
              << " (Secant condition).";
    } else {
      // Update dense inverse Hessian approximation.

      if (!initialized_ && use_approximate_eigenvalue_scaling_) {
        // Rescale the initial inverse Hessian approximation (H_0) to be
        // iteratively updated so that it is of similar 'size' to the true
        // inverse Hessian at the start point.  As shown in [1]:
        //
        //   \gamma = (delta_gradient_{0}' * delta_x_{0}) /
        //            (delta_gradient_{0}' * delta_gradient_{0})
        //
        // Satisfies:
        //
        //   (1 / \lambda_m) <= \gamma <= (1 / \lambda_1)
        //
        // Where \lambda_1 & \lambda_m are the smallest and largest eigenvalues
        // of the true initial Hessian (not the inverse) respectively. Thus,
        // \gamma is an approximate eigenvalue of the true inverse Hessian, and
        // choosing: H_0 = I * \gamma will yield a starting point that has a
        // similar scale to the true inverse Hessian.  This technique is widely
        // reported to often improve convergence, however this is not
        // universally true, particularly if there are errors in the initial
        // gradients, or if there are significant differences in the sensitivity
        // of the problem to the parameters (i.e. the range of the magnitudes of
        // the components of the gradient is large).
        //
        // The original origin of this rescaling trick is somewhat unclear, the
        // earliest reference appears to be Oren [1], however it is widely
        // discussed without specific attribution in various texts including
        // [2] (p143).
        //
        // [1] Oren S.S., Self-scaling variable metric (SSVM) algorithms
        //     Part II: Implementation and experiments, Management Science,
        //     20(5), 863-874, 1974.
        // [2] Nocedal J., Wright S., Numerical Optimization, Springer, 1999.
        const double approximate_eigenvalue_scale =
            delta_x_dot_delta_gradient / delta_gradient.dot(delta_gradient);
        inverse_hessian_ *= approximate_eigenvalue_scale;

        VLOG(4) << "Applying approximate_eigenvalue_scale: "
                << approximate_eigenvalue_scale << " to initial inverse "
                << "Hessian approximation.";
      }
      initialized_ = true;

      // Efficient O(num_parameters^2) BFGS update [2].
      //
      // Starting from dense BFGS update detailed in Nocedal [2] p140/177 and
      // using: y_k = delta_gradient, s_k = delta_x:
      //
      //   \rho_k = 1.0 / (s_k' * y_k)
      //   V_k = I - \rho_k * y_k * s_k'
      //   H_k = (V_k' * H_{k-1} * V_k) + (\rho_k * s_k * s_k')
      //
      // This update involves matrix, matrix products which naively O(N^3),
      // however we can exploit our knowledge that H_k is positive definite
      // and thus by defn. symmetric to reduce the cost of the update:
      //
      // Expanding the update above yields:
      //
      //   H_k = H_{k-1} +
      //         \rho_k * ( (1.0 + \rho_k * y_k' * H_k * y_k) * s_k * s_k' -
      //                    (s_k * y_k' * H_k + H_k * y_k * s_k') )
      //
      // Using: A = (s_k * y_k' * H_k), and the knowledge that H_k = H_k', the
      // last term simplifies to (A + A'). Note that although A is not symmetric
      // (A + A') is symmetric. For ease of construction we also define
      // B = (1 + \rho_k * y_k' * H_k * y_k) * s_k * s_k', which is by defn
      // symmetric due to construction from: s_k * s_k'.
      //
      // Now we can write the BFGS update as:
      //
      //   H_k = H_{k-1} + \rho_k * (B - (A + A'))

      // For efficiency, as H_k is by defn. symmetric, we will only maintain the
      // *lower* triangle of H_k (and all intermediary terms).

      const double rho_k = 1.0 / delta_x_dot_delta_gradient;

      // Calculate: A = s_k * y_k' * H_k
      Matrix A = delta_x * (delta_gradient.transpose() *
                            inverse_hessian_.selfadjointView<Eigen::Lower>());

      // Calculate scalar: (1 + \rho_k * y_k' * H_k * y_k)
      const double delta_x_times_delta_x_transpose_scale_factor =
          (1.0 +
           (rho_k * delta_gradient.transpose() *
            inverse_hessian_.selfadjointView<Eigen::Lower>() * delta_gradient));
      // Calculate: B = (1 + \rho_k * y_k' * H_k * y_k) * s_k * s_k'
      Matrix B = Matrix::Zero(num_parameters_, num_parameters_);
      B.selfadjointView<Eigen::Lower>().rankUpdate(
          delta_x, delta_x_times_delta_x_transpose_scale_factor);

      // Finally, update inverse Hessian approximation according to:
      // H_k = H_{k-1} + \rho_k * (B - (A + A')).  Note that (A + A') is
      // symmetric, even though A is not.
      inverse_hessian_.triangularView<Eigen::Lower>() +=
          rho_k * (B - A - A.transpose());
    }

    // Compute search_direction = -H_k * gradient, reading only the stored
    // lower triangle of the (symmetric) inverse Hessian approximation.
    *search_direction = inverse_hessian_.selfadjointView<Eigen::Lower>() *
                        (-1.0 * current.gradient);

    // For a positive definite H_k, g' * (-H_k * g) < 0 must hold.
    if (search_direction->dot(current.gradient) >= 0.0) {
      LOG(WARNING) << "Numerical failure in BFGS update: inverse Hessian "
                   << "approximation is not positive definite, and thus "
                   << "initial gradient for search direction is positive: "
                   << search_direction->dot(current.gradient);
      is_positive_definite_ = false;
      return false;
    }

    return true;
  }

 private:
  const int num_parameters_;
  const bool use_approximate_eigenvalue_scaling_;
  // Only the lower triangle holds valid data after the first update.
  Matrix inverse_hessian_;
  // True once the first Hessian update has been applied (used to gate the
  // one-time approximate eigenvalue rescaling of H_0).
  bool initialized_;
  // Latched to false on the first indefinite update; NextDirection() must
  // never be called again afterwards.
  bool is_positive_definite_;
};

// Out-of-line definition of the defaulted destructor for the abstract
// LineSearchDirection interface.
LineSearchDirection::~LineSearchDirection() = default;

// Factory: constructs the line search direction implementation selected by
// options.type, or returns nullptr (after logging an error) for an
// unrecognized type.
std::unique_ptr<LineSearchDirection> LineSearchDirection::Create(
    const LineSearchDirection::Options& options) {
  switch (options.type) {
    case STEEPEST_DESCENT:
      return std::make_unique<SteepestDescent>();
    case NONLINEAR_CONJUGATE_GRADIENT:
      return std::make_unique<NonlinearConjugateGradient>(
          options.nonlinear_conjugate_gradient_type,
          options.function_tolerance);
    // The enum values are qualified with ceres:: because the internal class
    // names LBFGS/BFGS shadow them in this namespace.
    case ceres::LBFGS:
      return std::make_unique<ceres::internal::LBFGS>(
          options.num_parameters,
          options.max_lbfgs_rank,
          options.use_approximate_eigenvalue_bfgs_scaling);
    case ceres::BFGS:
      return std::make_unique<ceres::internal::BFGS>(
          options.num_parameters,
          options.use_approximate_eigenvalue_bfgs_scaling);
    default:
      LOG(ERROR) << "Unknown line search direction type: " << options.type;
      return nullptr;
  }
}

}  // namespace internal
}  // namespace ceres