1 files changed, 312 insertions, 40 deletions
diff --git a/extern/ceres/internal/ceres/schur_eliminator.h b/extern/ceres/internal/ceres/schur_eliminator.h
index 761b58adc7f..66fcb4d58e8 100644
--- a/extern/ceres/internal/ceres/schur_eliminator.h
+++ b/extern/ceres/internal/ceres/schur_eliminator.h
@@ -1,5 +1,5 @@
 // Ceres Solver - A fast non-linear least squares minimizer
-// Copyright 2015 Google Inc. All rights reserved.
+// Copyright 2019 Google Inc. All rights reserved.
 // http://ceres-solver.org/
 //
 // Redistribution and use in source and binary forms, with or without
@@ -32,14 +32,17 @@
 #define CERES_INTERNAL_SCHUR_ELIMINATOR_H_
 
 #include <map>
+#include <memory>
+#include <mutex>
 #include <vector>
-#include "ceres/mutex.h"
+
+#include "Eigen/Dense"
 #include "ceres/block_random_access_matrix.h"
 #include "ceres/block_sparse_matrix.h"
 #include "ceres/block_structure.h"
-#include "ceres/linear_solver.h"
 #include "ceres/internal/eigen.h"
-#include "ceres/internal/scoped_ptr.h"
+#include "ceres/internal/port.h"
+#include "ceres/linear_solver.h"
 
 namespace ceres {
 namespace internal {
@@ -50,7 +53,7 @@ namespace internal {
 //
 //  E y + F z = b
 //
-// Where x = [y;z] is a partition of the variables.  The paritioning
+// Where x = [y;z] is a partition of the variables.  The partitioning
 // of the variables is such that, E'E is a block diagonal matrix. Or
 // in other words, the parameter blocks in E form an independent set
 // of the of the graph implied by the block matrix A'A. Then, this
@@ -147,7 +150,7 @@ namespace internal {
 // Where the sum is over chunks and E_k'E_k is dense matrix of size y1
 // x y1.
 //
-// Advanced usage. Uptil now it has been assumed that the user would
+// Advanced usage. Until now it has been assumed that the user would
 // be interested in all of the Schur Complement S. However, it is also
 // possible to use this eliminator to obtain an arbitrary submatrix of
 // the full Schur complement. When the eliminator is generating the
@@ -171,7 +174,14 @@ class SchurEliminatorBase {
   // CompressedRowBlockStructure object passed to this method is the
   // same one (or is equivalent to) the one associated with the
   // BlockSparseMatrix objects below.
+  //
+  // assume_full_rank_ete controls how the eliminator inverts with the
+  // diagonal blocks corresponding to e blocks in A'A. If
+  // assume_full_rank_ete is true, then a Cholesky factorization is
+  // used to compute the inverse, otherwise a singular value
+  // decomposition is used to compute the pseudo inverse.
   virtual void Init(int num_eliminate_blocks,
+                    bool assume_full_rank_ete,
                     const CompressedRowBlockStructure* bs) = 0;
 
   // Compute the Schur complement system from the augmented linear
@@ -185,7 +195,7 @@ class SchurEliminatorBase {
   //
   // Since the Schur complement is a symmetric matrix, only the upper
   // triangular part of the Schur complement is computed.
-  virtual void Eliminate(const BlockSparseMatrix* A,
+  virtual void Eliminate(const BlockSparseMatrixData& A,
                          const double* b,
                          const double* D,
                          BlockRandomAccessMatrix* lhs,
@@ -194,7 +204,7 @@ class SchurEliminatorBase {
   // Given values for the variables z in the F block of A, solve for
   // the optimal values of the variables y corresponding to the E
   // block in A.
-  virtual void BackSubstitute(const BlockSparseMatrix* A,
+  virtual void BackSubstitute(const BlockSparseMatrixData& A,
                               const double* b,
                               const double* D,
                               const double* z,
@@ -210,32 +220,31 @@ class SchurEliminatorBase {
 // parameters as template arguments so that they are visible to the
 // compiler and can be used for compile time optimization of the low
 // level linear algebra routines.
-//
-// This implementation is mulithreaded using OpenMP. The level of
-// parallelism is controlled by LinearSolver::Options::num_threads.
 template <int kRowBlockSize = Eigen::Dynamic,
           int kEBlockSize = Eigen::Dynamic,
-          int kFBlockSize = Eigen::Dynamic >
+          int kFBlockSize = Eigen::Dynamic>
 class SchurEliminator : public SchurEliminatorBase {
  public:
   explicit SchurEliminator(const LinearSolver::Options& options)
-      : num_threads_(options.num_threads) {
+      : num_threads_(options.num_threads), context_(options.context) {
+    CHECK(context_ != nullptr);
   }
 
   // SchurEliminatorBase Interface
   virtual ~SchurEliminator();
-  virtual void Init(int num_eliminate_blocks,
-                    const CompressedRowBlockStructure* bs);
-  virtual void Eliminate(const BlockSparseMatrix* A,
-                         const double* b,
-                         const double* D,
-                         BlockRandomAccessMatrix* lhs,
-                         double* rhs);
-  virtual void BackSubstitute(const BlockSparseMatrix* A,
-                              const double* b,
-                              const double* D,
-                              const double* z,
-                              double* y);
+  void Init(int num_eliminate_blocks,
+            bool assume_full_rank_ete,
+            const CompressedRowBlockStructure* bs) final;
+  void Eliminate(const BlockSparseMatrixData& A,
+                 const double* b,
+                 const double* D,
+                 BlockRandomAccessMatrix* lhs,
+                 double* rhs) final;
+  void BackSubstitute(const BlockSparseMatrixData& A,
+                      const double* b,
+                      const double* D,
+                      const double* z,
+                      double* y) final;
 
  private:
   // Chunk objects store combinatorial information needed to
@@ -273,7 +282,7 @@ class SchurEliminator : public SchurEliminatorBase {
 
   void ChunkDiagonalBlockAndGradient(
       const Chunk& chunk,
-      const BlockSparseMatrix* A,
+      const BlockSparseMatrixData& A,
       const double* b,
       int row_block_counter,
       typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix* eet,
@@ -282,33 +291,36 @@ class SchurEliminator : public SchurEliminatorBase {
       BlockRandomAccessMatrix* lhs);
 
   void UpdateRhs(const Chunk& chunk,
-                 const BlockSparseMatrix* A,
+                 const BlockSparseMatrixData& A,
                  const double* b,
                  int row_block_counter,
                  const double* inverse_ete_g,
                  double* rhs);
 
-  void ChunkOuterProduct(const CompressedRowBlockStructure* bs,
+  void ChunkOuterProduct(int thread_id,
+                         const CompressedRowBlockStructure* bs,
                          const Matrix& inverse_eet,
                          const double* buffer,
                          const BufferLayoutType& buffer_layout,
                          BlockRandomAccessMatrix* lhs);
-  void EBlockRowOuterProduct(const BlockSparseMatrix* A,
+  void EBlockRowOuterProduct(const BlockSparseMatrixData& A,
                              int row_block_index,
                              BlockRandomAccessMatrix* lhs);
 
+  void NoEBlockRowsUpdate(const BlockSparseMatrixData& A,
+                          const double* b,
+                          int row_block_counter,
+                          BlockRandomAccessMatrix* lhs,
+                          double* rhs);
 
-  void NoEBlockRowsUpdate(const BlockSparseMatrix* A,
-                             const double* b,
-                             int row_block_counter,
-                             BlockRandomAccessMatrix* lhs,
-                             double* rhs);
-
-  void NoEBlockRowOuterProduct(const BlockSparseMatrix* A,
+  void NoEBlockRowOuterProduct(const BlockSparseMatrixData& A,
                                int row_block_index,
                                BlockRandomAccessMatrix* lhs);
 
+  int num_threads_;
+  ContextImpl* context_;
   int num_eliminate_blocks_;
+  bool assume_full_rank_ete_;
 
   // Block layout of the columns of the reduced linear system. Since
   // the f blocks can be of varying size, this vector stores the
@@ -330,7 +342,7 @@ class SchurEliminator : public SchurEliminatorBase {
   //
   //   [thread_id * buffer_size_ , (thread_id + 1) * buffer_size_]
   //
-  scoped_array<double> buffer_;
+  std::unique_ptr<double[]> buffer_;
 
   // Buffer to store per thread matrix matrix products used by
   // ChunkOuterProduct. Like buffer_ it is of size num_threads *
@@ -338,15 +350,275 @@ class SchurEliminator : public SchurEliminatorBase {
   //
   //   [thread_id * buffer_size_ , (thread_id + 1) * buffer_size_ -1]
   //
-  scoped_array<double> chunk_outer_product_buffer_;
+  std::unique_ptr<double[]> chunk_outer_product_buffer_;
 
   int buffer_size_;
-  int num_threads_;
   int uneliminated_row_begins_;
 
   // Locks for the blocks in the right hand side of the reduced linear
   // system.
-  std::vector<Mutex*> rhs_locks_;
+  std::vector<std::mutex*> rhs_locks_;
+};
+
+// SchurEliminatorForOneFBlock specializes the SchurEliminatorBase interface for
+// the case where there is exactly one f-block and all three parameters
+// kRowBlockSize, kEBlockSize and KFBlockSize are known at compile time. This is
+// the case for some two view bundle adjustment problems which have very
+// stringent latency requirements.
+//
+// Under these assumptions, we can simplify the more general algorithm
+// implemented by SchurEliminatorImpl significantly. Two of the major
+// contributors to the increased performance are:
+//
+// 1. Simpler loop structure and less use of dynamic memory. Almost everything
+//    is on the stack and benefits from aligned memory as well as fixed sized
+//    vectorization. We are also able to reason about temporaries and control
+//    their lifetimes better.
+// 2. Use of inverse() over llt().solve(Identity).
+template <int kRowBlockSize = Eigen::Dynamic,
+          int kEBlockSize = Eigen::Dynamic,
+          int kFBlockSize = Eigen::Dynamic>
+class SchurEliminatorForOneFBlock : public SchurEliminatorBase {
+ public:
+  virtual ~SchurEliminatorForOneFBlock() {}
+  void Init(int num_eliminate_blocks,
+            bool assume_full_rank_ete,
+            const CompressedRowBlockStructure* bs) override {
+    CHECK_GT(num_eliminate_blocks, 0)
+        << "SchurComplementSolver cannot be initialized with "
+        << "num_eliminate_blocks = 0.";
+    CHECK_EQ(bs->cols.size() - num_eliminate_blocks, 1);
+
+    num_eliminate_blocks_ = num_eliminate_blocks;
+    const int num_row_blocks = bs->rows.size();
+    chunks_.clear();
+    int r = 0;
+    // Iterate over the row blocks of A, and detect the chunks. The
+    // matrix should already have been ordered so that all rows
+    // containing the same y block are vertically contiguous.
+    while (r < num_row_blocks) {
+      const int e_block_id = bs->rows[r].cells.front().block_id;
+      if (e_block_id >= num_eliminate_blocks_) {
+        break;
+      }
+
+      chunks_.push_back(Chunk());
+      Chunk& chunk = chunks_.back();
+      chunk.num_rows = 0;
+      chunk.start = r;
+      // Add to the chunk until the first block in the row is
+      // different than the one in the first row for the chunk.
+      while (r + chunk.num_rows < num_row_blocks) {
+        const CompressedRow& row = bs->rows[r + chunk.num_rows];
+        if (row.cells.front().block_id != e_block_id) {
+          break;
+        }
+        ++chunk.num_rows;
+      }
+      r += chunk.num_rows;
+    }
+
+    const Chunk& last_chunk = chunks_.back();
+    uneliminated_row_begins_ = last_chunk.start + last_chunk.num_rows;
+    e_t_e_inverse_matrices_.resize(kEBlockSize * kEBlockSize * chunks_.size());
+    std::fill(
+        e_t_e_inverse_matrices_.begin(), e_t_e_inverse_matrices_.end(), 0.0);
+  }
+
+  void Eliminate(const BlockSparseMatrixData& A,
+                 const double* b,
+                 const double* D,
+                 BlockRandomAccessMatrix* lhs_bram,
+                 double* rhs_ptr) override {
+    // Since there is only one f-block, we can call GetCell once, and cache its
+    // output.
+    int r, c, row_stride, col_stride;
+    CellInfo* cell_info =
+        lhs_bram->GetCell(0, 0, &r, &c, &row_stride, &col_stride);
+    typename EigenTypes<kFBlockSize, kFBlockSize>::MatrixRef lhs(
+        cell_info->values, kFBlockSize, kFBlockSize);
+    typename EigenTypes<kFBlockSize>::VectorRef rhs(rhs_ptr, kFBlockSize);
+
+    lhs.setZero();
+    rhs.setZero();
+
+    const CompressedRowBlockStructure* bs = A.block_structure();
+    const double* values = A.values();
+
+    // Add the diagonal to the schur complement.
+    if (D != nullptr) {
+      typename EigenTypes<kFBlockSize>::ConstVectorRef diag(
+          D + bs->cols[num_eliminate_blocks_].position, kFBlockSize);
+      lhs.diagonal() = diag.array().square().matrix();
+    }
+
+    Eigen::Matrix<double, kEBlockSize, kFBlockSize> tmp;
+    Eigen::Matrix<double, kEBlockSize, 1> tmp2;
+
+    // The following loop works on a block matrix which looks as follows
+    // (number of rows can be anything):
+    //
+    // [e_1 | f_1] = [b1]
+    // [e_2 | f_2] = [b2]
+    // [e_3 | f_3] = [b3]
+    // [e_4 | f_4] = [b4]
+    //
+    // and computes the following
+    //
+    // e_t_e = sum_i e_i^T * e_i
+    // e_t_e_inverse = inverse(e_t_e)
+    // e_t_f = sum_i e_i^T f_i
+    // e_t_b = sum_i e_i^T b_i
+    // f_t_b = sum_i f_i^T b_i
+    //
+    // lhs += sum_i f_i^T * f_i - e_t_f^T * e_t_e_inverse * e_t_f
+    // rhs += f_t_b - e_t_f^T * e_t_e_inverse * e_t_b
+    for (int i = 0; i < chunks_.size(); ++i) {
+      const Chunk& chunk = chunks_[i];
+      const int e_block_id = bs->rows[chunk.start].cells.front().block_id;
+
+      // Naming covention, e_t_e = e_block.transpose() * e_block;
+      Eigen::Matrix<double, kEBlockSize, kEBlockSize> e_t_e;
+      Eigen::Matrix<double, kEBlockSize, kFBlockSize> e_t_f;
+      Eigen::Matrix<double, kEBlockSize, 1> e_t_b;
+      Eigen::Matrix<double, kFBlockSize, 1> f_t_b;
+
+      // Add the square of the diagonal to e_t_e.
+      if (D != NULL) {
+        const typename EigenTypes<kEBlockSize>::ConstVectorRef diag(
+            D + bs->cols[e_block_id].position, kEBlockSize);
+        e_t_e = diag.array().square().matrix().asDiagonal();
+      } else {
+        e_t_e.setZero();
+      }
+      e_t_f.setZero();
+      e_t_b.setZero();
+      f_t_b.setZero();
+
+      for (int j = 0; j < chunk.num_rows; ++j) {
+        const int row_id = chunk.start + j;
+        const auto& row = bs->rows[row_id];
+        const typename EigenTypes<kRowBlockSize, kEBlockSize>::ConstMatrixRef
+            e_block(values + row.cells[0].position, kRowBlockSize, kEBlockSize);
+        const typename EigenTypes<kRowBlockSize>::ConstVectorRef b_block(
+            b + row.block.position, kRowBlockSize);
+
+        e_t_e.noalias() += e_block.transpose() * e_block;
+        e_t_b.noalias() += e_block.transpose() * b_block;
+
+        if (row.cells.size() == 1) {
+          // There is no f block, so there is nothing more to do.
+          continue;
+        }
+
+        const typename EigenTypes<kRowBlockSize, kFBlockSize>::ConstMatrixRef
+            f_block(values + row.cells[1].position, kRowBlockSize, kFBlockSize);
+        e_t_f.noalias() += e_block.transpose() * f_block;
+        lhs.noalias() += f_block.transpose() * f_block;
+        f_t_b.noalias() += f_block.transpose() * b_block;
+      }
+
+      // BackSubstitute computes the same inverse, and this is the key workload
+      // there, so caching these inverses makes BackSubstitute essentially free.
+      typename EigenTypes<kEBlockSize, kEBlockSize>::MatrixRef e_t_e_inverse(
+          &e_t_e_inverse_matrices_[kEBlockSize * kEBlockSize * i],
+          kEBlockSize,
+          kEBlockSize);
+
+      // e_t_e is a symmetric positive definite matrix, so the standard way to
+      // compute its inverse is via the Cholesky factorization by calling
+      // e_t_e.llt().solve(Identity()). However, the inverse() method even
+      // though it is not optimized for symmetric matrices is significantly
+      // faster for small fixed size (up to 4x4) matrices.
+      //
+      // https://eigen.tuxfamily.org/dox/group__TutorialLinearAlgebra.html#title3
+      e_t_e_inverse.noalias() = e_t_e.inverse();
+
+      // The use of these two pre-allocated tmp vectors saves temporaries in the
+      // expressions for lhs and rhs updates below and has a significant impact
+      // on the performance of this method.
+      tmp.noalias() = e_t_e_inverse * e_t_f;
+      tmp2.noalias() = e_t_e_inverse * e_t_b;
+
+      lhs.noalias() -= e_t_f.transpose() * tmp;
+      rhs.noalias() += f_t_b - e_t_f.transpose() * tmp2;
+    }
+
+    // The rows without any e-blocks can have arbitrary size but only contain
+    // the f-block.
+    //
+    // lhs += f_i^T f_i
+    // rhs += f_i^T b_i
+    for (int row_id = uneliminated_row_begins_; row_id < bs->rows.size();
+         ++row_id) {
+      const auto& row = bs->rows[row_id];
+      const auto& cell = row.cells[0];
+      const typename EigenTypes<Eigen::Dynamic, kFBlockSize>::ConstMatrixRef
+          f_block(values + cell.position, row.block.size, kFBlockSize);
+      const typename EigenTypes<Eigen::Dynamic>::ConstVectorRef b_block(
+          b + row.block.position, row.block.size);
+      lhs.noalias() += f_block.transpose() * f_block;
+      rhs.noalias() += f_block.transpose() * b_block;
+    }
+  }
+
+  // This implementation of BackSubstitute depends on Eliminate being called
+  // before this. SchurComplementSolver always does this.
+  //
+  // y_i = e_t_e_inverse * sum_i e_i^T * (b_i - f_i * z);
+  void BackSubstitute(const BlockSparseMatrixData& A,
+                      const double* b,
+                      const double* D,
+                      const double* z_ptr,
+                      double* y) override {
+    typename EigenTypes<kFBlockSize>::ConstVectorRef z(z_ptr, kFBlockSize);
+    const CompressedRowBlockStructure* bs = A.block_structure();
+    const double* values = A.values();
+    Eigen::Matrix<double, kEBlockSize, 1> tmp;
+    for (int i = 0; i < chunks_.size(); ++i) {
+      const Chunk& chunk = chunks_[i];
+      const int e_block_id = bs->rows[chunk.start].cells.front().block_id;
+      tmp.setZero();
+      for (int j = 0; j < chunk.num_rows; ++j) {
+        const int row_id = chunk.start + j;
+        const auto& row = bs->rows[row_id];
+        const typename EigenTypes<kRowBlockSize, kEBlockSize>::ConstMatrixRef
+            e_block(values + row.cells[0].position, kRowBlockSize, kEBlockSize);
+        const typename EigenTypes<kRowBlockSize>::ConstVectorRef b_block(
+            b + row.block.position, kRowBlockSize);
+
+        if (row.cells.size() == 1) {
+          // There is no f block.
+          tmp += e_block.transpose() * b_block;
+        } else {
+          typename EigenTypes<kRowBlockSize, kFBlockSize>::ConstMatrixRef
+              f_block(
+                  values + row.cells[1].position, kRowBlockSize, kFBlockSize);
+          tmp += e_block.transpose() * (b_block - f_block * z);
+        }
+      }
+
+      typename EigenTypes<kEBlockSize, kEBlockSize>::MatrixRef e_t_e_inverse(
+          &e_t_e_inverse_matrices_[kEBlockSize * kEBlockSize * i],
+          kEBlockSize,
+          kEBlockSize);
+
+      typename EigenTypes<kEBlockSize>::VectorRef y_block(
+          y + bs->cols[e_block_id].position, kEBlockSize);
+      y_block.noalias() = e_t_e_inverse * tmp;
+    }
+  }
+
+ private:
+  struct Chunk {
+    int start = 0;
+    int num_rows = 0;
+  };
+
+  std::vector<Chunk> chunks_;
+  int num_eliminate_blocks_;
+  int uneliminated_row_begins_;
+  std::vector<double> e_t_e_inverse_matrices_;
 };
 
 }  // namespace internal