diff options
Diffstat (limited to 'extern/ceres/internal/ceres/schur_eliminator.h')
-rw-r--r-- | extern/ceres/internal/ceres/schur_eliminator.h | 352 |
1 files changed, 312 insertions, 40 deletions
diff --git a/extern/ceres/internal/ceres/schur_eliminator.h b/extern/ceres/internal/ceres/schur_eliminator.h index 761b58adc7f..66fcb4d58e8 100644 --- a/extern/ceres/internal/ceres/schur_eliminator.h +++ b/extern/ceres/internal/ceres/schur_eliminator.h @@ -1,5 +1,5 @@ // Ceres Solver - A fast non-linear least squares minimizer -// Copyright 2015 Google Inc. All rights reserved. +// Copyright 2019 Google Inc. All rights reserved. // http://ceres-solver.org/ // // Redistribution and use in source and binary forms, with or without @@ -32,14 +32,17 @@ #define CERES_INTERNAL_SCHUR_ELIMINATOR_H_ #include <map> +#include <memory> +#include <mutex> #include <vector> -#include "ceres/mutex.h" + +#include "Eigen/Dense" #include "ceres/block_random_access_matrix.h" #include "ceres/block_sparse_matrix.h" #include "ceres/block_structure.h" -#include "ceres/linear_solver.h" #include "ceres/internal/eigen.h" -#include "ceres/internal/scoped_ptr.h" +#include "ceres/internal/port.h" +#include "ceres/linear_solver.h" namespace ceres { namespace internal { @@ -50,7 +53,7 @@ namespace internal { // // E y + F z = b // -// Where x = [y;z] is a partition of the variables. The paritioning +// Where x = [y;z] is a partition of the variables. The partitioning // of the variables is such that, E'E is a block diagonal matrix. Or // in other words, the parameter blocks in E form an independent set // of the of the graph implied by the block matrix A'A. Then, this @@ -147,7 +150,7 @@ namespace internal { // Where the sum is over chunks and E_k'E_k is dense matrix of size y1 // x y1. // -// Advanced usage. Uptil now it has been assumed that the user would +// Advanced usage. Until now it has been assumed that the user would // be interested in all of the Schur Complement S. However, it is also // possible to use this eliminator to obtain an arbitrary submatrix of // the full Schur complement. When the eliminator is generating the @@ -171,7 +174,14 @@ class SchurEliminatorBase { // CompressedRowBlockStructure object passed to this method is the // same one (or is equivalent to) the one associated with the // BlockSparseMatrix objects below. + // + // assume_full_rank_ete controls how the eliminator inverts with the + // diagonal blocks corresponding to e blocks in A'A. If + // assume_full_rank_ete is true, then a Cholesky factorization is + // used to compute the inverse, otherwise a singular value + // decomposition is used to compute the pseudo inverse. virtual void Init(int num_eliminate_blocks, + bool assume_full_rank_ete, const CompressedRowBlockStructure* bs) = 0; // Compute the Schur complement system from the augmented linear @@ -185,7 +195,7 @@ class SchurEliminatorBase { // // Since the Schur complement is a symmetric matrix, only the upper // triangular part of the Schur complement is computed. - virtual void Eliminate(const BlockSparseMatrix* A, + virtual void Eliminate(const BlockSparseMatrixData& A, const double* b, const double* D, BlockRandomAccessMatrix* lhs, @@ -194,7 +204,7 @@ class SchurEliminatorBase { // Given values for the variables z in the F block of A, solve for // the optimal values of the variables y corresponding to the E // block in A. - virtual void BackSubstitute(const BlockSparseMatrix* A, + virtual void BackSubstitute(const BlockSparseMatrixData& A, const double* b, const double* D, const double* z, @@ -210,32 +220,31 @@ class SchurEliminatorBase { // parameters as template arguments so that they are visible to the // compiler and can be used for compile time optimization of the low // level linear algebra routines. -// -// This implementation is mulithreaded using OpenMP. The level of -// parallelism is controlled by LinearSolver::Options::num_threads. template <int kRowBlockSize = Eigen::Dynamic, int kEBlockSize = Eigen::Dynamic, - int kFBlockSize = Eigen::Dynamic > + int kFBlockSize = Eigen::Dynamic> class SchurEliminator : public SchurEliminatorBase { public: explicit SchurEliminator(const LinearSolver::Options& options) - : num_threads_(options.num_threads) { + : num_threads_(options.num_threads), context_(options.context) { + CHECK(context_ != nullptr); } // SchurEliminatorBase Interface virtual ~SchurEliminator(); - virtual void Init(int num_eliminate_blocks, - const CompressedRowBlockStructure* bs); - virtual void Eliminate(const BlockSparseMatrix* A, - const double* b, - const double* D, - BlockRandomAccessMatrix* lhs, - double* rhs); - virtual void BackSubstitute(const BlockSparseMatrix* A, - const double* b, - const double* D, - const double* z, - double* y); + void Init(int num_eliminate_blocks, + bool assume_full_rank_ete, + const CompressedRowBlockStructure* bs) final; + void Eliminate(const BlockSparseMatrixData& A, + const double* b, + const double* D, + BlockRandomAccessMatrix* lhs, + double* rhs) final; + void BackSubstitute(const BlockSparseMatrixData& A, + const double* b, + const double* D, + const double* z, + double* y) final; private: // Chunk objects store combinatorial information needed to @@ -273,7 +282,7 @@ class SchurEliminator : public SchurEliminatorBase { void ChunkDiagonalBlockAndGradient( const Chunk& chunk, - const BlockSparseMatrix* A, + const BlockSparseMatrixData& A, const double* b, int row_block_counter, typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix* eet, @@ -282,33 +291,36 @@ class SchurEliminator : public SchurEliminatorBase { BlockRandomAccessMatrix* lhs); void UpdateRhs(const Chunk& chunk, - const BlockSparseMatrix* A, + const BlockSparseMatrixData& A, const double* b, int row_block_counter, const double* inverse_ete_g, double* rhs); - void ChunkOuterProduct(const CompressedRowBlockStructure* bs, + void ChunkOuterProduct(int thread_id, + const CompressedRowBlockStructure* bs, const Matrix& inverse_eet, const double* buffer, const BufferLayoutType& buffer_layout, BlockRandomAccessMatrix* lhs); - void EBlockRowOuterProduct(const BlockSparseMatrix* A, + void EBlockRowOuterProduct(const BlockSparseMatrixData& A, int row_block_index, BlockRandomAccessMatrix* lhs); + void NoEBlockRowsUpdate(const BlockSparseMatrixData& A, + const double* b, + int row_block_counter, + BlockRandomAccessMatrix* lhs, + double* rhs); - void NoEBlockRowsUpdate(const BlockSparseMatrix* A, - const double* b, - int row_block_counter, - BlockRandomAccessMatrix* lhs, - double* rhs); - - void NoEBlockRowOuterProduct(const BlockSparseMatrix* A, + void NoEBlockRowOuterProduct(const BlockSparseMatrixData& A, int row_block_index, BlockRandomAccessMatrix* lhs); + int num_threads_; + ContextImpl* context_; int num_eliminate_blocks_; + bool assume_full_rank_ete_; // Block layout of the columns of the reduced linear system. Since // the f blocks can be of varying size, this vector stores the @@ -330,7 +342,7 @@ class SchurEliminator : public SchurEliminatorBase { // // [thread_id * buffer_size_ , (thread_id + 1) * buffer_size_] // - scoped_array<double> buffer_; + std::unique_ptr<double[]> buffer_; // Buffer to store per thread matrix matrix products used by // ChunkOuterProduct. Like buffer_ it is of size num_threads * @@ -338,15 +350,275 @@ class SchurEliminator : public SchurEliminatorBase { // // [thread_id * buffer_size_ , (thread_id + 1) * buffer_size_ -1] // - scoped_array<double> chunk_outer_product_buffer_; + std::unique_ptr<double[]> chunk_outer_product_buffer_; int buffer_size_; - int num_threads_; int uneliminated_row_begins_; // Locks for the blocks in the right hand side of the reduced linear // system. - std::vector<Mutex*> rhs_locks_; + std::vector<std::mutex*> rhs_locks_; +}; + +// SchurEliminatorForOneFBlock specializes the SchurEliminatorBase interface for +// the case where there is exactly one f-block and all three parameters +// kRowBlockSize, kEBlockSize and KFBlockSize are known at compile time. This is +// the case for some two view bundle adjustment problems which have very +// stringent latency requirements. +// +// Under these assumptions, we can simplify the more general algorithm +// implemented by SchurEliminatorImpl significantly. Two of the major +// contributors to the increased performance are: +// +// 1. Simpler loop structure and less use of dynamic memory. Almost everything +// is on the stack and benefits from aligned memory as well as fixed sized +// vectorization. We are also able to reason about temporaries and control +// their lifetimes better. +// 2. Use of inverse() over llt().solve(Identity). +template <int kRowBlockSize = Eigen::Dynamic, + int kEBlockSize = Eigen::Dynamic, + int kFBlockSize = Eigen::Dynamic> +class SchurEliminatorForOneFBlock : public SchurEliminatorBase { + public: + virtual ~SchurEliminatorForOneFBlock() {} + void Init(int num_eliminate_blocks, + bool assume_full_rank_ete, + const CompressedRowBlockStructure* bs) override { + CHECK_GT(num_eliminate_blocks, 0) + << "SchurComplementSolver cannot be initialized with " + << "num_eliminate_blocks = 0."; + CHECK_EQ(bs->cols.size() - num_eliminate_blocks, 1); + + num_eliminate_blocks_ = num_eliminate_blocks; + const int num_row_blocks = bs->rows.size(); + chunks_.clear(); + int r = 0; + // Iterate over the row blocks of A, and detect the chunks. The + // matrix should already have been ordered so that all rows + // containing the same y block are vertically contiguous. + while (r < num_row_blocks) { + const int e_block_id = bs->rows[r].cells.front().block_id; + if (e_block_id >= num_eliminate_blocks_) { + break; + } + + chunks_.push_back(Chunk()); + Chunk& chunk = chunks_.back(); + chunk.num_rows = 0; + chunk.start = r; + // Add to the chunk until the first block in the row is + // different than the one in the first row for the chunk. + while (r + chunk.num_rows < num_row_blocks) { + const CompressedRow& row = bs->rows[r + chunk.num_rows]; + if (row.cells.front().block_id != e_block_id) { + break; + } + ++chunk.num_rows; + } + r += chunk.num_rows; + } + + const Chunk& last_chunk = chunks_.back(); + uneliminated_row_begins_ = last_chunk.start + last_chunk.num_rows; + e_t_e_inverse_matrices_.resize(kEBlockSize * kEBlockSize * chunks_.size()); + std::fill( + e_t_e_inverse_matrices_.begin(), e_t_e_inverse_matrices_.end(), 0.0); + } + + void Eliminate(const BlockSparseMatrixData& A, + const double* b, + const double* D, + BlockRandomAccessMatrix* lhs_bram, + double* rhs_ptr) override { + // Since there is only one f-block, we can call GetCell once, and cache its + // output. + int r, c, row_stride, col_stride; + CellInfo* cell_info = + lhs_bram->GetCell(0, 0, &r, &c, &row_stride, &col_stride); + typename EigenTypes<kFBlockSize, kFBlockSize>::MatrixRef lhs( + cell_info->values, kFBlockSize, kFBlockSize); + typename EigenTypes<kFBlockSize>::VectorRef rhs(rhs_ptr, kFBlockSize); + + lhs.setZero(); + rhs.setZero(); + + const CompressedRowBlockStructure* bs = A.block_structure(); + const double* values = A.values(); + + // Add the diagonal to the schur complement. + if (D != nullptr) { + typename EigenTypes<kFBlockSize>::ConstVectorRef diag( + D + bs->cols[num_eliminate_blocks_].position, kFBlockSize); + lhs.diagonal() = diag.array().square().matrix(); + } + + Eigen::Matrix<double, kEBlockSize, kFBlockSize> tmp; + Eigen::Matrix<double, kEBlockSize, 1> tmp2; + + // The following loop works on a block matrix which looks as follows + // (number of rows can be anything): + // + // [e_1 | f_1] = [b1] + // [e_2 | f_2] = [b2] + // [e_3 | f_3] = [b3] + // [e_4 | f_4] = [b4] + // + // and computes the following + // + // e_t_e = sum_i e_i^T * e_i + // e_t_e_inverse = inverse(e_t_e) + // e_t_f = sum_i e_i^T f_i + // e_t_b = sum_i e_i^T b_i + // f_t_b = sum_i f_i^T b_i + // + // lhs += sum_i f_i^T * f_i - e_t_f^T * e_t_e_inverse * e_t_f + // rhs += f_t_b - e_t_f^T * e_t_e_inverse * e_t_b + for (int i = 0; i < chunks_.size(); ++i) { + const Chunk& chunk = chunks_[i]; + const int e_block_id = bs->rows[chunk.start].cells.front().block_id; + + // Naming covention, e_t_e = e_block.transpose() * e_block; + Eigen::Matrix<double, kEBlockSize, kEBlockSize> e_t_e; + Eigen::Matrix<double, kEBlockSize, kFBlockSize> e_t_f; + Eigen::Matrix<double, kEBlockSize, 1> e_t_b; + Eigen::Matrix<double, kFBlockSize, 1> f_t_b; + + // Add the square of the diagonal to e_t_e. + if (D != NULL) { + const typename EigenTypes<kEBlockSize>::ConstVectorRef diag( + D + bs->cols[e_block_id].position, kEBlockSize); + e_t_e = diag.array().square().matrix().asDiagonal(); + } else { + e_t_e.setZero(); + } + e_t_f.setZero(); + e_t_b.setZero(); + f_t_b.setZero(); + + for (int j = 0; j < chunk.num_rows; ++j) { + const int row_id = chunk.start + j; + const auto& row = bs->rows[row_id]; + const typename EigenTypes<kRowBlockSize, kEBlockSize>::ConstMatrixRef + e_block(values + row.cells[0].position, kRowBlockSize, kEBlockSize); + const typename EigenTypes<kRowBlockSize>::ConstVectorRef b_block( + b + row.block.position, kRowBlockSize); + + e_t_e.noalias() += e_block.transpose() * e_block; + e_t_b.noalias() += e_block.transpose() * b_block; + + if (row.cells.size() == 1) { + // There is no f block, so there is nothing more to do. + continue; + } + + const typename EigenTypes<kRowBlockSize, kFBlockSize>::ConstMatrixRef + f_block(values + row.cells[1].position, kRowBlockSize, kFBlockSize); + e_t_f.noalias() += e_block.transpose() * f_block; + lhs.noalias() += f_block.transpose() * f_block; + f_t_b.noalias() += f_block.transpose() * b_block; + } + + // BackSubstitute computes the same inverse, and this is the key workload + // there, so caching these inverses makes BackSubstitute essentially free. + typename EigenTypes<kEBlockSize, kEBlockSize>::MatrixRef e_t_e_inverse( + &e_t_e_inverse_matrices_[kEBlockSize * kEBlockSize * i], + kEBlockSize, + kEBlockSize); + + // e_t_e is a symmetric positive definite matrix, so the standard way to + // compute its inverse is via the Cholesky factorization by calling + // e_t_e.llt().solve(Identity()). However, the inverse() method even + // though it is not optimized for symmetric matrices is significantly + // faster for small fixed size (up to 4x4) matrices. + // + // https://eigen.tuxfamily.org/dox/group__TutorialLinearAlgebra.html#title3 + e_t_e_inverse.noalias() = e_t_e.inverse(); + + // The use of these two pre-allocated tmp vectors saves temporaries in the + // expressions for lhs and rhs updates below and has a significant impact + // on the performance of this method. + tmp.noalias() = e_t_e_inverse * e_t_f; + tmp2.noalias() = e_t_e_inverse * e_t_b; + + lhs.noalias() -= e_t_f.transpose() * tmp; + rhs.noalias() += f_t_b - e_t_f.transpose() * tmp2; + } + + // The rows without any e-blocks can have arbitrary size but only contain + // the f-block. + // + // lhs += f_i^T f_i + // rhs += f_i^T b_i + for (int row_id = uneliminated_row_begins_; row_id < bs->rows.size(); + ++row_id) { + const auto& row = bs->rows[row_id]; + const auto& cell = row.cells[0]; + const typename EigenTypes<Eigen::Dynamic, kFBlockSize>::ConstMatrixRef + f_block(values + cell.position, row.block.size, kFBlockSize); + const typename EigenTypes<Eigen::Dynamic>::ConstVectorRef b_block( + b + row.block.position, row.block.size); + lhs.noalias() += f_block.transpose() * f_block; + rhs.noalias() += f_block.transpose() * b_block; + } + } + + // This implementation of BackSubstitute depends on Eliminate being called + // before this. SchurComplementSolver always does this. + // + // y_i = e_t_e_inverse * sum_i e_i^T * (b_i - f_i * z); + void BackSubstitute(const BlockSparseMatrixData& A, + const double* b, + const double* D, + const double* z_ptr, + double* y) override { + typename EigenTypes<kFBlockSize>::ConstVectorRef z(z_ptr, kFBlockSize); + const CompressedRowBlockStructure* bs = A.block_structure(); + const double* values = A.values(); + Eigen::Matrix<double, kEBlockSize, 1> tmp; + for (int i = 0; i < chunks_.size(); ++i) { + const Chunk& chunk = chunks_[i]; + const int e_block_id = bs->rows[chunk.start].cells.front().block_id; + tmp.setZero(); + for (int j = 0; j < chunk.num_rows; ++j) { + const int row_id = chunk.start + j; + const auto& row = bs->rows[row_id]; + const typename EigenTypes<kRowBlockSize, kEBlockSize>::ConstMatrixRef + e_block(values + row.cells[0].position, kRowBlockSize, kEBlockSize); + const typename EigenTypes<kRowBlockSize>::ConstVectorRef b_block( + b + row.block.position, kRowBlockSize); + + if (row.cells.size() == 1) { + // There is no f block. + tmp += e_block.transpose() * b_block; + } else { + typename EigenTypes<kRowBlockSize, kFBlockSize>::ConstMatrixRef + f_block( + values + row.cells[1].position, kRowBlockSize, kFBlockSize); + tmp += e_block.transpose() * (b_block - f_block * z); + } + } + + typename EigenTypes<kEBlockSize, kEBlockSize>::MatrixRef e_t_e_inverse( + &e_t_e_inverse_matrices_[kEBlockSize * kEBlockSize * i], + kEBlockSize, + kEBlockSize); + + typename EigenTypes<kEBlockSize>::VectorRef y_block( + y + bs->cols[e_block_id].position, kEBlockSize); + y_block.noalias() = e_t_e_inverse * tmp; + } + } + + private: + struct Chunk { + int start = 0; + int num_rows = 0; + }; + + std::vector<Chunk> chunks_; + int num_eliminate_blocks_; + int uneliminated_row_begins_; + std::vector<double> e_t_e_inverse_matrices_; }; } // namespace internal |