// Ceres Solver - A fast non-linear least squares minimizer // Copyright 2022 Google Inc. All rights reserved. // http://ceres-solver.org/ // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // * Neither the name of Google Inc. nor the names of its contributors may be // used to endorse or promote products derived from this software without // specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. // // Author: sameeragarwal@google.com (Sameer Agarwal) #include "ceres/dense_qr.h" #include #include #include #ifndef CERES_NO_CUDA #include "ceres/context_impl.h" #include "cublas_v2.h" #include "cusolverDn.h" #endif // CERES_NO_CUDA #ifndef CERES_NO_LAPACK // LAPACK routines for solving a linear least squares problem using QR // factorization. This is done in three stages: // // A * x = b // Q * R * x = b (dgeqrf) // R * x = Q' * b (dormqr) // x = R^{-1} * Q'* b (dtrtrs) // clang-format off // Compute the QR factorization of a. // // a is an m x n column major matrix (Denoted by "A" in the above description) // lda is the leading dimension of a. lda >= max(1, num_rows) // tau is an array of size min(m,n). It contains the scalar factors of the // elementary reflectors. // work is an array of size max(1,lwork). On exit, if info=0, work[0] contains // the optimal size of work. // // if lwork >= 1 it is the size of work. If lwork = -1, then a workspace query is assumed. // dgeqrf computes the optimal size of the work array and returns it as work[0]. // // info = 0, successful exit. // info < 0, if info = -i, then the i^th argument had illegal value. extern "C" void dgeqrf_(const int* m, const int* n, double* a, const int* lda, double* tau, double* work, const int* lwork, int* info); // Apply Q or Q' to b. // // b is a m times n column major matrix. // size = 'L' applies Q or Q' on the left, size = 'R' applies Q or Q' on the right. // trans = 'N', applies Q, trans = 'T', applies Q'. // k is the number of elementary reflectors whose product defines the matrix Q. // If size = 'L', m >= k >= 0 and if side = 'R', n >= k >= 0. // a is an lda x k column major matrix containing the reflectors as returned by dgeqrf. // ldb is the leading dimension of b. // work is an array of size max(1, lwork) // lwork if positive is the size of work. If lwork = -1, then a // workspace query is assumed. // // info = 0, successful exit. // info < 0, if info = -i, then the i^th argument had illegal value. extern "C" void dormqr_(const char* side, const char* trans, const int* m, const int* n ,const int* k, double* a, const int* lda, double* tau, double* b, const int* ldb, double* work, const int* lwork, int* info); // Solve a triangular system of the form A * x = b // // uplo = 'U', A is upper triangular. uplo = 'L' is lower triangular. // trans = 'N', 'T', 'C' specifies the form - A, A^T, A^H. // DIAG = 'N', A is not unit triangular. 'U' is unit triangular. // n is the order of the matrix A. // nrhs number of columns of b. // a is a column major lda x n. // b is a column major matrix of ldb x nrhs // // info = 0 succesful. // = -i < 0 i^th argument is an illegal value. // = i > 0, i^th diagonal element of A is zero. extern "C" void dtrtrs_(const char* uplo, const char* trans, const char* diag, const int* n, const int* nrhs, double* a, const int* lda, double* b, const int* ldb, int* info); // clang-format on #endif namespace ceres { namespace internal { DenseQR::~DenseQR() = default; std::unique_ptr DenseQR::Create(const LinearSolver::Options& options) { std::unique_ptr dense_qr; switch (options.dense_linear_algebra_library_type) { case EIGEN: dense_qr = std::make_unique(); break; case LAPACK: #ifndef CERES_NO_LAPACK dense_qr = std::make_unique(); break; #else LOG(FATAL) << "Ceres was compiled without support for LAPACK."; #endif case CUDA: #ifndef CERES_NO_CUDA dense_qr = CUDADenseQR::Create(options); break; #else LOG(FATAL) << "Ceres was compiled without support for CUDA."; #endif default: LOG(FATAL) << "Unknown dense linear algebra library type : " << DenseLinearAlgebraLibraryTypeToString( options.dense_linear_algebra_library_type); } return dense_qr; } LinearSolverTerminationType DenseQR::FactorAndSolve(int num_rows, int num_cols, double* lhs, const double* rhs, double* solution, std::string* message) { LinearSolverTerminationType termination_type = Factorize(num_rows, num_cols, lhs, message); if (termination_type == LINEAR_SOLVER_SUCCESS) { termination_type = Solve(rhs, solution, message); } return termination_type; } LinearSolverTerminationType EigenDenseQR::Factorize(int num_rows, int num_cols, double* lhs, std::string* message) { Eigen::Map m(lhs, num_rows, num_cols); qr_ = std::make_unique(m); *message = "Success."; return LINEAR_SOLVER_SUCCESS; } LinearSolverTerminationType EigenDenseQR::Solve(const double* rhs, double* solution, std::string* message) { VectorRef(solution, qr_->cols()) = qr_->solve(ConstVectorRef(rhs, qr_->rows())); *message = "Success."; return LINEAR_SOLVER_SUCCESS; } #ifndef CERES_NO_LAPACK LinearSolverTerminationType LAPACKDenseQR::Factorize(int num_rows, int num_cols, double* lhs, std::string* message) { int lwork = -1; double work_size; int info = 0; // Compute the size of the temporary workspace needed to compute the QR // factorization in the dgeqrf call below. dgeqrf_(&num_rows, &num_cols, lhs_, &num_rows, tau_.data(), &work_size, &lwork, &info); if (info < 0) { LOG(FATAL) << "Congratulations, you found a bug in Ceres." << "Please report it." << "LAPACK::dgels fatal error." << "Argument: " << -info << " is invalid."; } lhs_ = lhs; num_rows_ = num_rows; num_cols_ = num_cols; lwork = static_cast(work_size); if (work_.size() < lwork) { work_.resize(lwork); } if (tau_.size() < num_cols) { tau_.resize(num_cols); } if (q_transpose_rhs_.size() < num_rows) { q_transpose_rhs_.resize(num_rows); } // Factorize the lhs_ using the workspace that we just constructed above. dgeqrf_(&num_rows, &num_cols, lhs_, &num_rows, tau_.data(), work_.data(), &lwork, &info); if (info < 0) { LOG(FATAL) << "Congratulations, you found a bug in Ceres." << "Please report it. dgeqrf fatal error." << "Argument: " << -info << " is invalid."; } termination_type_ = LINEAR_SOLVER_SUCCESS; *message = "Success."; return termination_type_; } LinearSolverTerminationType LAPACKDenseQR::Solve(const double* rhs, double* solution, std::string* message) { if (termination_type_ != LINEAR_SOLVER_SUCCESS) { *message = "QR factorization failed and solve called."; return termination_type_; } std::copy_n(rhs, num_rows_, q_transpose_rhs_.data()); const char side = 'L'; char trans = 'T'; const int num_c_cols = 1; const int lwork = work_.size(); int info = 0; dormqr_(&side, &trans, &num_rows_, &num_c_cols, &num_cols_, lhs_, &num_rows_, tau_.data(), q_transpose_rhs_.data(), &num_rows_, work_.data(), &lwork, &info); if (info < 0) { LOG(FATAL) << "Congratulations, you found a bug in Ceres." << "Please report it. dormr fatal error." << "Argument: " << -info << " is invalid."; } const char uplo = 'U'; trans = 'N'; const char diag = 'N'; dtrtrs_(&uplo, &trans, &diag, &num_cols_, &num_c_cols, lhs_, &num_rows_, q_transpose_rhs_.data(), &num_rows_, &info); if (info < 0) { LOG(FATAL) << "Congratulations, you found a bug in Ceres." << "Please report it. dormr fatal error." << "Argument: " << -info << " is invalid."; } else if (info > 0) { *message = "QR factorization failure. The factorization is not full rank. R has " "zeros on the diagonal."; termination_type_ = LINEAR_SOLVER_FAILURE; } else { std::copy_n(q_transpose_rhs_.data(), num_cols_, solution); termination_type_ = LINEAR_SOLVER_SUCCESS; } return termination_type_; } #endif // CERES_NO_LAPACK #ifndef CERES_NO_CUDA bool CUDADenseQR::Init(ContextImpl* context, std::string* message) { if (!context->InitCUDA(message)) { return false; } cublas_handle_ = context->cublas_handle_; cusolver_handle_ = context->cusolver_handle_; stream_ = context->stream_; error_.Reserve(1); *message = "CUDADenseQR::Init Success."; return true; } LinearSolverTerminationType CUDADenseQR::Factorize(int num_rows, int num_cols, double* lhs, std::string* message) { factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; lhs_.Reserve(num_rows * num_cols); tau_.Reserve(std::min(num_rows, num_cols)); num_rows_ = num_rows; num_cols_ = num_cols; lhs_.CopyToGpuAsync(lhs, num_rows * num_cols, stream_); int device_workspace_size = 0; if (cusolverDnDgeqrf_bufferSize(cusolver_handle_, num_rows, num_cols, lhs_.data(), num_rows, &device_workspace_size) != CUSOLVER_STATUS_SUCCESS) { *message = "cuSolverDN::cusolverDnDgeqrf_bufferSize failed."; return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; } device_workspace_.Reserve(device_workspace_size); if (cusolverDnDgeqrf(cusolver_handle_, num_rows, num_cols, lhs_.data(), num_rows, tau_.data(), reinterpret_cast(device_workspace_.data()), device_workspace_.size(), error_.data()) != CUSOLVER_STATUS_SUCCESS) { *message = "cuSolverDN::cusolverDnDgeqrf failed."; return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; } if (cudaDeviceSynchronize() != cudaSuccess || cudaStreamSynchronize(stream_) != cudaSuccess) { *message = "Cuda device synchronization failed."; return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; } int error = 0; error_.CopyToHost(&error, 1); if (error < 0) { LOG(FATAL) << "Congratulations, you found a bug in Ceres - " << "please report it. " << "cuSolverDN::cusolverDnDgeqrf fatal error. " << "Argument: " << -error << " is invalid."; // The following line is unreachable, but return failure just to be // pedantic, since the compiler does not know that. return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; } *message = "Success"; factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS; return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS; } LinearSolverTerminationType CUDADenseQR::Solve(const double* rhs, double* solution, std::string* message) { if (factorize_result_ != LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS) { *message = "Factorize did not complete succesfully previously."; return factorize_result_; } rhs_.CopyToGpuAsync(rhs, num_rows_, stream_); int device_workspace_size = 0; if (cusolverDnDormqr_bufferSize(cusolver_handle_, CUBLAS_SIDE_LEFT, CUBLAS_OP_T, num_rows_, 1, num_cols_, lhs_.data(), num_rows_, tau_.data(), rhs_.data(), num_rows_, &device_workspace_size) != CUSOLVER_STATUS_SUCCESS) { *message = "cuSolverDN::cusolverDnDormqr_bufferSize failed."; return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; } device_workspace_.Reserve(device_workspace_size); // Compute rhs = Q^T * rhs, assuming that lhs has already been factorized. // The result of factorization would have stored Q in a packed form in lhs_. if (cusolverDnDormqr(cusolver_handle_, CUBLAS_SIDE_LEFT, CUBLAS_OP_T, num_rows_, 1, num_cols_, lhs_.data(), num_rows_, tau_.data(), rhs_.data(), num_rows_, reinterpret_cast(device_workspace_.data()), device_workspace_.size(), error_.data()) != CUSOLVER_STATUS_SUCCESS) { *message = "cuSolverDN::cusolverDnDormqr failed."; return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; } int error = 0; error_.CopyToHost(&error, 1); if (error < 0) { LOG(FATAL) << "Congratulations, you found a bug in Ceres. " << "Please report it." << "cuSolverDN::cusolverDnDormqr fatal error. " << "Argument: " << -error << " is invalid."; } // Compute the solution vector as x = R \ (Q^T * rhs). Since the previous step // replaced rhs by (Q^T * rhs), this is just x = R \ rhs. if (cublasDtrsv(cublas_handle_, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, num_cols_, lhs_.data(), num_rows_, rhs_.data(), 1) != CUBLAS_STATUS_SUCCESS) { *message = "cuBLAS::cublasDtrsv failed."; return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; } if (cudaDeviceSynchronize() != cudaSuccess || cudaStreamSynchronize(stream_) != cudaSuccess) { *message = "Cuda device synchronization failed."; return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; } rhs_.CopyToHost(solution, num_cols_); *message = "Success"; return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS; } std::unique_ptr CUDADenseQR::Create( const LinearSolver::Options& options) { if (options.dense_linear_algebra_library_type != CUDA) { // The user called the wrong factory method. return nullptr; } auto cuda_dense_qr = std::unique_ptr(new CUDADenseQR()); std::string cuda_error; if (cuda_dense_qr->Init(options.context, &cuda_error)) { return cuda_dense_qr; } // Initialization failed, destroy the object (done automatically) and return a // nullptr. LOG(ERROR) << "CUDADenseQR::Init failed: " << cuda_error; return nullptr; } CUDADenseQR::CUDADenseQR() = default; #endif // CERES_NO_CUDA } // namespace internal } // namespace ceres