1 files changed, 58 insertions, 34 deletions
diff --git a/internal/ceres/covariance_impl.cc b/internal/ceres/covariance_impl.cc
index 1befde7..19d545c 100644
--- a/internal/ceres/covariance_impl.cc
+++ b/internal/ceres/covariance_impl.cc
@@ -38,6 +38,7 @@
 #include <utility>
 #include <vector>
 #include "Eigen/SVD"
+#include "ceres/compressed_col_sparse_matrix_utils.h"
 #include "ceres/compressed_row_sparse_matrix.h"
 #include "ceres/covariance.h"
 #include "ceres/crs_matrix.h"
@@ -48,7 +49,6 @@
 #include "ceres/suitesparse.h"
 #include "ceres/wall_time.h"
 #include "glog/logging.h"
-#include "SuiteSparseQR.hpp"
 
 namespace ceres {
 namespace internal {
@@ -56,6 +56,7 @@ namespace {
 
 // Per thread storage for SuiteSparse.
 #ifndef CERES_NO_SUITESPARSE
+
 struct PerThreadContext {
   explicit PerThreadContext(int num_rows)
       : solution(NULL),
@@ -81,6 +82,7 @@ struct PerThreadContext {
   cholmod_dense* rhs;
   SuiteSparse ss;
 };
+
 #endif
 
 }  // namespace
@@ -604,16 +606,8 @@ bool CovarianceImpl::ComputeCovarianceValuesUsingSparseQR() {
   const int num_cols = jacobian.num_cols;
   const int num_nonzeros = jacobian.values.size();
 
-  // UF_long is deprecated but SuiteSparse_long is only available in
-  // newer versions of SuiteSparse.
-#if (SUITESPARSE_VERSION < 4002)
-  vector<UF_long> transpose_rows(num_cols + 1, 0);
-  vector<UF_long> transpose_cols(num_nonzeros, 0);
-#else
   vector<SuiteSparse_long> transpose_rows(num_cols + 1, 0);
   vector<SuiteSparse_long> transpose_cols(num_nonzeros, 0);
-#endif
-
   vector<double> transpose_values(num_nonzeros, 0);
 
   for (int idx = 0; idx < num_nonzeros; ++idx) {
@@ -658,23 +652,49 @@ bool CovarianceImpl::ComputeCovarianceValuesUsingSparseQR() {
   cholmod_common cc;
   cholmod_l_start(&cc);
 
-  SuiteSparseQR_factorization<double>* factor =
-      SuiteSparseQR_factorize<double>(SPQR_ORDERING_BESTAMD,
-                                      SPQR_DEFAULT_TOL,
-                                      &cholmod_jacobian,
-                                      &cc);
+  cholmod_sparse* R = NULL;
+  SuiteSparse_long* permutation = NULL;
+
+  // Compute a Q-less QR factorization of the Jacobian. Since we are
+  // only interested in inverting J'J = R'R, we do not need Q. This
+  // saves memory and gives us R as a permuted compressed column
+  // sparse matrix.
+  //
+  // TODO(sameeragarwal): Currently the symbolic factorization and the
+  // numeric factorization is done at the same time, and this does not
+  // explicitly account for the block column and row structure in the
+  // matrix. When using AMD, we have observed in the past that
+  // computing the ordering with the block matrix is significantly
+  // more efficient, both in runtime as well as the quality of
+  // ordering computed. So, it maybe worth doing that analysis
+  // separately.
+  const SuiteSparse_long rank =
+      SuiteSparseQR<double>(SPQR_ORDERING_BESTAMD,
+                            SPQR_DEFAULT_TOL,
+                            cholmod_jacobian.ncol,
+                            &cholmod_jacobian,
+                            &R,
+                            &permutation,
+                            &cc);
   event_logger.AddEvent("Numeric Factorization");
+  CHECK_NOTNULL(permutation);
+  CHECK_NOTNULL(R);
 
-  const int rank = cc.SPQR_istat[4];
   if (rank < cholmod_jacobian.ncol) {
     LOG(WARNING) << "Jacobian matrix is rank deficient."
                  << "Number of columns: " << cholmod_jacobian.ncol
                  << " rank: " << rank;
-    SuiteSparseQR_free(&factor, &cc);
+    delete []permutation;
+    cholmod_l_free_sparse(&R, &cc);
     cholmod_l_finish(&cc);
     return false;
   }
 
+  vector<int> inverse_permutation(num_cols);
+  for (SuiteSparse_long i = 0; i < num_cols; ++i) {
+    inverse_permutation[permutation[i]] = i;
+  }
+
   const int* rows = covariance_matrix_->rows();
   const int* cols = covariance_matrix_->cols();
   double* values = covariance_matrix_->mutable_values();
@@ -688,35 +708,39 @@ bool CovarianceImpl::ComputeCovarianceValuesUsingSparseQR() {
   //
   // Since the covariance matrix is symmetric, the i^th row and column
   // are equal.
+  const int num_threads = options_.num_threads;
+  scoped_array<double> workspace(new double[num_threads * num_cols]);
 
-  cholmod_dense* rhs = cholmod_l_zeros(num_cols, 1, CHOLMOD_REAL, &cc);
-  double* rhs_x = reinterpret_cast<double*>(rhs->x);
-
+#pragma omp parallel for num_threads(num_threads) schedule(dynamic)
   for (int r = 0; r < num_cols; ++r) {
-    int row_begin = rows[r];
-    int row_end = rows[r + 1];
+    const int row_begin = rows[r];
+    const int row_end = rows[r + 1];
     if (row_end == row_begin) {
       continue;
     }
 
-    rhs_x[r] = 1.0;
-
-    cholmod_dense* y1 = SuiteSparseQR_solve<double>(SPQR_RTX_EQUALS_ETB, factor, rhs, &cc);
-    cholmod_dense* solution = SuiteSparseQR_solve<double>(SPQR_RETX_EQUALS_B, factor, y1, &cc);
+#  ifdef CERES_USE_OPENMP
+    int thread_id = omp_get_thread_num();
+#  else
+    int thread_id = 0;
+#  endif
 
-    double* solution_x = reinterpret_cast<double*>(solution->x);
+    double* solution = workspace.get() + thread_id * num_cols;
+    SolveRTRWithSparseRHS<SuiteSparse_long>(
+        num_cols,
+        static_cast<SuiteSparse_long*>(R->i),
+        static_cast<SuiteSparse_long*>(R->p),
+        static_cast<double*>(R->x),
+        inverse_permutation[r],
+        solution);
     for (int idx = row_begin; idx < row_end; ++idx) {
-      const int c = cols[idx];
-      values[idx] = solution_x[c];
+     const int c = cols[idx];
+     values[idx] = solution[inverse_permutation[c]];
     }
-
-    cholmod_l_free_dense(&y1, &cc);
-    cholmod_l_free_dense(&solution, &cc);
-    rhs_x[r] = 0.0;
   }
 
-  cholmod_l_free_dense(&rhs, &cc);
-  SuiteSparseQR_free(&factor, &cc);
+  delete []permutation;
+  cholmod_l_free_sparse(&R, &cc);
   cholmod_l_finish(&cc);
   event_logger.AddEvent("Inversion");
   return true;