Diffstat (limited to 'internal/ceres/low_rank_inverse_hessian.cc')
-rw-r--r-- | internal/ceres/low_rank_inverse_hessian.cc | 108
1 file changed, 74 insertions, 34 deletions
diff --git a/internal/ceres/low_rank_inverse_hessian.cc b/internal/ceres/low_rank_inverse_hessian.cc
index 372165f..4816e3c 100644
--- a/internal/ceres/low_rank_inverse_hessian.cc
+++ b/internal/ceres/low_rank_inverse_hessian.cc
@@ -28,6 +28,8 @@
 //
 // Author: sameeragarwal@google.com (Sameer Agarwal)
 
+#include <list>
+
 #include "ceres/internal/eigen.h"
 #include "ceres/low_rank_inverse_hessian.h"
 #include "glog/logging.h"
@@ -35,6 +37,41 @@
 namespace ceres {
 namespace internal {
 
+// The (L)BFGS algorithm explicitly requires that the secant equation:
+//
+//   B_{k+1} * s_k = y_k
+//
+// Is satisfied at each iteration, where B_{k+1} is the approximated
+// Hessian at the k+1-th iteration, s_k = (x_{k+1} - x_{k}) and
+// y_k = (grad_{k+1} - grad_{k}). As the approximated Hessian must be
+// positive definite, this is equivalent to the condition:
+//
+//   s_k^T * y_k > 0   [s_k^T * B_{k+1} * s_k = s_k^T * y_k > 0]
+//
+// This condition would always be satisfied if the function was strictly
+// convex, alternatively, it is always satisfied provided that a Wolfe line
+// search is used (even if the function is not strictly convex). See [1]
+// (p138) for a proof.
+//
+// Although Ceres will always use a Wolfe line search when using (L)BFGS,
+// practical implementation considerations mean that the line search
+// may return a point that satisfies only the Armijo condition, and thus
+// could violate the Secant equation. As such, we will only use a step
+// to update the Hessian approximation if:
+//
+//   s_k^T * y_k > tolerance
+//
+// It is important that tolerance is very small (and >=0), as otherwise we
+// might skip the update too often and fail to capture important curvature
+// information in the Hessian. For example going from 1e-10 -> 1e-14 improves
+// the NIST benchmark score from 43/54 to 53/54.
+//
+// [1] Nocedal J., Wright S., Numerical Optimization, 2nd Ed. Springer, 1999.
+//
+// TODO(alexs.mac): Consider using Damped BFGS update instead of
+// skipping update.
+const double kLBFGSSecantConditionHessianUpdateTolerance = 1e-14;
+
 LowRankInverseHessian::LowRankInverseHessian(
     int num_parameters,
     int max_num_corrections,
@@ -42,7 +79,6 @@ LowRankInverseHessian::LowRankInverseHessian(
     : num_parameters_(num_parameters),
       max_num_corrections_(max_num_corrections),
       use_approximate_eigenvalue_scaling_(use_approximate_eigenvalue_scaling),
-      num_corrections_(0),
       approximate_eigenvalue_scale_(1.0),
       delta_x_history_(num_parameters, max_num_corrections),
       delta_gradient_history_(num_parameters, max_num_corrections),
@@ -52,35 +88,29 @@ LowRankInverseHessian::LowRankInverseHessian(
 bool LowRankInverseHessian::Update(const Vector& delta_x,
                                    const Vector& delta_gradient) {
   const double delta_x_dot_delta_gradient = delta_x.dot(delta_gradient);
-  if (delta_x_dot_delta_gradient <= 1e-10) {
-    VLOG(2) << "Skipping LBFGS Update, delta_x_dot_delta_gradient too small: "
-            << delta_x_dot_delta_gradient;
+  if (delta_x_dot_delta_gradient <=
+      kLBFGSSecantConditionHessianUpdateTolerance) {
+    VLOG(2) << "Skipping L-BFGS Update, delta_x_dot_delta_gradient too "
+            << "small: " << delta_x_dot_delta_gradient << ", tolerance: "
+            << kLBFGSSecantConditionHessianUpdateTolerance
+            << " (Secant condition).";
     return false;
   }
 
-  if (num_corrections_ == max_num_corrections_) {
-    // TODO(sameeragarwal): This can be done more efficiently using
-    // a circular buffer/indexing scheme, but for simplicity we will
-    // do the expensive copy for now.
-    delta_x_history_.block(0, 0, num_parameters_, max_num_corrections_ - 1) =
-        delta_x_history_
-        .block(0, 1, num_parameters_, max_num_corrections_ - 1);
-
-    delta_gradient_history_
-        .block(0, 0, num_parameters_, max_num_corrections_ - 1) =
-        delta_gradient_history_
-        .block(0, 1, num_parameters_, max_num_corrections_ - 1);
-
-    delta_x_dot_delta_gradient_.head(num_corrections_ - 1) =
-        delta_x_dot_delta_gradient_.tail(num_corrections_ - 1);
-  } else {
-    ++num_corrections_;
+
+  int next = indices_.size();
+  // Once the size of the list reaches max_num_corrections_, simulate
+  // a circular buffer by removing the first element of the list and
+  // making it the next position where the LBFGS history is stored.
+  if (next == max_num_corrections_) {
+    next = indices_.front();
+    indices_.pop_front();
   }
 
-  delta_x_history_.col(num_corrections_ - 1) = delta_x;
-  delta_gradient_history_.col(num_corrections_ - 1) = delta_gradient;
-  delta_x_dot_delta_gradient_(num_corrections_ - 1) =
-      delta_x_dot_delta_gradient;
+  indices_.push_back(next);
+  delta_x_history_.col(next) = delta_x;
+  delta_gradient_history_.col(next) = delta_gradient;
+  delta_x_dot_delta_gradient_(next) = delta_x_dot_delta_gradient;
 
   approximate_eigenvalue_scale_ =
       delta_x_dot_delta_gradient / delta_gradient.squaredNorm();
   return true;
@@ -93,12 +123,16 @@ void LowRankInverseHessian::RightMultiply(const double* x_ptr,
 
   search_direction = gradient;
 
-  Vector alpha(num_corrections_);
+  const int num_corrections = indices_.size();
+  Vector alpha(num_corrections);
 
-  for (int i = num_corrections_ - 1; i >= 0; --i) {
-    alpha(i) = delta_x_history_.col(i).dot(search_direction) /
-        delta_x_dot_delta_gradient_(i);
-    search_direction -= alpha(i) * delta_gradient_history_.col(i);
+  for (std::list<int>::const_reverse_iterator it = indices_.rbegin();
+       it != indices_.rend();
+       ++it) {
+    const double alpha_i = delta_x_history_.col(*it).dot(search_direction) /
+        delta_x_dot_delta_gradient_(*it);
+    search_direction -= alpha_i * delta_gradient_history_.col(*it);
+    alpha(*it) = alpha_i;
   }
 
   if (use_approximate_eigenvalue_scaling_) {
@@ -133,12 +167,18 @@ void LowRankInverseHessian::RightMultiply(const double* x_ptr,
     //  20(5), 863-874, 1974.
     // [2] Nocedal J., Wright S., Numerical Optimization, Springer, 1999.
     search_direction *= approximate_eigenvalue_scale_;
+
+    VLOG(4) << "Applying approximate_eigenvalue_scale: "
+            << approximate_eigenvalue_scale_ << " to initial inverse Hessian "
+            << "approximation.";
   }
 
-  for (int i = 0; i < num_corrections_; ++i) {
-    const double beta = delta_gradient_history_.col(i).dot(search_direction) /
-        delta_x_dot_delta_gradient_(i);
-    search_direction += delta_x_history_.col(i) * (alpha(i) - beta);
+  for (std::list<int>::const_iterator it = indices_.begin();
+       it != indices_.end();
+       ++it) {
+    const double beta = delta_gradient_history_.col(*it).dot(search_direction) /
+        delta_x_dot_delta_gradient_(*it);
+    search_direction += delta_x_history_.col(*it) * (alpha(*it) - beta);
   }
 }
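The core of this change is replacing the wholesale block-copy shift of the L-BFGS history with a recycling scheme: the column slots of the fixed-size history matrices are tracked in a std::list<int> (indices_), ordered oldest to newest, and once the history is full the oldest slot is simply reused. The sketch below is a minimal standalone illustration of that bookkeeping, not part of the patch; the local names (indices, max_num_corrections, the simulated update loop) are placeholders for the indices_ and max_num_corrections_ members used in Update() above.

#include <iostream>
#include <list>

int main() {
  const int max_num_corrections = 3;  // Fixed history size.
  std::list<int> indices;             // Occupied columns, ordered oldest -> newest.

  // Simulate seven accepted L-BFGS updates.
  for (int update = 0; update < 7; ++update) {
    int next = static_cast<int>(indices.size());
    if (next == max_num_corrections) {
      // History is full: recycle the oldest column rather than shifting
      // every column one position to the left.
      next = indices.front();
      indices.pop_front();
    }
    indices.push_back(next);

    std::cout << "update " << update << " stored in column " << next
              << "; history (oldest -> newest):";
    for (int i : indices) {
      std::cout << ' ' << i;
    }
    std::cout << '\n';
  }
  return 0;
}

Because the oldest entry is always the one recycled, iterating indices_ front to back visits the stored corrections oldest to newest (and back to front, newest to oldest), which is exactly the traversal order the two loops in RightMultiply() rely on, while the per-update copy of the whole history that the removed code performed once the buffer was full is avoided entirely.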