path: root/internal/ceres/low_rank_inverse_hessian.cc
Diffstat (limited to 'internal/ceres/low_rank_inverse_hessian.cc')
-rw-r--r--  internal/ceres/low_rank_inverse_hessian.cc  108
1 file changed, 74 insertions(+), 34 deletions(-)
diff --git a/internal/ceres/low_rank_inverse_hessian.cc b/internal/ceres/low_rank_inverse_hessian.cc
index 372165f..4816e3c 100644
--- a/internal/ceres/low_rank_inverse_hessian.cc
+++ b/internal/ceres/low_rank_inverse_hessian.cc
@@ -28,6 +28,8 @@
//
// Author: sameeragarwal@google.com (Sameer Agarwal)
+#include <list>
+
#include "ceres/internal/eigen.h"
#include "ceres/low_rank_inverse_hessian.h"
#include "glog/logging.h"
@@ -35,6 +37,41 @@
namespace ceres {
namespace internal {
+// The (L)BFGS algorithm explicitly requires that the secant equation:
+//
+// B_{k+1} * s_k = y_k
+//
+// is satisfied at each iteration, where B_{k+1} is the approximated
+// Hessian at the k+1-th iteration, s_k = (x_{k+1} - x_{k}) and
+// y_k = (grad_{k+1} - grad_{k}). As the approximated Hessian must be
+// positive definite, this is equivalent to the condition:
+//
+// s_k^T * y_k > 0 [s_k^T * B_{k+1} * s_k = s_k^T * y_k > 0]
+//
+// This condition would always be satisfied if the function were strictly
+// convex; alternatively, it is always satisfied provided that a Wolfe line
+// search is used (even if the function is not strictly convex). See [1]
+// (p138) for a proof.
+//
+// Although Ceres will always use a Wolfe line search when using (L)BFGS,
+// practical implementation considerations mean that the line search
+// may return a point that satisfies only the Armijo condition, and thus
+// could violate the secant equation. As such, we will only use a step
+// to update the Hessian approximation if:
+//
+// s_k^T * y_k > tolerance
+//
+// It is important that the tolerance is very small (and >= 0), as otherwise
+// we might skip the update too often and fail to capture important curvature
+// information in the Hessian. For example, going from 1e-10 -> 1e-14 improves
+// the NIST benchmark score from 43/54 to 53/54.
+//
+// [1] Nocedal J., Wright S., Numerical Optimization, 2nd Ed. Springer, 1999.
+//
+// TODO(alexs.mac): Consider using Damped BFGS update instead of
+// skipping update.
+const double kLBFGSSecantConditionHessianUpdateTolerance = 1e-14;
+
LowRankInverseHessian::LowRankInverseHessian(
int num_parameters,
int max_num_corrections,
@@ -42,7 +79,6 @@ LowRankInverseHessian::LowRankInverseHessian(
: num_parameters_(num_parameters),
max_num_corrections_(max_num_corrections),
use_approximate_eigenvalue_scaling_(use_approximate_eigenvalue_scaling),
- num_corrections_(0),
approximate_eigenvalue_scale_(1.0),
delta_x_history_(num_parameters, max_num_corrections),
delta_gradient_history_(num_parameters, max_num_corrections),
@@ -52,35 +88,29 @@ LowRankInverseHessian::LowRankInverseHessian(
bool LowRankInverseHessian::Update(const Vector& delta_x,
const Vector& delta_gradient) {
const double delta_x_dot_delta_gradient = delta_x.dot(delta_gradient);
- if (delta_x_dot_delta_gradient <= 1e-10) {
- VLOG(2) << "Skipping LBFGS Update, delta_x_dot_delta_gradient too small: "
- << delta_x_dot_delta_gradient;
+ if (delta_x_dot_delta_gradient <=
+ kLBFGSSecantConditionHessianUpdateTolerance) {
+ VLOG(2) << "Skipping L-BFGS Update, delta_x_dot_delta_gradient too "
+ << "small: " << delta_x_dot_delta_gradient << ", tolerance: "
+ << kLBFGSSecantConditionHessianUpdateTolerance
+ << " (Secant condition).";
return false;
}
- if (num_corrections_ == max_num_corrections_) {
- // TODO(sameeragarwal): This can be done more efficiently using
- // a circular buffer/indexing scheme, but for simplicity we will
- // do the expensive copy for now.
- delta_x_history_.block(0, 0, num_parameters_, max_num_corrections_ - 1) =
- delta_x_history_
- .block(0, 1, num_parameters_, max_num_corrections_ - 1);
-
- delta_gradient_history_
- .block(0, 0, num_parameters_, max_num_corrections_ - 1) =
- delta_gradient_history_
- .block(0, 1, num_parameters_, max_num_corrections_ - 1);
-
- delta_x_dot_delta_gradient_.head(num_corrections_ - 1) =
- delta_x_dot_delta_gradient_.tail(num_corrections_ - 1);
- } else {
- ++num_corrections_;
+
+ int next = indices_.size();
+ // Once the size of the list reaches max_num_corrections_, simulate
+ // a circular buffer by removing the first element of the list and
+ // making it the next position where the LBFGS history is stored.
+ if (next == max_num_corrections_) {
+ next = indices_.front();
+ indices_.pop_front();
}
- delta_x_history_.col(num_corrections_ - 1) = delta_x;
- delta_gradient_history_.col(num_corrections_ - 1) = delta_gradient;
- delta_x_dot_delta_gradient_(num_corrections_ - 1) =
- delta_x_dot_delta_gradient;
+ indices_.push_back(next);
+ delta_x_history_.col(next) = delta_x;
+ delta_gradient_history_.col(next) = delta_gradient;
+ delta_x_dot_delta_gradient_(next) = delta_x_dot_delta_gradient;
approximate_eigenvalue_scale_ =
delta_x_dot_delta_gradient / delta_gradient.squaredNorm();
return true;
@@ -93,12 +123,16 @@ void LowRankInverseHessian::RightMultiply(const double* x_ptr,
search_direction = gradient;
- Vector alpha(num_corrections_);
+ const int num_corrections = indices_.size();
+ Vector alpha(num_corrections);
- for (int i = num_corrections_ - 1; i >= 0; --i) {
- alpha(i) = delta_x_history_.col(i).dot(search_direction) /
- delta_x_dot_delta_gradient_(i);
- search_direction -= alpha(i) * delta_gradient_history_.col(i);
+ for (std::list<int>::const_reverse_iterator it = indices_.rbegin();
+ it != indices_.rend();
+ ++it) {
+ const double alpha_i = delta_x_history_.col(*it).dot(search_direction) /
+ delta_x_dot_delta_gradient_(*it);
+ search_direction -= alpha_i * delta_gradient_history_.col(*it);
+ alpha(*it) = alpha_i;
}
if (use_approximate_eigenvalue_scaling_) {
@@ -133,12 +167,18 @@ void LowRankInverseHessian::RightMultiply(const double* x_ptr,
// 20(5), 863-874, 1974.
// [2] Nocedal J., Wright S., Numerical Optimization, Springer, 1999.
search_direction *= approximate_eigenvalue_scale_;
+
+ VLOG(4) << "Applying approximate_eigenvalue_scale: "
+ << approximate_eigenvalue_scale_ << " to initial inverse Hessian "
+ << "approximation.";
}
- for (int i = 0; i < num_corrections_; ++i) {
- const double beta = delta_gradient_history_.col(i).dot(search_direction) /
- delta_x_dot_delta_gradient_(i);
- search_direction += delta_x_history_.col(i) * (alpha(i) - beta);
+ for (std::list<int>::const_iterator it = indices_.begin();
+ it != indices_.end();
+ ++it) {
+ const double beta = delta_gradient_history_.col(*it).dot(search_direction) /
+ delta_x_dot_delta_gradient_(*it);
+ search_direction += delta_x_history_.col(*it) * (alpha(*it) - beta);
}
}
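
The central change in this patch is replacing the `num_corrections_` counter and the expensive block copy with a `std::list<int>` of column indices that recycles the oldest slot once the history is full. Below is a minimal, standalone C++ sketch of that index-recycling idea; the class and member names (`History`, `kMaxCorrections`, `storage_`) are invented for illustration and are not part of Ceres, though `indices_` mirrors the member added in the patch.

```cpp
// Illustrative sketch only: a std::list<int> of slot indices emulating a
// circular buffer over a fixed-size history, as in the patch above.
#include <iostream>
#include <list>
#include <vector>

class History {
 public:
  explicit History(int max_num_corrections)
      : max_num_corrections_(max_num_corrections),
        storage_(max_num_corrections, 0.0) {}

  // Store 'value' in the next unused slot, or recycle the oldest slot once
  // the history is full. indices_ always lists the live slots from oldest
  // to newest, which is the traversal order the two-loop recursion needs.
  void Add(double value) {
    int next = indices_.size();
    if (next == max_num_corrections_) {
      next = indices_.front();  // Oldest slot becomes the write position.
      indices_.pop_front();
    }
    indices_.push_back(next);
    storage_[next] = value;
  }

  // Print the history from oldest to newest entry.
  void Print() const {
    for (std::list<int>::const_iterator it = indices_.begin();
         it != indices_.end(); ++it) {
      std::cout << storage_[*it] << " ";
    }
    std::cout << "\n";
  }

 private:
  const int max_num_corrections_;
  std::vector<double> storage_;
  std::list<int> indices_;
};

int main() {
  History history(3);
  for (int i = 1; i <= 5; ++i) {
    history.Add(i);
    history.Print();
  }
  // Prints:
  // 1
  // 1 2
  // 1 2 3
  // 2 3 4   <- the slot holding '1' was recycled for '4'
  // 3 4 5
  return 0;
}
```

This keeps each update O(num_parameters) (one column write) instead of copying the whole history matrices, at the cost of iterating the correction pairs by index rather than by column position in `RightMultiply`.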