1 files changed, 47 insertions, 8 deletions
diff --git a/internal/ceres/line_search_direction.cc b/internal/ceres/line_search_direction.cc
index 8ded823..dddcecd 100644
--- a/internal/ceres/line_search_direction.cc
+++ b/internal/ceres/line_search_direction.cc
@@ -28,8 +28,6 @@
 //
 // Author: sameeragarwal@google.com (Sameer Agarwal)
 
-#ifndef CERES_NO_LINE_SEARCH_MINIMIZER
-
 #include "ceres/line_search_direction.h"
 #include "ceres/line_search_minimizer.h"
 #include "ceres/low_rank_inverse_hessian.h"
@@ -67,7 +65,7 @@ class NonlinearConjugateGradient : public LineSearchDirection {
       case FLETCHER_REEVES:
         beta = current.gradient_squared_norm / previous.gradient_squared_norm;
         break;
-      case POLAK_RIBIRERE:
+      case POLAK_RIBIERE:
         gradient_change = current.gradient - previous.gradient;
         beta = (current.gradient.dot(gradient_change) /
                 previous.gradient_squared_norm);
@@ -121,6 +119,7 @@ class LBFGS : public LineSearchDirection {
     low_rank_inverse_hessian_.Update(
         previous.search_direction * previous.step_size,
         current.gradient - previous.gradient);
+
     search_direction->setZero();
     low_rank_inverse_hessian_.RightMultiply(current.gradient.data(),
                                             search_direction->data());
@@ -176,9 +175,46 @@ class BFGS : public LineSearchDirection {
     const Vector delta_gradient = current.gradient - previous.gradient;
     const double delta_x_dot_delta_gradient = delta_x.dot(delta_gradient);
 
-    if (delta_x_dot_delta_gradient <= 1e-10) {
+    // The (L)BFGS algorithm explicitly requires that the secant equation:
+    //
+    //   B_{k+1} * s_k = y_k
+    //
+    // Is satisfied at each iteration, where B_{k+1} is the approximated
+    // Hessian at the k+1-th iteration, s_k = (x_{k+1} - x_{k}) and
+    // y_k = (grad_{k+1} - grad_{k}). As the approximated Hessian must be
+    // positive definite, this is equivalent to the condition:
+    //
+    //   s_k^T * y_k > 0     [s_k^T * B_{k+1} * s_k = s_k^T * y_k > 0]
+    //
+    // This condition would always be satisfied if the function was strictly
+    // convex, alternatively, it is always satisfied provided that a Wolfe line
+    // search is used (even if the function is not strictly convex).  See [1]
+    // (p138) for a proof.
+    //
+    // Although Ceres will always use a Wolfe line search when using (L)BFGS,
+    // practical implementation considerations mean that the line search
+    // may return a point that satisfies only the Armijo condition, and thus
+    // could violate the Secant equation.  As such, we will only use a step
+    // to update the Hessian approximation if:
+    //
+    //   s_k^T * y_k > tolerance
+    //
+    // It is important that tolerance is very small (and >=0), as otherwise we
+    // might skip the update too often and fail to capture important curvature
+    // information in the Hessian.  For example going from 1e-10 -> 1e-14
+    // improves the NIST benchmark score from 43/54 to 53/54.
+    //
+    // [1] Nocedal J, Wright S, Numerical Optimization, 2nd Ed. Springer, 1999.
+    //
+    // TODO(alexs.mac): Consider using Damped BFGS update instead of
+    // skipping update.
+    const double kBFGSSecantConditionHessianUpdateTolerance = 1e-14;
+    if (delta_x_dot_delta_gradient <=
+        kBFGSSecantConditionHessianUpdateTolerance) {
       VLOG(2) << "Skipping BFGS Update, delta_x_dot_delta_gradient too "
-              << "small: " << delta_x_dot_delta_gradient;
+              << "small: " << delta_x_dot_delta_gradient << ", tolerance: "
+              << kBFGSSecantConditionHessianUpdateTolerance
+              << " (Secant condition).";
     } else {
       // Update dense inverse Hessian approximation.
 
@@ -214,8 +250,13 @@ class BFGS : public LineSearchDirection {
         //     Part II: Implementation and experiments, Management Science,
         //     20(5), 863-874, 1974.
         // [2] Nocedal J., Wright S., Numerical Optimization, Springer, 1999.
-        inverse_hessian_ *=
+        const double approximate_eigenvalue_scale =
             delta_x_dot_delta_gradient / delta_gradient.dot(delta_gradient);
+        inverse_hessian_ *= approximate_eigenvalue_scale;
+
+        VLOG(4) << "Applying approximate_eigenvalue_scale: "
+                << approximate_eigenvalue_scale << " to initial inverse "
+                << "Hessian approximation.";
       }
       initialized_ = true;
 
@@ -329,5 +370,3 @@ LineSearchDirection::Create(const LineSearchDirection::Options& options) {
 
 }  // namespace internal
 }  // namespace ceres
-
-#endif  // CERES_NO_LINE_SEARCH_MINIMIZER