aboutsummaryrefslogtreecommitdiff
path: root/heuristic_ensemble_matcher.cc
diff options
context:
space:
mode:
authorSamuel Huang <huangs@chromium.org>2018-03-13 18:19:34 +0000
committerEdward Lesmes <ehmaldonado@google.com>2021-07-23 21:50:59 +0000
commit06f1ae9aaca969ee95ef840f22b6b461c304542d (patch)
treef1e5c6624e70628e81fbf38d6cd14b974abe5d93 /heuristic_ensemble_matcher.cc
downloadzucchini-06f1ae9aaca969ee95ef840f22b6b461c304542d.tar.gz
[Zucchini] Move Zucchini from /chrome/installer/ to /components/.
(Use "git log --follow" to see older revisions of files). /components/ is the most logical place to put Zucchini, which only depends on /base and /testing/gtest. This move also enables Zucchini to be used by the Component Updater. Details: - Move all files; run the following to change deps and guards: sed 's/chrome\/installer/components/' *.cc *.h -i sed 's/CHROME_INSTALLER/COMPONENTS/' *.cc *.h -i - Sorting works out pretty well! - Change all 'chrome/installer/zucchini' to 'components/zucchini' throughout other parts of the repo; sort if necessary. - Fix 6 'git cl lint' errors. - Change 1 Bind() usage to BindRepeated(). - Update OWNER. Bug: 729154 Change-Id: I50c5a7d411ea85f707b5994ab319dfb2a1acccf7 Reviewed-on: https://chromium-review.googlesource.com/954923 Reviewed-by: Greg Thompson <grt@chromium.org> Reviewed-by: Jochen Eisinger <jochen@chromium.org> Reviewed-by: Samuel Huang <huangs@chromium.org> Commit-Queue: Samuel Huang <huangs@chromium.org> Cr-Commit-Position: refs/heads/master@{#542857} NOKEYCHECK=True GitOrigin-RevId: 577ef6c435e8d43be6e3e60ccbcbd1881780f4ec
Diffstat (limited to 'heuristic_ensemble_matcher.cc')
-rw-r--r--heuristic_ensemble_matcher.cc369
1 files changed, 369 insertions, 0 deletions
diff --git a/heuristic_ensemble_matcher.cc b/heuristic_ensemble_matcher.cc
new file mode 100644
index 0000000..aead5dc
--- /dev/null
+++ b/heuristic_ensemble_matcher.cc
@@ -0,0 +1,369 @@
+// Copyright 2017 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "components/zucchini/heuristic_ensemble_matcher.h"
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "base/bind.h"
+#include "base/numerics/safe_conversions.h"
+#include "base/strings/stringprintf.h"
+#include "components/zucchini/binary_data_histogram.h"
+#include "components/zucchini/element_detection.h"
+#include "components/zucchini/image_utils.h"
+#include "components/zucchini/io_utils.h"
+
+namespace zucchini {
+
+namespace {
+
+/******** Helper Functions ********/
+
+// Uses |detector| to find embedded executables inside |image|, and returns the
+// result on success, or base::nullopt on failure, which occurs if too many (>
+// |kElementLimit|) elements are found.
+base::Optional<std::vector<Element>> FindEmbeddedElements(
+ ConstBufferView image,
+ const std::string& name,
+ ElementDetector&& detector) {
+ // Maximum number of Elements in a file. This is enforced because our matching
+ // algorithm is O(n^2), which suffices for regular archive files that should
+ // have up to 10's of executable files. An archive containing 100's of
+ // executables is likely pathological, and is rejected to prevent exploits.
+ static constexpr size_t kElementLimit = 256;
+ std::vector<Element> elements;
+ ElementFinder element_finder(image, std::move(detector));
+ for (auto element = element_finder.GetNext();
+ element.has_value() && elements.size() <= kElementLimit;
+ element = element_finder.GetNext()) {
+ elements.push_back(*element);
+ }
+ if (elements.size() >= kElementLimit) {
+ LOG(WARNING) << name << ": Found too many elements.";
+ return base::nullopt;
+ }
+ LOG(INFO) << name << ": Found " << elements.size() << " elements.";
+ return elements;
+}
+
+// Determines whether a proposed comparison between Elements should be rejected
+// early, to decrease the likelihood of creating false-positive matches, which
+// may be costly for patching. Our heuristic simply prohibits big difference in
+// size (relative and absolute) between matched elements.
+bool UnsafeDifference(const Element& old_element, const Element& new_element) {
+ static constexpr double kMaxBloat = 2.0;
+ static constexpr size_t kMinWorrysomeDifference = 2 << 20; // 2MB
+ size_t lo_size = std::min(old_element.size, new_element.size);
+ size_t hi_size = std::max(old_element.size, new_element.size);
+ if (hi_size - lo_size < kMinWorrysomeDifference)
+ return false;
+ if (hi_size < lo_size * kMaxBloat)
+ return false;
+ return true;
+}
+
+std::ostream& operator<<(std::ostream& stream, const Element& elt) {
+ stream << "(" << elt.exe_type << ", " << AsHex<8, size_t>(elt.offset) << " +"
+ << AsHex<8, size_t>(elt.size) << ")";
+ return stream;
+}
+
+/******** MatchingInfoOut ********/
+
+// A class to output detailed information during ensemble matching. Extracting
+// the functionality to a separate class decouples formatting and printing logic
+// from matching logic. The base class consists of stubs.
+class MatchingInfoOut {
+ protected:
+ MatchingInfoOut() = default;
+
+ public:
+ virtual ~MatchingInfoOut() = default;
+ virtual void InitSizes(size_t old_size, size_t new_size) {}
+ virtual void DeclareTypeMismatch(int iold, int inew) {}
+ virtual void DeclareUnsafeDistance(int iold, int inew) {}
+ virtual void DeclareCandidate(int iold, int inew) {}
+ virtual void DeclareMatch(int iold,
+ int inew,
+ double dist,
+ bool is_identical) {}
+ virtual void DeclareOutlier(int iold, int inew) {}
+
+ virtual void OutputCompare(const Element& old_element,
+ const Element& new_element,
+ double dist) {}
+
+ virtual void OutputMatch(const Element& best_old_element,
+ const Element& new_element,
+ bool is_identical,
+ double best_dist) {}
+
+ virtual void OutputScores(const std::string& stats) {}
+
+ virtual void OutputTextGrid() {}
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(MatchingInfoOut);
+};
+
+/******** MatchingInfoTerse ********/
+
+// A terse MatchingInfoOut that prints only basic information, using LOG().
+class MatchingInfoOutTerse : public MatchingInfoOut {
+ public:
+ MatchingInfoOutTerse() = default;
+ ~MatchingInfoOutTerse() override = default;
+
+ void OutputScores(const std::string& stats) override {
+ LOG(INFO) << "Best dists: " << stats;
+ }
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(MatchingInfoOutTerse);
+};
+
+/******** MatchingInfoOutVerbose ********/
+
+// A verbose MatchingInfoOut that prints detailed information using |out_|,
+// including comparison pairs, scores, and a text grid representation of
+// pairwise matching results.
+class MatchingInfoOutVerbose : public MatchingInfoOut {
+ public:
+ explicit MatchingInfoOutVerbose(std::ostream& out) : out_(out) {}
+ ~MatchingInfoOutVerbose() override = default;
+
+ // Outputs sizes and initializes |text_grid_|.
+ void InitSizes(size_t old_size, size_t new_size) override {
+ out_ << "Comparing old (" << old_size << " elements) and new (" << new_size
+ << " elements)" << std::endl;
+ text_grid_.assign(new_size, std::string(old_size, '-'));
+ best_dist_.assign(new_size, -1.0);
+ }
+
+ // Functions to update match status in text grid representation.
+
+ void DeclareTypeMismatch(int iold, int inew) override {
+ text_grid_[inew][iold] = 'T';
+ }
+ void DeclareUnsafeDistance(int iold, int inew) override {
+ text_grid_[inew][iold] = 'U';
+ }
+ void DeclareCandidate(int iold, int inew) override {
+ text_grid_[inew][iold] = 'C'; // Provisional.
+ }
+ void DeclareMatch(int iold,
+ int inew,
+ double dist,
+ bool is_identical) override {
+ text_grid_[inew][iold] = is_identical ? 'I' : 'M';
+ best_dist_[inew] = dist;
+ }
+ void DeclareOutlier(int iold, int inew) override {
+ text_grid_[inew][iold] = 'O';
+ }
+
+ // Functions to print detailed information.
+
+ void OutputCompare(const Element& old_element,
+ const Element& new_element,
+ double dist) override {
+ out_ << "Compare old" << old_element << " to new" << new_element << " --> "
+ << base::StringPrintf("%.5f", dist) << std::endl;
+ }
+
+ void OutputMatch(const Element& best_old_element,
+ const Element& new_element,
+ bool is_identical,
+ double best_dist) override {
+ if (is_identical) {
+ out_ << "Skipped old" << best_old_element << " - identical to new"
+ << new_element;
+ } else {
+ out_ << "Matched old" << best_old_element << " to new" << new_element
+ << " --> " << base::StringPrintf("%.5f", best_dist);
+ }
+ out_ << std::endl;
+ }
+
+ void OutputScores(const std::string& stats) override {
+ out_ << "Best dists: " << stats << std::endl;
+ }
+
+ void OutputTextGrid() override {
+ int new_size = static_cast<int>(text_grid_.size());
+ for (int inew = 0; inew < new_size; ++inew) {
+ const std::string& line = text_grid_[inew];
+ out_ << " ";
+ for (char ch : line) {
+ char prefix = (ch == 'I' || ch == 'M') ? '(' : ' ';
+ char suffix = (ch == 'I' || ch == 'M') ? ')' : ' ';
+ out_ << prefix << ch << suffix;
+ }
+ if (best_dist_[inew] >= 0)
+ out_ << " " << base::StringPrintf("%.5f", best_dist_[inew]);
+ out_ << std::endl;
+ }
+ if (!text_grid_.empty()) {
+ out_ << " Legend: I = identical, M = matched, T = type mismatch, "
+ "U = unsafe distance, C = candidate, O = outlier, - = skipped."
+ << std::endl;
+ }
+ }
+
+ private:
+ std::ostream& out_;
+
+ // Text grid representation of matches. Rows correspond to "old" and columns
+ // correspond to "new".
+ std::vector<std::string> text_grid_;
+
+ // For each "new" element, distance of best match. -1 denotes no match.
+ std::vector<double> best_dist_;
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(MatchingInfoOutVerbose);
+};
+
+} // namespace
+
+/******** HeuristicEnsembleMatcher ********/
+
+HeuristicEnsembleMatcher::HeuristicEnsembleMatcher(std::ostream* out)
+ : out_(out) {}
+
+HeuristicEnsembleMatcher::~HeuristicEnsembleMatcher() = default;
+
+bool HeuristicEnsembleMatcher::RunMatch(ConstBufferView old_image,
+ ConstBufferView new_image) {
+ DCHECK(matches_.empty());
+ LOG(INFO) << "Start matching.";
+
+ // Find all elements in "old" and "new".
+ base::Optional<std::vector<Element>> old_elements =
+ FindEmbeddedElements(old_image, "Old file",
+ base::BindRepeating(DetectElementFromDisassembler));
+ if (!old_elements.has_value())
+ return false;
+ base::Optional<std::vector<Element>> new_elements =
+ FindEmbeddedElements(new_image, "New file",
+ base::BindRepeating(DetectElementFromDisassembler));
+ if (!new_elements.has_value())
+ return false;
+
+ std::unique_ptr<MatchingInfoOut> info_out;
+ if (out_)
+ info_out = std::make_unique<MatchingInfoOutVerbose>(*out_);
+ else
+ info_out = std::make_unique<MatchingInfoOutTerse>();
+
+ const int num_new_elements = base::checked_cast<int>(new_elements->size());
+ const int num_old_elements = base::checked_cast<int>(old_elements->size());
+ info_out->InitSizes(num_old_elements, num_new_elements);
+
+ // For each "new" element, match it with the "old" element that's nearest to
+ // it, with distance determined by BinaryDataHistogram. The resulting
+ // "old"-"new" pairs are stored into |results|. Possibilities:
+ // - Type mismatch: No match.
+ // - UnsafeDifference() heuristics fail: No match.
+ // - Identical match: Skip "new" since this is a trivial case.
+ // - Non-identical match: Match "new" with "old" with min distance.
+ // - No match: Skip "new".
+ struct Results {
+ int iold;
+ int inew;
+ double dist;
+ };
+ std::vector<Results> results;
+
+ // Precompute histograms for "old" since they get reused.
+ std::vector<BinaryDataHistogram> old_his(num_old_elements);
+ for (int iold = 0; iold < num_old_elements; ++iold) {
+ ConstBufferView sub_image(old_image[(*old_elements)[iold]]);
+ old_his[iold].Compute(sub_image);
+ // ProgramDetector should have imposed minimal size limit to |sub_image|.
+ // Therefore resulting histogram are expected to be valid.
+ CHECK(old_his[iold].IsValid());
+ }
+
+ const int kUninitIold = num_old_elements;
+ for (int inew = 0; inew < num_new_elements; ++inew) {
+ const Element& cur_new_element = (*new_elements)[inew];
+ ConstBufferView cur_new_sub_image(new_image[cur_new_element.region()]);
+ BinaryDataHistogram new_his;
+ new_his.Compute(cur_new_sub_image);
+ CHECK(new_his.IsValid());
+
+ double best_dist = HUGE_VAL;
+ int best_iold = kUninitIold;
+ bool is_identical = false;
+
+ for (int iold = 0; iold < num_old_elements; ++iold) {
+ const Element& cur_old_element = (*old_elements)[iold];
+ if (cur_old_element.exe_type != cur_new_element.exe_type) {
+ info_out->DeclareTypeMismatch(iold, inew);
+ continue;
+ }
+ if (UnsafeDifference(cur_old_element, cur_new_element)) {
+ info_out->DeclareUnsafeDistance(iold, inew);
+ continue;
+ }
+ double dist = old_his[iold].Distance(new_his);
+ info_out->DeclareCandidate(iold, inew);
+ info_out->OutputCompare(cur_old_element, cur_new_element, dist);
+ if (best_dist > dist) { // Tie resolution: First-one, first-serve.
+ best_iold = iold;
+ best_dist = dist;
+ if (best_dist == 0) {
+ ConstBufferView sub_image(old_image[cur_old_element.region()]);
+ if (sub_image.equals(cur_new_sub_image)) {
+ is_identical = true;
+ break;
+ }
+ }
+ }
+ }
+
+ if (best_iold != kUninitIold) {
+ const Element& best_old_element = (*old_elements)[best_iold];
+ info_out->DeclareMatch(best_iold, inew, best_dist, is_identical);
+ if (is_identical) // Skip "new" if identical match is found.
+ ++num_identical_;
+ else
+ results.push_back({best_iold, inew, best_dist});
+ info_out->OutputMatch(best_old_element, cur_new_element, is_identical,
+ best_dist);
+ }
+ }
+
+ // Populate |matches_| from |result|. To reduce that chance of false-positive
+ // matches, statistics on dists are computed. If a match's |dist| is an
+ // outlier then it is rejected.
+ if (results.size() > 0) {
+ OutlierDetector detector;
+ for (const auto& result : results) {
+ if (result.dist > 0)
+ detector.Add(result.dist);
+ }
+ detector.Prepare();
+ info_out->OutputScores(detector.RenderStats());
+ for (const Results& result : results) {
+ if (detector.DecideOutlier(result.dist) > 0) {
+ info_out->DeclareOutlier(result.iold, result.inew);
+ } else {
+ matches_.push_back(
+ {(*old_elements)[result.iold], (*new_elements)[result.inew]});
+ }
+ }
+ info_out->OutputTextGrid();
+ }
+
+ Trim();
+ return true;
+}
+
+} // namespace zucchini