// Copyright 2017 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "components/zucchini/heuristic_ensemble_matcher.h" #include #include #include #include #include #include "base/bind.h" #include "base/logging.h" #include "base/numerics/safe_conversions.h" #include "base/strings/stringprintf.h" #include "components/zucchini/binary_data_histogram.h" #include "components/zucchini/element_detection.h" #include "components/zucchini/image_utils.h" #include "components/zucchini/io_utils.h" namespace zucchini { namespace { /******** Helper Functions ********/ // Uses |detector| to find embedded executables inside |image|, and returns the // result on success, or absl::nullopt on failure, which occurs if too many (> // |kElementLimit|) elements are found. absl::optional> FindEmbeddedElements( ConstBufferView image, const std::string& name, ElementDetector&& detector) { // Maximum number of Elements in a file. This is enforced because our matching // algorithm is O(n^2), which suffices for regular archive files that should // have up to 10's of executable files. An archive containing 100's of // executables is likely pathological, and is rejected to prevent exploits. static constexpr size_t kElementLimit = 256; std::vector elements; ElementFinder element_finder(image, std::move(detector)); for (auto element = element_finder.GetNext(); element.has_value() && elements.size() <= kElementLimit; element = element_finder.GetNext()) { elements.push_back(*element); } if (elements.size() >= kElementLimit) { LOG(WARNING) << name << ": Found too many elements."; return absl::nullopt; } LOG(INFO) << name << ": Found " << elements.size() << " elements."; return elements; } // Determines whether a proposed comparison between Elements should be rejected // early, to decrease the likelihood of creating false-positive matches, which // may be costly for patching. Our heuristic simply prohibits big difference in // size (relative and absolute) between matched elements. bool UnsafeDifference(const Element& old_element, const Element& new_element) { static constexpr double kMaxBloat = 2.0; static constexpr size_t kMinWorrysomeDifference = 2 << 20; // 2MB size_t lo_size = std::min(old_element.size, new_element.size); size_t hi_size = std::max(old_element.size, new_element.size); if (hi_size - lo_size < kMinWorrysomeDifference) return false; if (hi_size < lo_size * kMaxBloat) return false; return true; } std::ostream& operator<<(std::ostream& stream, const Element& elt) { stream << "(" << CastExecutableTypeToString(elt.exe_type) << ", " << AsHex<8, size_t>(elt.offset) << " +" << AsHex<8, size_t>(elt.size) << ")"; return stream; } /******** MatchingInfoOut ********/ // A class to output detailed information during ensemble matching. Extracting // the functionality to a separate class decouples formatting and printing logic // from matching logic. The base class consists of stubs. class MatchingInfoOut { protected: MatchingInfoOut() = default; MatchingInfoOut(const MatchingInfoOut&) = delete; const MatchingInfoOut& operator=(const MatchingInfoOut&) = delete; public: virtual ~MatchingInfoOut() = default; virtual void InitSizes(size_t old_size, size_t new_size) {} virtual void DeclareTypeMismatch(int iold, int inew) {} virtual void DeclareUnsafeDistance(int iold, int inew) {} virtual void DeclareCandidate(int iold, int inew) {} virtual void DeclareMatch(int iold, int inew, double dist, bool is_identical) {} virtual void DeclareOutlier(int iold, int inew) {} virtual void OutputCompare(const Element& old_element, const Element& new_element, double dist) {} virtual void OutputMatch(const Element& best_old_element, const Element& new_element, bool is_identical, double best_dist) {} virtual void OutputScores(const std::string& stats) {} virtual void OutputTextGrid() {} }; /******** MatchingInfoTerse ********/ // A terse MatchingInfoOut that prints only basic information, using LOG(). class MatchingInfoOutTerse : public MatchingInfoOut { public: MatchingInfoOutTerse() = default; MatchingInfoOutTerse(const MatchingInfoOutTerse&) = delete; const MatchingInfoOutTerse& operator=(const MatchingInfoOutTerse&) = delete; ~MatchingInfoOutTerse() override = default; void OutputScores(const std::string& stats) override { LOG(INFO) << "Best dists: " << stats; } }; /******** MatchingInfoOutVerbose ********/ // A verbose MatchingInfoOut that prints detailed information using |out_|, // including comparison pairs, scores, and a text grid representation of // pairwise matching results. class MatchingInfoOutVerbose : public MatchingInfoOut { public: explicit MatchingInfoOutVerbose(std::ostream& out) : out_(out) {} MatchingInfoOutVerbose(const MatchingInfoOutVerbose&) = delete; const MatchingInfoOutVerbose& operator=(const MatchingInfoOutVerbose&) = delete; ~MatchingInfoOutVerbose() override = default; // Outputs sizes and initializes |text_grid_|. void InitSizes(size_t old_size, size_t new_size) override { out_ << "Comparing old (" << old_size << " elements) and new (" << new_size << " elements)" << std::endl; text_grid_.assign(new_size, std::string(old_size, '-')); best_dist_.assign(new_size, -1.0); } // Functions to update match status in text grid representation. void DeclareTypeMismatch(int iold, int inew) override { text_grid_[inew][iold] = 'T'; } void DeclareUnsafeDistance(int iold, int inew) override { text_grid_[inew][iold] = 'U'; } void DeclareCandidate(int iold, int inew) override { text_grid_[inew][iold] = 'C'; // Provisional. } void DeclareMatch(int iold, int inew, double dist, bool is_identical) override { text_grid_[inew][iold] = is_identical ? 'I' : 'M'; best_dist_[inew] = dist; } void DeclareOutlier(int iold, int inew) override { text_grid_[inew][iold] = 'O'; } // Functions to print detailed information. void OutputCompare(const Element& old_element, const Element& new_element, double dist) override { out_ << "Compare old" << old_element << " to new" << new_element << " --> " << base::StringPrintf("%.5f", dist) << std::endl; } void OutputMatch(const Element& best_old_element, const Element& new_element, bool is_identical, double best_dist) override { if (is_identical) { out_ << "Skipped old" << best_old_element << " - identical to new" << new_element; } else { out_ << "Matched old" << best_old_element << " to new" << new_element << " --> " << base::StringPrintf("%.5f", best_dist); } out_ << std::endl; } void OutputScores(const std::string& stats) override { out_ << "Best dists: " << stats << std::endl; } void OutputTextGrid() override { int new_size = static_cast(text_grid_.size()); for (int inew = 0; inew < new_size; ++inew) { const std::string& line = text_grid_[inew]; out_ << " "; for (char ch : line) { char prefix = (ch == 'I' || ch == 'M') ? '(' : ' '; char suffix = (ch == 'I' || ch == 'M') ? ')' : ' '; out_ << prefix << ch << suffix; } if (best_dist_[inew] >= 0) out_ << " " << base::StringPrintf("%.5f", best_dist_[inew]); out_ << std::endl; } if (!text_grid_.empty()) { out_ << " Legend: I = identical, M = matched, T = type mismatch, " "U = unsafe distance, C = candidate, O = outlier, - = skipped." << std::endl; } } private: std::ostream& out_; // Text grid representation of matches. Rows correspond to "old" and columns // correspond to "new". std::vector text_grid_; // For each "new" element, distance of best match. -1 denotes no match. std::vector best_dist_; }; } // namespace /******** HeuristicEnsembleMatcher ********/ HeuristicEnsembleMatcher::HeuristicEnsembleMatcher(std::ostream* out) : out_(out) {} HeuristicEnsembleMatcher::~HeuristicEnsembleMatcher() = default; bool HeuristicEnsembleMatcher::RunMatch(ConstBufferView old_image, ConstBufferView new_image) { DCHECK(matches_.empty()); LOG(INFO) << "Start matching."; // Find all elements in "old" and "new". absl::optional> old_elements = FindEmbeddedElements(old_image, "Old file", base::BindRepeating(DetectElementFromDisassembler)); if (!old_elements.has_value()) return false; absl::optional> new_elements = FindEmbeddedElements(new_image, "New file", base::BindRepeating(DetectElementFromDisassembler)); if (!new_elements.has_value()) return false; std::unique_ptr info_out; if (out_) info_out = std::make_unique(*out_); else info_out = std::make_unique(); const int num_new_elements = base::checked_cast(new_elements->size()); const int num_old_elements = base::checked_cast(old_elements->size()); info_out->InitSizes(num_old_elements, num_new_elements); // For each "new" element, match it with the "old" element that's nearest to // it, with distance determined by BinaryDataHistogram. The resulting // "old"-"new" pairs are stored into |results|. Possibilities: // - Type mismatch: No match. // - UnsafeDifference() heuristics fail: No match. // - Identical match: Skip "new" since this is a trivial case. // - Non-identical match: Match "new" with "old" with min distance. // - No match: Skip "new". struct Results { int iold; int inew; double dist; }; std::vector results; // Precompute histograms for "old" since they get reused. std::vector old_his(num_old_elements); for (int iold = 0; iold < num_old_elements; ++iold) { ConstBufferView sub_image(old_image[(*old_elements)[iold]]); old_his[iold].Compute(sub_image); // ProgramDetector should have imposed minimal size limit to |sub_image|. // Therefore resulting histogram are expected to be valid. CHECK(old_his[iold].IsValid()); } const int kUninitIold = num_old_elements; for (int inew = 0; inew < num_new_elements; ++inew) { const Element& cur_new_element = (*new_elements)[inew]; ConstBufferView cur_new_sub_image(new_image[cur_new_element.region()]); BinaryDataHistogram new_his; new_his.Compute(cur_new_sub_image); CHECK(new_his.IsValid()); double best_dist = HUGE_VAL; int best_iold = kUninitIold; bool is_identical = false; for (int iold = 0; iold < num_old_elements; ++iold) { const Element& cur_old_element = (*old_elements)[iold]; if (cur_old_element.exe_type != cur_new_element.exe_type) { info_out->DeclareTypeMismatch(iold, inew); continue; } if (UnsafeDifference(cur_old_element, cur_new_element)) { info_out->DeclareUnsafeDistance(iold, inew); continue; } double dist = old_his[iold].Distance(new_his); info_out->DeclareCandidate(iold, inew); info_out->OutputCompare(cur_old_element, cur_new_element, dist); if (best_dist > dist) { // Tie resolution: First-one, first-serve. best_iold = iold; best_dist = dist; if (best_dist == 0) { ConstBufferView sub_image(old_image[cur_old_element.region()]); if (sub_image.equals(cur_new_sub_image)) { is_identical = true; break; } } } } if (best_iold != kUninitIold) { const Element& best_old_element = (*old_elements)[best_iold]; info_out->DeclareMatch(best_iold, inew, best_dist, is_identical); if (is_identical) // Skip "new" if identical match is found. ++num_identical_; else results.push_back({best_iold, inew, best_dist}); info_out->OutputMatch(best_old_element, cur_new_element, is_identical, best_dist); } } // Populate |matches_| from |result|. To reduce that chance of false-positive // matches, statistics on dists are computed. If a match's |dist| is an // outlier then it is rejected. if (results.size() > 0) { OutlierDetector detector; for (const auto& result : results) { if (result.dist > 0) detector.Add(result.dist); } detector.Prepare(); info_out->OutputScores(detector.RenderStats()); for (const Results& result : results) { if (detector.DecideOutlier(result.dist) > 0) { info_out->DeclareOutlier(result.iold, result.inew); } else { matches_.push_back( {(*old_elements)[result.iold], (*new_elements)[result.inew]}); } } info_out->OutputTextGrid(); } Trim(); return true; } } // namespace zucchini