From d0fbf8ac238d3bc3a111334ab3553173812d78e7 Mon Sep 17 00:00:00 2001
From: Liqiang TAO <taolq@outlook.com>
Date: Tue, 25 Jan 2022 19:14:20 +0900
Subject: Cache PerfCounters instance in PerfCountersMeasurement (#1308)

This patch fixes #1306, by reducing the pinned instances of
PerfCounters.

The issue is caused by creating multiple pinned events in the
same thread, doing so results in the Snapshot(PerfCounterValues* values)
failing, and that's now discoverable.
Creating multile pinned events is an unsupported behavior currently.
The error would be detected at read() time, not
perf_event_open() / iotcl() time.

The unsupported benavior above is confirmed by Stephane Eranian @seranian,
and he also pointed the dectection method.

Finished this patch under the guidance of Mircea Trofin @mtrofin.
---
 src/benchmark.cc            |  5 ++++-
 src/benchmark_runner.cc     |  3 +--
 src/perf_counters.cc        | 42 ++++++++++++++++++++++++++++++++++--
 src/perf_counters.h         | 52 +++++++++++++++++++++++++++------------------
 test/perf_counters_gtest.cc | 48 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 124 insertions(+), 26 deletions(-)
diff --git a/src/benchmark.cc b/src/benchmark.cc
index cedeee3..1a76b58 100644
--- a/src/benchmark.cc
+++ b/src/benchmark.cc
@@ -184,7 +184,10 @@ void State::PauseTiming() {
   BM_CHECK(started_ && !finished_ && !error_occurred_);
   timer_->StopTimer();
   if (perf_counters_measurement_) {
-    auto measurements = perf_counters_measurement_->StopAndGetMeasurements();
+    std::vector<std::pair<std::string, double>> measurements;
+    if (!perf_counters_measurement_->Stop(measurements)) {
+      BM_CHECK(false) << "Perf counters read the value failed.";
+    }
     for (const auto& name_and_measurement : measurements) {
       auto name = name_and_measurement.first;
       auto measurement = name_and_measurement.second;
diff --git a/src/benchmark_runner.cc b/src/benchmark_runner.cc
index eac807b..ec7c127 100644
--- a/src/benchmark_runner.cc
+++ b/src/benchmark_runner.cc
@@ -152,8 +152,7 @@ BenchmarkRunner::BenchmarkRunner(
       has_explicit_iteration_count(b.iterations() != 0),
       pool(b.threads() - 1),
       iters(has_explicit_iteration_count ? b.iterations() : 1),
-      perf_counters_measurement(
-          PerfCounters::Create(StrSplit(FLAGS_benchmark_perf_counters, ','))),
+      perf_counters_measurement(StrSplit(FLAGS_benchmark_perf_counters, ',')),
       perf_counters_measurement_ptr(perf_counters_measurement.IsValid()
                                         ? &perf_counters_measurement
                                         : nullptr) {
diff --git a/src/perf_counters.cc b/src/perf_counters.cc
index b2ac768..582475f 100644
--- a/src/perf_counters.cc
+++ b/src/perf_counters.cc
@@ -15,6 +15,7 @@
 #include "perf_counters.h"
 
 #include <cstring>
+#include <memory>
 #include <vector>
 
 #if defined HAVE_LIBPFM
@@ -104,7 +105,7 @@ PerfCounters PerfCounters::Create(
   return PerfCounters(counter_names, std::move(counter_ids));
 }
 
-PerfCounters::~PerfCounters() {
+void PerfCounters::CloseCounters() const {
   if (counter_ids_.empty()) {
     return;
   }
@@ -126,7 +127,44 @@ PerfCounters PerfCounters::Create(
   return NoCounters();
 }
 
-PerfCounters::~PerfCounters() = default;
+void PerfCounters::CloseCounters() const {}
 #endif  // defined HAVE_LIBPFM
+
+Mutex PerfCountersMeasurement::mutex_;
+int PerfCountersMeasurement::ref_count_ = 0;
+PerfCounters PerfCountersMeasurement::counters_ = PerfCounters::NoCounters();
+
+PerfCountersMeasurement::PerfCountersMeasurement(
+    const std::vector<std::string>& counter_names)
+    : start_values_(counter_names.size()), end_values_(counter_names.size()) {
+  MutexLock l(mutex_);
+  if (ref_count_ == 0) {
+    counters_ = PerfCounters::Create(counter_names);
+  }
+  // We chose to increment it even if `counters_` ends up invalid,
+  // so that we don't keep trying to create, and also since the dtor
+  // will decrement regardless of `counters_`'s validity
+  ++ref_count_;
+
+  BM_CHECK(!counters_.IsValid() || counters_.names() == counter_names);
+}
+
+PerfCountersMeasurement::~PerfCountersMeasurement() {
+  MutexLock l(mutex_);
+  --ref_count_;
+  if (ref_count_ == 0) {
+    counters_ = PerfCounters::NoCounters();
+  }
+}
+
+PerfCounters& PerfCounters::operator=(PerfCounters&& other) noexcept {
+  if (this != &other) {
+    CloseCounters();
+
+    counter_ids_ = std::move(other.counter_ids_);
+    counter_names_ = std::move(other.counter_names_);
+  }
+  return *this;
+}
 }  // namespace internal
 }  // namespace benchmark
diff --git a/src/perf_counters.h b/src/perf_counters.h
index 47ca138..9a6e46c 100644
--- a/src/perf_counters.h
+++ b/src/perf_counters.h
@@ -17,11 +17,13 @@
 
 #include <array>
 #include <cstdint>
+#include <memory>
 #include <vector>
 
 #include "benchmark/benchmark.h"
 #include "check.h"
 #include "log.h"
+#include "mutex.h"
 
 #ifndef BENCHMARK_OS_WINDOWS
 #include <unistd.h>
@@ -71,12 +73,14 @@ class PerfCounters final {
   // True iff this platform supports performance counters.
   static const bool kSupported;
 
-  bool IsValid() const { return is_valid_; }
+  bool IsValid() const { return !counter_names_.empty(); }
   static PerfCounters NoCounters() { return PerfCounters(); }
 
-  ~PerfCounters();
+  ~PerfCounters() { CloseCounters(); }
   PerfCounters(PerfCounters&&) = default;
   PerfCounters(const PerfCounters&) = delete;
+  PerfCounters& operator=(PerfCounters&&) noexcept;
+  PerfCounters& operator=(const PerfCounters&) = delete;
 
   // Platform-specific implementations may choose to do some library
   // initialization here.
@@ -111,24 +115,27 @@ class PerfCounters final {
  private:
   PerfCounters(const std::vector<std::string>& counter_names,
                std::vector<int>&& counter_ids)
-      : counter_ids_(std::move(counter_ids)),
-        counter_names_(counter_names),
-        is_valid_(true) {}
-  PerfCounters() : is_valid_(false) {}
+      : counter_ids_(std::move(counter_ids)), counter_names_(counter_names) {}
+  PerfCounters() = default;
+
+  void CloseCounters() const;
 
   std::vector<int> counter_ids_;
-  const std::vector<std::string> counter_names_;
-  const bool is_valid_;
+  std::vector<std::string> counter_names_;
 };
 
 // Typical usage of the above primitives.
 class PerfCountersMeasurement final {
  public:
-  PerfCountersMeasurement(PerfCounters&& c)
-      : counters_(std::move(c)),
-        start_values_(counters_.IsValid() ? counters_.names().size() : 0),
-        end_values_(counters_.IsValid() ? counters_.names().size() : 0) {}
-
+  PerfCountersMeasurement(const std::vector<std::string>& counter_names);
+  ~PerfCountersMeasurement();
+
+  // The only way to get to `counters_` is after ctor-ing a
+  // `PerfCountersMeasurement`, which means that `counters_`'s state is, here,
+  // decided (either invalid or valid) and won't change again even if a ctor is
+  // concurrently running with this. This is preferring efficiency to
+  // maintainability, because the address of the static can be known at compile
+  // time.
   bool IsValid() const { return counters_.IsValid(); }
 
   BENCHMARK_ALWAYS_INLINE void Start() {
@@ -136,30 +143,33 @@ class PerfCountersMeasurement final {
     // Tell the compiler to not move instructions above/below where we take
     // the snapshot.
     ClobberMemory();
-    counters_.Snapshot(&start_values_);
+    valid_read_ &= counters_.Snapshot(&start_values_);
     ClobberMemory();
   }
 
-  BENCHMARK_ALWAYS_INLINE std::vector<std::pair<std::string, double>>
-  StopAndGetMeasurements() {
+  BENCHMARK_ALWAYS_INLINE bool Stop(
+      std::vector<std::pair<std::string, double>>& measurements) {
     assert(IsValid());
     // Tell the compiler to not move instructions above/below where we take
     // the snapshot.
     ClobberMemory();
-    counters_.Snapshot(&end_values_);
+    valid_read_ &= counters_.Snapshot(&end_values_);
     ClobberMemory();
 
-    std::vector<std::pair<std::string, double>> ret;
     for (size_t i = 0; i < counters_.names().size(); ++i) {
       double measurement = static_cast<double>(end_values_[i]) -
                            static_cast<double>(start_values_[i]);
-      ret.push_back({counters_.names()[i], measurement});
+      measurements.push_back({counters_.names()[i], measurement});
     }
-    return ret;
+
+    return valid_read_;
   }
 
  private:
-  PerfCounters counters_;
+  static Mutex mutex_;
+  GUARDED_BY(mutex_) static int ref_count_;
+  GUARDED_BY(mutex_) static PerfCounters counters_;
+  bool valid_read_ = true;
   PerfCounterValues start_values_;
   PerfCounterValues end_values_;
 };
diff --git a/test/perf_counters_gtest.cc b/test/perf_counters_gtest.cc
index 3eac624..f9e6a6f 100644
--- a/test/perf_counters_gtest.cc
+++ b/test/perf_counters_gtest.cc
@@ -11,6 +11,7 @@ struct MsgHandler {
 #endif
 
 using benchmark::internal::PerfCounters;
+using benchmark::internal::PerfCountersMeasurement;
 using benchmark::internal::PerfCounterValues;
 
 namespace {
@@ -95,6 +96,53 @@ TEST(PerfCountersTest, Read2Counters) {
   EXPECT_GT(values2[1], 0);
 }
 
+TEST(PerfCountersTest, ReopenExistingCounters) {
+  // The test works (i.e. causes read to fail) for the assumptions
+  // about hardware capabilities (i.e. small number (3-4) hardware
+  // counters) at this date.
+  if (!PerfCounters::kSupported) {
+    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+  std::vector<PerfCounters> counters;
+  counters.reserve(6);
+  for (int i = 0; i < 6; i++)
+    counters.push_back(PerfCounters::Create({kGenericPerfEvent1}));
+  PerfCounterValues values(1);
+  EXPECT_TRUE(counters[0].Snapshot(&values));
+  EXPECT_FALSE(counters[4].Snapshot(&values));
+  EXPECT_FALSE(counters[5].Snapshot(&values));
+}
+
+TEST(PerfCountersTest, CreateExistingMeasurements) {
+  // The test works (i.e. causes read to fail) for the assumptions
+  // about hardware capabilities (i.e. small number (3-4) hardware
+  // counters) at this date,
+  // the same as previous test ReopenExistingCounters.
+  if (!PerfCounters::kSupported) {
+    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+  std::vector<PerfCountersMeasurement> perf_counter_measurements;
+  std::vector<std::pair<std::string, double>> measurements;
+
+  perf_counter_measurements.reserve(10);
+  for (int i = 0; i < 10; i++)
+    perf_counter_measurements.emplace_back(
+        std::vector<std::string>{kGenericPerfEvent1});
+
+  perf_counter_measurements[0].Start();
+  EXPECT_TRUE(perf_counter_measurements[0].Stop(measurements));
+
+  measurements.clear();
+  perf_counter_measurements[8].Start();
+  EXPECT_FALSE(perf_counter_measurements[8].Stop(measurements));
+
+  measurements.clear();
+  perf_counter_measurements[9].Start();
+  EXPECT_FALSE(perf_counter_measurements[9].Stop(measurements));
+}
+
 size_t do_work() {
   size_t res = 0;
   for (size_t i = 0; i < 100000000; ++i) res += i * i;
-- 
cgit v1.2.3