path: root/src/benchmark_runner.cc
author    Mircea Trofin <mtrofin@google.com>  2021-04-28 01:25:29 -0700
committer GitHub <noreply@github.com>         2021-04-28 09:25:29 +0100
commit    376ebc26354ca2b79af94467133f3c35b539627e (patch)
tree      f6b9b93ba9a93945fd58a1dd948b58b77754af32 /src/benchmark_runner.cc
parent    835951aa44c2f802b4d563d533eac34565848eb0 (diff)
download  google-benchmark-376ebc26354ca2b79af94467133f3c35b539627e.tar.gz
Support optional, user-directed collection of performance counters (#1114)
* Support optional, user-directed collection of performance counters

  The patch lets an engineer drill into the root causes of a regression,
  for example. Currently, only single-threaded runs are supported. The
  feature is a build-time opt-in and then a runtime opt-in: the engineer
  may run the benchmark executable, passing a list of performance counter
  names (using libpfm's naming scheme) at the command line. The counter
  values are then collected and reported back as UserCounters.

  This is different from #240 in that it is a benchmark user opt-in, and
  the counter collection is transparent to the benchmark. Currently, this
  is only supported on platforms where libpfm is supported.

  libpfm: http://perfmon2.sourceforge.net/

* 'Use' values param in Snapshot when BENCHMARK_OS_WINDOWS

  This is to avoid an unused-parameter warning-as-error.

* Added missing include for <vector> in perf_counters.cc

* Moved doc to docs

* Added license blurbs
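As a rough illustration of the workflow described above, the sketch below registers an ordinary benchmark and shows how counter collection might be requested at run time. The --benchmark_perf_counters flag name is inferred from FLAGS_benchmark_perf_counters in the diff; the CMake option name and the counter names CYCLES and INSTRUCTIONS are assumptions (libpfm-style examples), not taken from this change.

// Build-time opt-in (option name is an assumption; check the project docs):
//   cmake -DBENCHMARK_ENABLE_LIBPFM=ON .. && cmake --build .
//
// Runtime opt-in, passing libpfm-style counter names (examples only):
//   ./copy_bench --benchmark_perf_counters=CYCLES,INSTRUCTIONS
//
// The collected values are reported alongside the usual timings as
// UserCounters, so the benchmark itself needs no changes.
#include <benchmark/benchmark.h>

#include <cstring>
#include <vector>

static void BM_Copy(benchmark::State& state) {
  std::vector<char> src(state.range(0)), dst(state.range(0));
  for (auto _ : state) {
    std::memcpy(dst.data(), src.data(), static_cast<size_t>(state.range(0)));
    benchmark::ClobberMemory();  // keep the copy from being optimized away
  }
}
BENCHMARK(BM_Copy)->Arg(1 << 16);

BENCHMARK_MAIN();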
Diffstat (limited to 'src/benchmark_runner.cc')
-rw-r--r--  src/benchmark_runner.cc | 29
1 file changed, 23 insertions(+), 6 deletions(-)
diff --git a/src/benchmark_runner.cc b/src/benchmark_runner.cc
index d081aa8..083d184 100644
--- a/src/benchmark_runner.cc
+++ b/src/benchmark_runner.cc
@@ -45,6 +45,7 @@
#include "internal_macros.h"
#include "log.h"
#include "mutex.h"
+#include "perf_counters.h"
#include "re.h"
#include "statistics.h"
#include "string_util.h"
@@ -111,12 +112,14 @@ BenchmarkReporter::Run CreateRunReport(
// Execute one thread of benchmark b for the specified number of iterations.
// Adds the stats collected for the thread into manager->results.
void RunInThread(const BenchmarkInstance* b, IterationCount iters,
- int thread_id, ThreadManager* manager) {
+ int thread_id, ThreadManager* manager,
+ PerfCountersMeasurement* perf_counters_measurement) {
internal::ThreadTimer timer(
b->measure_process_cpu_time
? internal::ThreadTimer::CreateProcessCpuTime()
: internal::ThreadTimer::Create());
- State st = b->Run(iters, thread_id, &timer, manager);
+ State st =
+ b->Run(iters, thread_id, &timer, manager, perf_counters_measurement);
CHECK(st.error_occurred() || st.iterations() >= st.max_iterations)
<< "Benchmark returned before State::KeepRunning() returned false!";
{
@@ -143,7 +146,12 @@ class BenchmarkRunner {
: FLAGS_benchmark_repetitions),
has_explicit_iteration_count(b.iterations != 0),
pool(b.threads - 1),
- iters(has_explicit_iteration_count ? b.iterations : 1) {
+ iters(has_explicit_iteration_count ? b.iterations : 1),
+ perf_counters_measurement(
+ PerfCounters::Create(StrSplit(FLAGS_benchmark_perf_counters, ','))),
+ perf_counters_measurement_ptr(perf_counters_measurement.IsValid()
+ ? &perf_counters_measurement
+ : nullptr) {
run_results.display_report_aggregates_only =
(FLAGS_benchmark_report_aggregates_only ||
FLAGS_benchmark_display_aggregates_only);
@@ -155,6 +163,11 @@ class BenchmarkRunner {
internal::ARM_DisplayReportAggregatesOnly);
run_results.file_report_aggregates_only =
(b.aggregation_report_mode & internal::ARM_FileReportAggregatesOnly);
+ CHECK(b.threads == 1 || !perf_counters_measurement.IsValid())
+ << "Perf counters are not supported in multi-threaded cases.\n";
+ CHECK(FLAGS_benchmark_perf_counters.empty() ||
+ perf_counters_measurement.IsValid())
+ << "Perf counters were requested but could not be set up.";
}
for (int repetition_num = 0; repetition_num < repeats; repetition_num++) {
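A brief aside on the pattern in the constructor above: the measurement object is always constructed, but the rest of the runner only sees it through a pointer that is null when no counters were requested or setup failed. A minimal sketch of that gating idea, using made-up types rather than the library's actual API:

#include <iostream>
#include <string>

// Stand-in for PerfCountersMeasurement; names here are illustrative only.
class FakeCounters {
 public:
  explicit FakeCounters(const std::string& spec) : valid_(!spec.empty()) {}
  bool IsValid() const { return valid_; }

 private:
  bool valid_;
};

void RunOnce(const FakeCounters* counters) {
  if (counters != nullptr) {
    // start counters, run the timed region, stop counters
  } else {
    // run the timed region with no counter overhead at all
  }
}

int main() {
  FakeCounters counters("");  // empty spec -> invalid -> nullptr downstream
  const FakeCounters* ptr = counters.IsValid() ? &counters : nullptr;
  RunOnce(ptr);
  std::cout << (ptr ? "counters enabled\n" : "counters disabled\n");
}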
@@ -192,6 +205,9 @@ class BenchmarkRunner {
// So only the first repetition has to find/calculate it,
// the other repetitions will just use that precomputed iteration count.
+ PerfCountersMeasurement perf_counters_measurement;
+ PerfCountersMeasurement* const perf_counters_measurement_ptr;
+
struct IterationResults {
internal::ThreadManager::Result results;
IterationCount iters;
@@ -206,12 +222,12 @@ class BenchmarkRunner {
// Run all but one thread in separate threads
for (std::size_t ti = 0; ti < pool.size(); ++ti) {
pool[ti] = std::thread(&RunInThread, &b, iters, static_cast<int>(ti + 1),
- manager.get());
+ manager.get(), perf_counters_measurement_ptr);
}
// And run one thread here directly.
// (If we were asked to run just one thread, we don't create new threads.)
// Yes, we need to do this here *after* we start the separate threads.
- RunInThread(&b, iters, 0, manager.get());
+ RunInThread(&b, iters, 0, manager.get(), perf_counters_measurement_ptr);
// The main thread has finished. Now let's wait for the other threads.
manager->WaitForAllThreads();
@@ -331,7 +347,8 @@ class BenchmarkRunner {
memory_manager->Start();
std::unique_ptr<internal::ThreadManager> manager;
manager.reset(new internal::ThreadManager(1));
- RunInThread(&b, memory_iterations, 0, manager.get());
+ RunInThread(&b, memory_iterations, 0, manager.get(),
+ perf_counters_measurement_ptr);
manager->WaitForAllThreads();
manager.reset();