author    Frank Barchard <fbarchard@google.com>  2020-10-12 11:55:18 -0700
committer XNNPACK Team <xnnpack-github-robot@google.com>  2020-10-12 11:55:51 -0700
commit    31328cb16419a5950d65c540d210fbf45908586e (patch)
tree      83fb424a5120000bae24767a32a06c09284fac1c /bench/qs8-gemm.cc
parent    0797eb17d3a9b30ec447e8ee4ac6e02c300730d8 (diff)
download  XNNPACK-31328cb16419a5950d65c540d210fbf45908586e.tar.gz
Add RUY benchmark to qs8_gemm_bench
PiperOrigin-RevId: 336711804
Diffstat (limited to 'bench/qs8-gemm.cc')
-rw-r--r--  bench/qs8-gemm.cc  96
1 file changed, 95 insertions(+), 1 deletion(-)
diff --git a/bench/qs8-gemm.cc b/bench/qs8-gemm.cc
index 7e44f7b04..9d6cdd3ab 100644
--- a/bench/qs8-gemm.cc
+++ b/bench/qs8-gemm.cc
@@ -16,6 +16,9 @@
#include <cpuinfo.h>
#include <benchmark/benchmark.h>
+#ifdef BENCHMARK_RUY
+#include "ruy/ruy.h"
+#endif // BENCHMARK_RUY
#include "bench/gemm.h"
#include "bench/utils.h"
#include <xnnpack/AlignedAllocator.h>
@@ -25,7 +28,6 @@
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>
-
static void GEMMBenchmark(benchmark::State& state,
xnn_qs8_gemm_ukernel_function gemm,
size_t mr, size_t nr, size_t kr, size_t sr,
@@ -180,6 +182,94 @@ static void GEMMBenchmark(benchmark::State& state,
uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}
+#ifdef BENCHMARK_RUY
+static void RuyBenchmark(benchmark::State& state, size_t threads)
+{
+ const size_t mc = state.range(0);
+ const size_t nc = state.range(1);
+ const size_t kc = state.range(2);
+
+ std::random_device random_device;
+ auto rng = std::mt19937(random_device());
+ auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
+ auto i8rng = std::bind(std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()), std::ref(rng));
+
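+ // Size the buffer pool so that rotating through num_buffers copies of the weights, biases, and outputs
+ // pushes them out of every cache level between iterations (see the cache-state comment in the loop below).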
+ const size_t num_buffers = 1 +
+ benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
+ nc * (sizeof(int8_t) * (mc + kc) + sizeof(int32_t)));
+
+ std::vector<int8_t> a(mc * kc);
+ std::generate(a.begin(), a.end(), std::ref(i8rng));
+ std::vector<int8_t> k(num_buffers * nc * kc);
+ std::generate(k.begin(), k.end(), std::ref(i8rng));
+ std::vector<int32_t> b(num_buffers * nc);
+ std::generate(b.begin(), b.end(), std::ref(i32rng));
+ std::vector<int8_t> c(num_buffers * nc * mc);
+ std::fill(c.begin(), c.end(), static_cast<int8_t>(0xA5));  // int8 outputs cannot hold NaN; poison the buffer with a byte sentinel instead
+
+ // Note: context must be static to avoid the cost of re-creating it for each benchmark.
+ static ruy::Context context;
+ context.set_max_num_threads(threads);
+
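+ // Ruy computes C (nc x mc) = K (nc x kc) * A (kc x mc): the weights are the left-hand, row-major matrix
+ // and the activations the right-hand, column-major one, so C is the transpose of the usual mc x nc output.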
+ ruy::Matrix<int8_t> ruy_a;
+ ruy::MakeSimpleLayout(nc, kc, ruy::Order::kRowMajor, ruy_a.mutable_layout());
+ ruy_a.set_zero_point(127);
+ ruy::Matrix<int8_t> ruy_b;
+ ruy::MakeSimpleLayout(kc, mc, ruy::Order::kColMajor, ruy_b.mutable_layout());
+ ruy_b.set_data(a.data());
+ ruy_b.set_zero_point(127);
+ ruy::Matrix<int8_t> ruy_c;
+ ruy::MakeSimpleLayout(nc, mc, ruy::Order::kColMajor, ruy_c.mutable_layout());
+ ruy_c.set_zero_point(127);
+
+ ruy::MulParams<int32_t, int8_t> mul_params;
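+ // 0x40000000 is 0.5 in Q31 fixed point: requantization halves the int32 accumulators before they are
+ // clamped and stored as int8; no output shift or per-channel multipliers are exercised here.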
+ mul_params.set_multiplier_fixedpoint(0x40000000);
+
+ // ruy::Context uses deferred initialization, which affects perceived GEMM performance. Initialization happens
+ // during the first GEMM calls, and per Benoit Jacob it takes up to ~250 milliseconds for performance to
+ // stabilize. Thus, on the first benchmark we run GEMM for 500 milliseconds (to be safe) without recording
+ // performance, and keep the ruy::Context object initialized (it is static) across subsequent benchmarks.
+ static std::once_flag warmup;
+ std::call_once(warmup, [&](){
+ auto start = std::chrono::steady_clock::now();
+ do {
+ ruy_a.set_data(k.data());
+ ruy_c.set_data(c.data());
+ mul_params.set_bias(b.data());
+
+ ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
+ } while (std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count() < 0.5);
+ });
+
+ size_t buffer_index = 0;
+ for (auto _ : state) {
+ // Use circular buffers (exceeding cache size) and prefetch to control cache state:
+ // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
+ // - K is not in cache (for any cache level)
+ // - B is not in cache (for any cache level)
+ // - C is not in cache (for any cache level)
+ state.PauseTiming();
+ benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(int8_t));
+ buffer_index = (buffer_index + 1) % num_buffers;
+ state.ResumeTiming();
+
+ ruy_a.set_data(k.data() + buffer_index * nc * kc);
+ ruy_c.set_data(c.data() + buffer_index * mc * nc);
+ mul_params.set_bias(b.data() + buffer_index * nc);
+
+ ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
+ }
+
+ state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+ state.counters["OPS"] = benchmark::Counter(
+ uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
+}
+
+static void ruy_st(benchmark::State& state, const char* net)
+{
+ RuyBenchmark(state, 1);
+}
+#endif // BENCHMARK_RUY
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
static void qs8_gemm_4x8__neon_mlal_lane(benchmark::State& state, const char* net) {
@@ -464,6 +554,10 @@ static void GEMMBenchmark(benchmark::State& state,
BENCHMARK_GEMM(qs8_gemm_xw_3x4c8__wasmsimd)
#endif // XNN_ARCH_WASMSIMD
+#ifdef BENCHMARK_RUY
+BENCHMARK_GEMM(ruy_st)
+#endif // BENCHMARK_RUY
+
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
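For reproducing the comparison: the RUY path compiles only when BENCHMARK_RUY is defined, so the macro and
a dependency on ruy must come from the build (the build-rule change is not part of this diff; a hypothetical
-DBENCHMARK_RUY compile definition is one way to supply it). Once built, Google Benchmark's standard filter
flag selects the new single-threaded RUY baseline (binary name assumed here):

  ./qs8-gemm-bench --benchmark_filter=ruy_st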