diff options
author | Frank Barchard <fbarchard@google.com> | 2020-10-12 11:55:18 -0700 |
---|---|---|
committer | XNNPACK Team <xnnpack-github-robot@google.com> | 2020-10-12 11:55:51 -0700 |
commit | 31328cb16419a5950d65c540d210fbf45908586e (patch) | |
tree | 83fb424a5120000bae24767a32a06c09284fac1c /bench/qs8-gemm.cc | |
parent | 0797eb17d3a9b30ec447e8ee4ac6e02c300730d8 (diff) | |
download | XNNPACK-31328cb16419a5950d65c540d210fbf45908586e.tar.gz |
Add RUY benchmark to qs8_gemm_bench
PiperOrigin-RevId: 336711804
Diffstat (limited to 'bench/qs8-gemm.cc')
-rw-r--r-- | bench/qs8-gemm.cc | 96 |
1 files changed, 95 insertions, 1 deletions
diff --git a/bench/qs8-gemm.cc b/bench/qs8-gemm.cc index 7e44f7b04..9d6cdd3ab 100644 --- a/bench/qs8-gemm.cc +++ b/bench/qs8-gemm.cc @@ -16,6 +16,9 @@ #include <cpuinfo.h> #include <benchmark/benchmark.h> +#ifdef BENCHMARK_RUY +#include "ruy/ruy.h" +#endif // BENCHMARK_RUY #include "bench/gemm.h" #include "bench/utils.h" #include <xnnpack/AlignedAllocator.h> @@ -25,7 +28,6 @@ #include <xnnpack/params-init.h> #include <xnnpack/params.h> - static void GEMMBenchmark(benchmark::State& state, xnn_qs8_gemm_ukernel_function gemm, size_t mr, size_t nr, size_t kr, size_t sr, @@ -180,6 +182,94 @@ static void GEMMBenchmark(benchmark::State& state, uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate); } +#ifdef BENCHMARK_RUY +static void RuyBenchmark(benchmark::State& state, size_t threads) +{ + const size_t mc = state.range(0); + const size_t nc = state.range(1); + const size_t kc = state.range(2); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng)); + auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng)); + + const size_t num_buffers = 1 + + benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), + nc * (sizeof(int8_t) * (mc + kc) + sizeof(int32_t))); + + std::vector<int8_t> a(mc * kc); + std::generate(a.begin(), a.end(), std::ref(u8rng)); + std::vector<int8_t> k(num_buffers * nc * kc); + std::generate(k.begin(), k.end(), std::ref(u8rng)); + std::vector<int32_t> b(num_buffers * nc); + std::generate(b.begin(), b.end(), std::ref(i32rng)); + std::vector<int8_t> c(num_buffers * nc * mc); + std::fill(c.begin(), c.end(), std::nanf("")); + + // Note: context must be static to avoid the cost of re-creating it for each benchmark. 
+ static ruy::Context context; + context.set_max_num_threads(threads); + + ruy::Matrix<int8_t> ruy_a; + ruy::MakeSimpleLayout(nc, kc, ruy::Order::kRowMajor, ruy_a.mutable_layout()); + ruy_a.set_zero_point(127); + ruy::Matrix<int8_t> ruy_b; + ruy::MakeSimpleLayout(kc, mc, ruy::Order::kColMajor, ruy_b.mutable_layout()); + ruy_b.set_data(a.data()); + ruy_b.set_zero_point(127); + ruy::Matrix<int8_t> ruy_c; + ruy::MakeSimpleLayout(nc, mc, ruy::Order::kColMajor, ruy_c.mutable_layout()); + ruy_c.set_zero_point(127); + + ruy::MulParams<int32_t, int8_t> mul_params; + mul_params.set_multiplier_fixedpoint(0x40000000); + + // ruy::Context uses deferred initialization, which affects perceived GEMM performance. Initialization happens during + // the first GEMM calls, and per Benoit Jacob it takes up to ~250 milliseconds for performance to stabilize. + // Thus, on the first benchmark, we compute GEMM for 500 milliseconds (to be safe) without recording performance, and + // keep the ruy::Context object initialized (by being static) between subsequent benchmarks. 
+ static std::once_flag warmup; + std::call_once(warmup, [&](){ + auto start = std::chrono::steady_clock::now(); + do { + ruy_a.set_data(k.data()); + ruy_c.set_data(c.data()); + mul_params.set_bias(b.data()); + + ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c); + } while (std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count() < 0.5); + }); + + size_t buffer_index = 0; + for (auto _ : state) { + // Use circular buffers (exceeding cache size) and prefetch to control cache state: + // - A is always in L1 cache (if fits, otherwise L2, L3, etc) + // - K is not in cache (for any cache level) + // - B is not in cache (for any cache level) + // - C is not in cache (for any cache level) + state.PauseTiming(); + benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(int8_t)); + buffer_index = (buffer_index + 1) % num_buffers; + state.ResumeTiming(); + + ruy_a.set_data(k.data() + buffer_index * nc * kc); + ruy_c.set_data(c.data() + buffer_index * mc * nc); + mul_params.set_bias(b.data() + buffer_index * nc); + + ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c); + } + + state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency(); + state.counters["OPS"] = benchmark::Counter( + uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate); +} + +static void ruy_st(benchmark::State& state, const char* net) +{ + RuyBenchmark(state, 1); +} +#endif // BENCHMARK_RUY #if XNN_ARCH_ARM || XNN_ARCH_ARM64 static void qs8_gemm_4x8__neon_mlal_lane(benchmark::State& state, const char* net) { @@ -464,6 +554,10 @@ static void GEMMBenchmark(benchmark::State& state, BENCHMARK_GEMM(qs8_gemm_xw_3x4c8__wasmsimd) #endif // XNN_ARCH_WASMSIMD +#ifdef BENCHMARK_RUY +BENCHMARK_GEMM(ruy_st) +#endif // BENCHMARK_RUY + #ifndef XNNPACK_BENCHMARK_NO_MAIN BENCHMARK_MAIN(); #endif |