diff options
author | Martin Storsjö <martin@martin.st> | 2024-03-28 11:30:41 +0200 |
---|---|---|
committer | Martin Storsjö <martin@martin.st> | 2024-04-02 10:35:29 +0000 |
commit | 5e31720b8902ec9bcf1f3aaa9a135ee34b58af30 (patch) | |
tree | 789461c03738fdc5c1d319c54556603415f57388 | |
parent | abc8a1689fbefec880bb3c0064c66afcb1e9d4b9 (diff) | |
download | libdav1d-5e31720b8902ec9bcf1f3aaa9a135ee34b58af30.tar.gz |
checkasm: Add support for the private macOS kperf API for benchmarking
On AArch64, the performance counter registers usually are
restricted and not accessible from user space.
On macOS, we currently use mach_absolute_time() as timer on
aarch64. This measures wallclock time but with a very coarse
resolution.
There is a private API, kperf, that one can use for getting
high precision timers though. Unfortunately, it requires running
the checkasm binary as root (e.g. with sudo).
Also, as it is a private, undocumented API, it can potentially
change at any time.
Because of these drawbacks, the kperf timer is opt-in: this adds a
new meson build option, macos_kperf (off by default), for switching
to this timer. If the timer source in checkasm could be changed
at runtime with an option, this wouldn't need to be a build time
option.
This allows getting benchmarks like this:
mc_8tap_regular_w16_hv_8bpc_c: 1522.1 ( 1.00x)
mc_8tap_regular_w16_hv_8bpc_neon: 331.8 ( 4.59x)
Instead of this:
mc_8tap_regular_w16_hv_8bpc_c: 9.0 ( 1.00x)
mc_8tap_regular_w16_hv_8bpc_neon: 1.9 ( 4.76x)
Co-authored-by: J. Dekker <jdek@itanimul.li>
-rw-r--r-- | meson.build | 2 | ||||
-rw-r--r-- | meson_options.txt | 5 | ||||
-rw-r--r-- | tests/checkasm/checkasm.c | 83 | ||||
-rw-r--r-- | tests/checkasm/checkasm.h | 3 |
4 files changed, 93 insertions, 0 deletions
diff --git a/meson.build b/meson.build
index e371415..a2637ed 100644
--- a/meson.build
+++ b/meson.build
@@ -81,6 +81,8 @@ cdata.set10('TRIM_DSP_FUNCTIONS', get_option('trim_dsp') == 'true' or
 # Logging option
 cdata.set10('CONFIG_LOG', get_option('logging'))
 
+cdata.set10('CONFIG_MACOS_KPERF', get_option('macos_kperf'))
+
 #
 # OS/Compiler checks and defines
 #
diff --git a/meson_options.txt b/meson_options.txt
index c04deff..b0b45b4 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -68,3 +68,8 @@ option('trim_dsp',
     choices: ['true', 'false', 'if-release'],
     value: 'if-release',
     description: 'Eliminate redundant DSP functions where possible')
+
+option('macos_kperf',
+    type: 'boolean',
+    value: false,
+    description: 'Use the private macOS kperf API for benchmarking')
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 9a01da7..fd11c0d 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -54,6 +54,9 @@
 #include <mach/mach_time.h>
 #endif
 #endif
+#if CONFIG_MACOS_KPERF
+#include <dlfcn.h>
+#endif
 
 #define COLOR_RED 31
 #define COLOR_GREEN 32
@@ -206,6 +209,82 @@ int xor128_rand(void) {
     return w >> 1;
 }
 
+#if CONFIG_MACOS_KPERF
+/* Looked up by kperf_init(); called from checkasm_kperf_cycles(). */
+static int (*kpc_get_thread_counters)(int, unsigned int, void *);
+
+#define CFGWORD_EL0A64EN_MASK (0x20000)
+
+#define CPMU_CORE_CYCLE 0x02
+
+#define KPC_CLASS_FIXED_MASK (1 << 0)
+#define KPC_CLASS_CONFIGURABLE_MASK (1 << 1)
+
+#define COUNTERS_COUNT 10
+#define CONFIG_COUNT 8
+#define KPC_MASK (KPC_CLASS_CONFIGURABLE_MASK | KPC_CLASS_FIXED_MASK)
+/* dlopen()s kperf and sets up counting via its kpc_* calls. Returns 0 on success, 1 on error. */
+static int kperf_init(void) {
+    uint64_t config[COUNTERS_COUNT] = { 0 };
+
+    void *kperf = dlopen("/System/Library/PrivateFrameworks/kperf.framework/kperf", RTLD_LAZY);
+    if (!kperf) {
+        fprintf(stderr, "checkasm: Unable to load kperf: %s\n", dlerror());
+        return 1;
+    }
+
+    int (*kpc_force_all_ctrs_set)(int) = dlsym(kperf, "kpc_force_all_ctrs_set");
+    int (*kpc_set_counting)(uint32_t) = dlsym(kperf, "kpc_set_counting");
+    int (*kpc_set_thread_counting)(uint32_t) = dlsym(kperf, "kpc_set_thread_counting");
+    int (*kpc_set_config)(uint32_t, void *) = dlsym(kperf, "kpc_set_config");
+    uint32_t (*kpc_get_counter_count)(uint32_t) = dlsym(kperf, "kpc_get_counter_count");
+    uint32_t (*kpc_get_config_count)(uint32_t) = dlsym(kperf, "kpc_get_config_count");
+    kpc_get_thread_counters = dlsym(kperf, "kpc_get_thread_counters");
+
+    if (!kpc_get_thread_counters) {
+        fprintf(stderr, "checkasm: Unable to load kpc_get_thread_counters\n");
+        return 1;
+    }
+
+    if (!kpc_get_counter_count || kpc_get_counter_count(KPC_MASK) != COUNTERS_COUNT) {
+        fprintf(stderr, "checkasm: Unexpected kpc_get_counter_count\n");
+        return 1;
+    }
+    if (!kpc_get_config_count || kpc_get_config_count(KPC_MASK) != CONFIG_COUNT) {
+        fprintf(stderr, "checkasm: Unexpected kpc_get_config_count\n");
+        return 1;
+    }
+    /* Only slot 0 is programmed; the remaining config[] entries stay zero. */
+    config[0] = CPMU_CORE_CYCLE | CFGWORD_EL0A64EN_MASK;
+
+    if (!kpc_set_config || kpc_set_config(KPC_MASK, config)) {
+        fprintf(stderr, "checkasm: The kperf API needs to be run as root\n");
+        return 1;
+    }
+    if (!kpc_force_all_ctrs_set || kpc_force_all_ctrs_set(1)) {
+        fprintf(stderr, "checkasm: kpc_force_all_ctrs_set failed\n");
+        return 1;
+    }
+    if (!kpc_set_counting || kpc_set_counting(KPC_MASK)) {
+        fprintf(stderr, "checkasm: kpc_set_counting failed\n");
+        return 1;
+    }
+    if (!kpc_set_thread_counting || kpc_set_thread_counting(KPC_MASK)) {
+        fprintf(stderr, "checkasm: kpc_set_thread_counting failed\n");
+        return 1;
+    }
+    return 0;
+}
+/* Returns entry 0 of kpc_get_thread_counters(), or (uint64_t)-1 if that call fails. */
+uint64_t checkasm_kperf_cycles(void) {
+    uint64_t counters[COUNTERS_COUNT];
+    if (kpc_get_thread_counters(0, COUNTERS_COUNT, counters))
+        return -1;
+
+    return counters[0];
+}
+#endif
+
 static int is_negative(const intfloat u) {
     return u.i >> 31;
 }
@@ -714,6 +793,10 @@ int main(int argc, char *argv[]) {
 
 #ifdef readtime
     if (state.run_mode == RUN_BENCHMARK) {
+#if CONFIG_MACOS_KPERF
+        if (kperf_init())
+            return 1;
+#endif
         if (!checkasm_save_context()) {
             checkasm_set_signal_handler_state(1);
             readtime();
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index eeda5df..8baeec6 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -146,6 +146,9 @@ static inline uint64_t readtime(void) {
 }
 #define readtime readtime
 #endif
+#elif CONFIG_MACOS_KPERF
+uint64_t checkasm_kperf_cycles(void);
+#define readtime() checkasm_kperf_cycles()
 #elif (ARCH_AARCH64 || ARCH_ARM) && defined(__APPLE__)
 #include <mach/mach_time.h>
 #define readtime() mach_absolute_time()