From 762038c5e1cd79b6d9529bae5a63d30640602df7 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Tue, 23 Aug 2022 11:08:41 -0700 Subject: Aarch32 filterbank-accumulate assembly PiperOrigin-RevId: 469506037 --- BUILD.bazel | 6 ++- CMakeLists.txt | 8 +-- bench/u32-filterbank-accumulate.cc | 6 ++- src/u32-filterbank-accumulate/aarch32-neon-x1.S | 58 +++++++++++++++++++++ src/u32-filterbank-accumulate/aarch32-neon-x2.S | 69 +++++++++++++++++++++++++ src/xnnpack/filterbank.h | 2 + test/u32-filterbank-accumulate.cc | 64 ++++++++++++++++------- test/u32-filterbank-accumulate.yaml | 4 ++ tools/generate-filterbank-accumulate-test.py | 16 +++--- 9 files changed, 201 insertions(+), 32 deletions(-) create mode 100644 src/u32-filterbank-accumulate/aarch32-neon-x1.S create mode 100644 src/u32-filterbank-accumulate/aarch32-neon-x2.S diff --git a/BUILD.bazel b/BUILD.bazel index 63feeb4ec..e0efc26b1 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -618,11 +618,11 @@ PROD_SCALAR_RISCV_MICROKERNEL_SRCS = [ ] ALL_SCALAR_MICROKERNEL_SRCS = [ - "src/cs16-bfly4/scalar-m1.c", "src/cs16-bfly4/gen/scalar-x1.c", "src/cs16-bfly4/gen/scalar-x2.c", "src/cs16-bfly4/gen/scalar-x3.c", "src/cs16-bfly4/gen/scalar-x4.c", + "src/cs16-bfly4/scalar-m1.c", "src/cs16-fftr/gen/scalar-x1.c", "src/cs16-fftr/gen/scalar-x2.c", "src/cs16-fftr/gen/scalar-x3.c", @@ -1050,8 +1050,8 @@ ALL_SCALAR_MICROKERNEL_SRCS = [ "src/math/sqrt-u32-scalar-clz-newton.c", "src/math/sqrt-u32-scalar-cvti32-sqrt-lrint.c", "src/math/sqrt-u32-scalar-cvti64-sqrt-lrint.c", - "src/math/sqrt-u32-scalar-cvtu32-sqrt-lrint.c", "src/math/sqrt-u32-scalar-cvti64-sqrtf-lrintf.c", + "src/math/sqrt-u32-scalar-cvtu32-sqrt-lrint.c", "src/math/sqrt-u32-scalar-cvtu32-sqrtf-lrintf.c", "src/math/sqrt-u32-scalar-hashemian.c", "src/math/sqrt-u32-scalar-tflm.c", @@ -8309,6 +8309,8 @@ AARCH32_ASM_MICROKERNEL_SRCS = [ "src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S", 
"src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S", "src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S", + "src/u32-filterbank-accumulate/aarch32-neon-x1.S", + "src/u32-filterbank-accumulate/aarch32-neon-x2.S", ] AARCH64_ASM_MICROKERNEL_SRCS = [ diff --git a/CMakeLists.txt b/CMakeLists.txt index 664c0e95e..2ff87f4ed 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -606,11 +606,11 @@ SET(PROD_SCALAR_RISCV_MICROKERNEL_SRCS src/x32-zip/x4-scalar.c) SET(ALL_SCALAR_MICROKERNEL_SRCS - src/cs16-bfly4/scalar-m1.c src/cs16-bfly4/gen/scalar-x1.c src/cs16-bfly4/gen/scalar-x2.c src/cs16-bfly4/gen/scalar-x3.c src/cs16-bfly4/gen/scalar-x4.c + src/cs16-bfly4/scalar-m1.c src/cs16-fftr/gen/scalar-x1.c src/cs16-fftr/gen/scalar-x2.c src/cs16-fftr/gen/scalar-x3.c @@ -1038,8 +1038,8 @@ SET(ALL_SCALAR_MICROKERNEL_SRCS src/math/sqrt-u32-scalar-clz-newton.c src/math/sqrt-u32-scalar-cvti32-sqrt-lrint.c src/math/sqrt-u32-scalar-cvti64-sqrt-lrint.c - src/math/sqrt-u32-scalar-cvtu32-sqrt-lrint.c src/math/sqrt-u32-scalar-cvti64-sqrtf-lrintf.c + src/math/sqrt-u32-scalar-cvtu32-sqrt-lrint.c src/math/sqrt-u32-scalar-cvtu32-sqrtf-lrintf.c src/math/sqrt-u32-scalar-hashemian.c src/math/sqrt-u32-scalar-tflm.c @@ -6750,7 +6750,9 @@ SET(AARCH32_ASM_MICROKERNEL_SRCS src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-ld64.S src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S - src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S) + src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S + src/u32-filterbank-accumulate/aarch32-neon-x1.S + src/u32-filterbank-accumulate/aarch32-neon-x2.S) SET(AARCH64_ASM_MICROKERNEL_SRCS src/f16-gemm/gen-inc/1x8inc-minmax-aarch64-neonfp16arith-ld64.S diff --git a/bench/u32-filterbank-accumulate.cc b/bench/u32-filterbank-accumulate.cc index a5c0a9a52..ba064f945 100644 --- 
a/bench/u32-filterbank-accumulate.cc +++ b/bench/u32-filterbank-accumulate.cc @@ -53,7 +53,6 @@ static void BenchmarkKernelSize(benchmark::internal::Benchmark* b) { b->ArgNames({"rows", "batch"}); b->Args({1, 237}); - b->Args({5, 1}); b->Args({10, 2}); b->Args({7, 3}); @@ -68,6 +67,11 @@ static void BenchmarkKernelSize(benchmark::internal::Benchmark* b) b->Args({1, 13}); } +#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY +BENCHMARK_CAPTURE(filterbank_accumulate, u32_aarch32_neon_x1, xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1, benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime(); +BENCHMARK_CAPTURE(filterbank_accumulate, u32_aarch32_neon_x2, xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2, benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime(); +#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY + #if XNN_ARCH_ARM || XNN_ARCH_ARM64 BENCHMARK_CAPTURE(filterbank_accumulate, u32_neon_x1, xnn_u32_filterbank_accumulate_ukernel__neon_x1, benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime(); BENCHMARK_CAPTURE(filterbank_accumulate, u32_neon_x2, xnn_u32_filterbank_accumulate_ukernel__neon_x2, benchmark::utils::CheckNEON)->Apply(BenchmarkKernelSize)->UseRealTime(); diff --git a/src/u32-filterbank-accumulate/aarch32-neon-x1.S b/src/u32-filterbank-accumulate/aarch32-neon-x1.S new file mode 100644 index 000000000..bd5ad932a --- /dev/null +++ b/src/u32-filterbank-accumulate/aarch32-neon-x1.S @@ -0,0 +1,58 @@ +// Copyright 2022 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include <xnnpack/assembly.h> + +.syntax unified + +// void xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1( +// size_t rows, r0 +// const uint32_t* input, r1 +// const uint8_t* weight_widths, r2 +// const uint16_t* weights, r3 +// uint64_t* output) sp -> r12 + +// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved. 
+ +// Register usage +// input r1 d2 +// weights r3 d3 d4 d5 +// output r12 d0 d1 + +// weight_widths r2 r4 + +BEGIN_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1 + .arm +#ifndef __APPLE__ + .arch armv7-a + .fpu neon +#endif + LDR r12, [sp] // output + PUSH {r4} // push 4 bytes + VMOV.U8 q0, #0 // weight_accumulator +0: + LDRB r4, [r2], #1 // weight_widths + +1: + VLD1.32 {d3[]}, [r3]! // weights + VLD1.32 {d2[]}, [r1]! // input + SUBS r4, r4, #1 // decrement remaining width + VMOVL.U16 q2, d3 // zero-extend u16 weights to u32 + VMLAL.U32 q0, d4, d2 // accumulate 64-bit products into q0 + BHI 1b // loop while width remains + + VST1.64 {d0}, [r12]! // write d0 to output + SUBS r0, r0, #1 // decrement rows + VMOV d0, d1 // NOTE(review): d1 is not cleared after this copy; confirm intended + BNE 0b + + POP {r4} + BX lr + +END_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1 + +#ifdef __ELF__ +.section ".note.GNU-stack","",%progbits +#endif diff --git a/src/u32-filterbank-accumulate/aarch32-neon-x2.S b/src/u32-filterbank-accumulate/aarch32-neon-x2.S new file mode 100644 index 000000000..5c8cc3a00 --- /dev/null +++ b/src/u32-filterbank-accumulate/aarch32-neon-x2.S @@ -0,0 +1,69 @@ +// Copyright 2022 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include <xnnpack/assembly.h> + +.syntax unified + +// void xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2( +// size_t rows, r0 +// const uint32_t* input, r1 +// const uint8_t* weight_widths, r2 +// const uint16_t* weights, r3 +// uint64_t* output) sp -> r12 + +// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved. + +// Register usage +// input r1 d2 +// weights r3 d3 d4 d5 +// output r12 d0 d1 + +// weight_widths r2 r4 + +BEGIN_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2 + .arm +#ifndef __APPLE__ + .arch armv7-a + .fpu neon +#endif + LDR r12, [sp] // output + PUSH {r4} // push 4 bytes + VMOV.U8 q0, #0 // weight_accumulator +0: + LDRB r4, [r2], #1 // weight_widths + SUBS r4, r4, #1 + BLS 2f // less than 2 weights? + +1: + VLD1.16 {d3}, [r3]! // weights + VLD1.32 {d2}, [r1]! 
// input + SUBS r4, r4, #2 // two inputs consumed per iteration + VMOVL.U16 q2, d3 // zero-extend u16 weights to u32 + VMLAL.U32 q0, d4, d2[0] // q0 += d4 * first input + VMLAL.U32 q0, d5, d2[1] // q0 += d5 * second input + BHI 1b + + BLO 3f // is there a remainder? +2: + VLD1.32 {d3[]}, [r3]! // weights + VLD1.32 {d2[]}, [r1]! // input + VMOVL.U16 q2, d3 // zero-extend u16 weights to u32 + VMLAL.U32 q0, d4, d2 // accumulate 64-bit products into q0 + +3: + VST1.64 {d0}, [r12]! // write d0 to output + SUBS r0, r0, #1 // decrement rows + VMOV d0, d1 // NOTE(review): d1 is not cleared after this copy; confirm intended + BNE 0b + + POP {r4} + BX lr + +END_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2 + +#ifdef __ELF__ +.section ".note.GNU-stack","",%progbits +#endif diff --git a/src/xnnpack/filterbank.h b/src/xnnpack/filterbank.h index 90ce94051..f6d7a59ef 100644 --- a/src/xnnpack/filterbank.h +++ b/src/xnnpack/filterbank.h @@ -23,6 +23,8 @@ extern "C" { const uint16_t* weights, \ uint64_t* output); +DECLARE_U32_FILTERBANK_ACCUMULATE_UKERNEL_FUNCTION(xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1) +DECLARE_U32_FILTERBANK_ACCUMULATE_UKERNEL_FUNCTION(xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2) DECLARE_U32_FILTERBANK_ACCUMULATE_UKERNEL_FUNCTION(xnn_u32_filterbank_accumulate_ukernel__neon_x1) DECLARE_U32_FILTERBANK_ACCUMULATE_UKERNEL_FUNCTION(xnn_u32_filterbank_accumulate_ukernel__neon_x2) DECLARE_U32_FILTERBANK_ACCUMULATE_UKERNEL_FUNCTION(xnn_u32_filterbank_accumulate_ukernel__scalar_x1) diff --git a/test/u32-filterbank-accumulate.cc b/test/u32-filterbank-accumulate.cc index a1c0fd5d8..4929af500 100644 --- a/test/u32-filterbank-accumulate.cc +++ b/test/u32-filterbank-accumulate.cc @@ -17,6 +17,40 @@ #include "filterbank-accumulate-microkernel-tester.h" +#if XNN_ARCH_ARM + TEST(U32_FILTERBANK_ACCUMULATE__AARCH32_NEON_X1, rows_eq_1) { + TEST_REQUIRES_ARM_NEON; + FilterbankAccumulateMicrokernelTester() + .rows(1) + .Test(xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1); + } + + TEST(U32_FILTERBANK_ACCUMULATE__AARCH32_NEON_X1, rows_eq_2) { + TEST_REQUIRES_ARM_NEON; + FilterbankAccumulateMicrokernelTester() + .rows(2) + .Test(xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1); + } +#endif // XNN_ARCH_ARM + + +#if XNN_ARCH_ARM 
+ TEST(U32_FILTERBANK_ACCUMULATE__AARCH32_NEON_X2, rows_eq_1) { + TEST_REQUIRES_ARM_NEON; + FilterbankAccumulateMicrokernelTester() + .rows(1) + .Test(xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2); + } + + TEST(U32_FILTERBANK_ACCUMULATE__AARCH32_NEON_X2, rows_eq_2) { + TEST_REQUIRES_ARM_NEON; + FilterbankAccumulateMicrokernelTester() + .rows(2) + .Test(xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2); + } +#endif // XNN_ARCH_ARM + + #if XNN_ARCH_ARM || XNN_ARCH_ARM64 TEST(U32_FILTERBANK_ACCUMULATE__NEON_X1, rows_eq_1) { TEST_REQUIRES_ARM_NEON; @@ -25,13 +59,11 @@ .Test(xnn_u32_filterbank_accumulate_ukernel__neon_x1); } - TEST(U32_FILTERBANK_ACCUMULATE__NEON_X1, rows_gt_1) { + TEST(U32_FILTERBANK_ACCUMULATE__NEON_X1, rows_eq_2) { TEST_REQUIRES_ARM_NEON; - for (size_t rows = 2; rows <= 10; rows++) { - FilterbankAccumulateMicrokernelTester() - .rows(2) - .Test(xnn_u32_filterbank_accumulate_ukernel__neon_x1); - } + FilterbankAccumulateMicrokernelTester() + .rows(2) + .Test(xnn_u32_filterbank_accumulate_ukernel__neon_x1); } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -44,13 +76,11 @@ .Test(xnn_u32_filterbank_accumulate_ukernel__neon_x2); } - TEST(U32_FILTERBANK_ACCUMULATE__NEON_X2, rows_gt_1) { + TEST(U32_FILTERBANK_ACCUMULATE__NEON_X2, rows_eq_2) { TEST_REQUIRES_ARM_NEON; - for (size_t rows = 2; rows <= 10; rows++) { - FilterbankAccumulateMicrokernelTester() - .rows(2) - .Test(xnn_u32_filterbank_accumulate_ukernel__neon_x2); - } + FilterbankAccumulateMicrokernelTester() + .rows(2) + .Test(xnn_u32_filterbank_accumulate_ukernel__neon_x2); } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -61,10 +91,8 @@ TEST(U32_FILTERBANK_ACCUMULATE__SCALAR_X1, rows_eq_1) { .Test(xnn_u32_filterbank_accumulate_ukernel__scalar_x1); } -TEST(U32_FILTERBANK_ACCUMULATE__SCALAR_X1, rows_gt_1) { - for (size_t rows = 2; rows <= 10; rows++) { - FilterbankAccumulateMicrokernelTester() - .rows(2) - .Test(xnn_u32_filterbank_accumulate_ukernel__scalar_x1); - } 
+TEST(U32_FILTERBANK_ACCUMULATE__SCALAR_X1, rows_eq_2) { + FilterbankAccumulateMicrokernelTester() + .rows(2) + .Test(xnn_u32_filterbank_accumulate_ukernel__scalar_x1); } diff --git a/test/u32-filterbank-accumulate.yaml b/test/u32-filterbank-accumulate.yaml index c5cd045be..54798504c 100644 --- a/test/u32-filterbank-accumulate.yaml +++ b/test/u32-filterbank-accumulate.yaml @@ -4,6 +4,10 @@ # LICENSE file in the root directory of this source tree. +# AArch32 assembly +- name: xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x1 +- name: xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2 + # ARM NEON - name: xnn_u32_filterbank_accumulate_ukernel__neon_x1 - name: xnn_u32_filterbank_accumulate_ukernel__neon_x2 diff --git a/tools/generate-filterbank-accumulate-test.py b/tools/generate-filterbank-accumulate-test.py index 2a06a9396..2139a897e 100755 --- a/tools/generate-filterbank-accumulate-test.py +++ b/tools/generate-filterbank-accumulate-test.py @@ -27,10 +27,12 @@ parser.set_defaults(defines=list()) def split_ukernel_name(name): - match = re.fullmatch(r"xnn_u32_filterbank_accumulate_ukernel__(.+)_x(\d+)", name) + match = re.fullmatch(r"xnn_u32_filterbank_accumulate_ukernel__(.+?)(_x(\d+))?", name) assert match is not None row_tile = 1 - batch_tile = int(match.group(2)) + batch_tile = 1 + if match.group(3): + batch_tile = int(match.group(3)) arch, isa = xnncommon.parse_target_name(target_name=match.group(1)) return row_tile, batch_tile, arch, isa @@ -45,14 +47,12 @@ TEST(${TEST_NAME}, rows_eq_1) { .Test(${", ".join(TEST_ARGS)}); } -TEST(${TEST_NAME}, rows_gt_1) { +TEST(${TEST_NAME}, rows_eq_2) { $if ISA_CHECK: ${ISA_CHECK}; - for (size_t rows = 2; rows <= 10; rows++) { - FilterbankAccumulateMicrokernelTester() - .rows(2) - .Test(${", ".join(TEST_ARGS)}); - } + FilterbankAccumulateMicrokernelTester() + .rows(2) + .Test(${", ".join(TEST_ARGS)}); } -- cgit v1.2.3