aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Frank Barchard <fbarchard@google.com> 2022-02-03 02:21:50 -0800
committer: XNNPACK Team <xnnpack-github-robot@google.com> 2022-02-03 02:22:48 -0800
commit: 1d5c616f65d1206bd9823fea53daa174650577da (patch)
tree: 3a20d406078627b140a3a29f8107b5052a545f14
parent: 94a0b0bc75b5d2c11c2aa7368d3428ca546dee73 (diff)
download: XNNPACK-1d5c616f65d1206bd9823fea53daa174650577da.tar.gz
Enable QU8 AArch32 microkernels based on uarch
- Based on the initialization used for QS8, select the same microkernels for QU8 when available.
- Dot product now uses the 4x8 intrinsics microkernels (was 2x16).
PiperOrigin-RevId: 426091572
-rw-r--r-- BUILD.bazel 4
-rwxr-xr-x CMakeLists.txt 48
-rw-r--r-- src/init.c 149
3 files changed, 157 insertions, 44 deletions
diff --git a/BUILD.bazel b/BUILD.bazel
index 4f2b76c81..67249df21 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -4549,11 +4549,11 @@ PROD_NEONDOT_MICROKERNEL_SRCS = [
"src/qs8-igemm/gen/4x16c4-minmax-rndnu-neondot.c",
"src/qu8-gemm/gen/1x8c4-minmax-rndnu-neondot.c",
"src/qu8-gemm/gen/1x16c4-minmax-rndnu-neondot.c",
- "src/qu8-gemm/gen/2x16c4-minmax-rndnu-neondot.c",
+ "src/qu8-gemm/gen/4x8c4-minmax-rndnu-neondot.c",
"src/qu8-gemm/gen/4x16c4-minmax-rndnu-neondot.c",
"src/qu8-igemm/gen/1x8c4-minmax-rndnu-neondot.c",
"src/qu8-igemm/gen/1x16c4-minmax-rndnu-neondot.c",
- "src/qu8-igemm/gen/2x16c4-minmax-rndnu-neondot.c",
+ "src/qu8-igemm/gen/4x8c4-minmax-rndnu-neondot.c",
"src/qu8-igemm/gen/4x16c4-minmax-rndnu-neondot.c",
]
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 37d088cc1..1d4fb8925 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3282,30 +3282,30 @@ SET(ALL_AARCH64_NEONFP16ARITH_MICROKERNEL_SRCS
src/f16-vmulcaddc/gen/c16-minmax-neonfp16arith-2x.c)
SET(PROD_NEONDOT_MICROKERNEL_SRCS
- src/qc8-gemm/gen/1x8c4-minmax-fp32-neondot.c
- src/qc8-gemm/gen/1x16c4-minmax-fp32-neondot.c
- src/qc8-gemm/gen/4x8c4-minmax-fp32-neondot.c
- src/qc8-gemm/gen/4x16c4-minmax-fp32-neondot.c
- src/qc8-igemm/gen/1x8c4-minmax-fp32-neondot.c
- src/qc8-igemm/gen/1x16c4-minmax-fp32-neondot.c
- src/qc8-igemm/gen/4x8c4-minmax-fp32-neondot.c
- src/qc8-igemm/gen/4x16c4-minmax-fp32-neondot.c
- src/qs8-gemm/gen/1x8c4-minmax-rndnu-neondot.c
- src/qs8-gemm/gen/1x16c4-minmax-rndnu-neondot.c
- src/qs8-gemm/gen/4x8c4-minmax-rndnu-neondot.c
- src/qs8-gemm/gen/4x16c4-minmax-rndnu-neondot.c
- src/qs8-igemm/gen/1x8c4-minmax-rndnu-neondot.c
- src/qs8-igemm/gen/1x16c4-minmax-rndnu-neondot.c
- src/qs8-igemm/gen/4x8c4-minmax-rndnu-neondot.c
- src/qs8-igemm/gen/4x16c4-minmax-rndnu-neondot.c
- src/qu8-gemm/gen/1x8c4-minmax-rndnu-neondot.c
- src/qu8-gemm/gen/1x16c4-minmax-rndnu-neondot.c
- src/qu8-gemm/gen/2x16c4-minmax-rndnu-neondot.c
- src/qu8-gemm/gen/4x16c4-minmax-rndnu-neondot.c
- src/qu8-igemm/gen/1x8c4-minmax-rndnu-neondot.c
- src/qu8-igemm/gen/1x16c4-minmax-rndnu-neondot.c
- src/qu8-igemm/gen/2x16c4-minmax-rndnu-neondot.c
- src/qu8-igemm/gen/4x16c4-minmax-rndnu-neondot.c)
+ src/qc8-gemm/gen/1x8c4-minmax-fp32-neondot.c
+ src/qc8-gemm/gen/1x16c4-minmax-fp32-neondot.c
+ src/qc8-gemm/gen/4x8c4-minmax-fp32-neondot.c
+ src/qc8-gemm/gen/4x16c4-minmax-fp32-neondot.c
+ src/qc8-igemm/gen/1x8c4-minmax-fp32-neondot.c
+ src/qc8-igemm/gen/1x16c4-minmax-fp32-neondot.c
+ src/qc8-igemm/gen/4x8c4-minmax-fp32-neondot.c
+ src/qc8-igemm/gen/4x16c4-minmax-fp32-neondot.c
+ src/qs8-gemm/gen/1x8c4-minmax-rndnu-neondot.c
+ src/qs8-gemm/gen/1x16c4-minmax-rndnu-neondot.c
+ src/qs8-gemm/gen/4x8c4-minmax-rndnu-neondot.c
+ src/qs8-gemm/gen/4x16c4-minmax-rndnu-neondot.c
+ src/qs8-igemm/gen/1x8c4-minmax-rndnu-neondot.c
+ src/qs8-igemm/gen/1x16c4-minmax-rndnu-neondot.c
+ src/qs8-igemm/gen/4x8c4-minmax-rndnu-neondot.c
+ src/qs8-igemm/gen/4x16c4-minmax-rndnu-neondot.c
+ src/qu8-gemm/gen/1x8c4-minmax-rndnu-neondot.c
+ src/qu8-gemm/gen/1x16c4-minmax-rndnu-neondot.c
+ src/qu8-gemm/gen/4x8c4-minmax-rndnu-neondot.c
+ src/qu8-gemm/gen/4x16c4-minmax-rndnu-neondot.c
+ src/qu8-igemm/gen/1x8c4-minmax-rndnu-neondot.c
+ src/qu8-igemm/gen/1x16c4-minmax-rndnu-neondot.c
+ src/qu8-igemm/gen/4x8c4-minmax-rndnu-neondot.c
+ src/qu8-igemm/gen/4x16c4-minmax-rndnu-neondot.c)
SET(ALL_NEONDOT_MICROKERNEL_SRCS
src/qc8-gemm/gen/1x8c4-minmax-fp32-neondot.c
diff --git a/src/init.c b/src/init.c
index 4f5fbd7e7..d80d360ea 100644
--- a/src/init.c
+++ b/src/init.c
@@ -495,24 +495,137 @@ static void init(void) {
#ifndef XNN_NO_QU8_OPERATORS
init_flags |= XNN_INIT_FLAG_QU8;
- if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
- xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot);
- xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_2x16c4__neondot);
- xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
- xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
- xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
- xnn_params.qu8.gemm.mr = 2;
- xnn_params.qu8.gemm.nr = 16;
- xnn_params.qu8.gemm.log2_kr = 2;
- } else {
- xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53);
- xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
- xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
- xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
- xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
- xnn_params.qu8.gemm.mr = 4;
- xnn_params.qu8.gemm.nr = 8;
- }
+ #if XNN_ENABLE_ASSEMBLY
+ if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
+ xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot);
+ xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__neondot);
+ xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
+ xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
+ xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
+ xnn_params.qu8.gemm.mr = 4;
+ xnn_params.qu8.gemm.nr = 8;
+ xnn_params.qu8.gemm.log2_kr = 2;
+ } else {
+ switch (cpuinfo_get_uarch(0)->uarch) {
+ case cpuinfo_uarch_cortex_a7:
+ xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
+ xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
+ xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
+ xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
+ xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
+ xnn_params.qu8.gemm.mr = 4;
+ xnn_params.qu8.gemm.nr = 8;
+ break;
+ case cpuinfo_uarch_cortex_a35:
+ xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7);
+ xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
+ xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
+ xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
+ xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
+ xnn_params.qu8.gemm.mr = 4;
+ xnn_params.qu8.gemm.nr = 8;
+ break;
+ case cpuinfo_uarch_cortex_a53:
+ xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53);
+ xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
+ xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
+ xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
+ xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
+ xnn_params.qu8.gemm.mr = 4;
+ xnn_params.qu8.gemm.nr = 8;
+ break;
+ case cpuinfo_uarch_cortex_a55r0:
+ case cpuinfo_uarch_kryo:
+ xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53);
+ xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
+ xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
+ xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
+ xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
+ xnn_params.qu8.gemm.mr = 4;
+ xnn_params.qu8.gemm.nr = 8;
+ break;
+ case cpuinfo_uarch_cortex_a72:
+ case cpuinfo_uarch_exynos_m1:
+ case cpuinfo_uarch_exynos_m2:
+ case cpuinfo_uarch_exynos_m3:
+ xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
+ xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
+ xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
+ xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
+ xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
+ xnn_params.qu8.gemm.mr = 4;
+ xnn_params.qu8.gemm.nr = 8;
+ break;
+ default:
+ xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
+ xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
+ xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
+ xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
+ xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
+ xnn_params.qu8.gemm.mr = 4;
+ xnn_params.qu8.gemm.nr = 8;
+ break;
+ }
+ }
+ #if XNN_MAX_UARCH_TYPES > 1
+ {
+ /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
+ const uint32_t mr = xnn_params.qu8.gemm.mr;
+ const uint32_t nr = xnn_params.qu8.gemm.nr;
+ const uint32_t log2_kr = xnn_params.qu8.gemm.log2_kr;
+ for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
+ const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
+ if (uarch_info == NULL) {
+ /* No more microarchitectures in the system */
+ break;
+ }
+
+ switch (uarch_info->uarch) {
+ case cpuinfo_uarch_cortex_a53:
+ if (mr == 4 && nr == 8 && log2_kr == 0) {
+ xnn_params.qu8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53;
+ xnn_params.qu8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64;
+ xnn_params.qu8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
+ xnn_params.qu8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
+ }
+ break;
+ case cpuinfo_uarch_cortex_a55r0:
+ if (mr == 4 && nr == 8 && log2_kr == 0) {
+ xnn_params.qu8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53;
+ xnn_params.qu8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64;
+ xnn_params.qu8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
+ xnn_params.qu8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ #endif // XNN_MAX_UARCH_TYPES > 1
+ #else // XNN_ENABLE_ASSEMBLY
+ if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
+ xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot);
+ xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__neondot);
+ xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
+ xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
+ xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
+ xnn_params.qu8.gemm.mr = 4;
+ xnn_params.qu8.gemm.nr = 8;
+ xnn_params.qu8.gemm.log2_kr = 2;
+ } else {
+ xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
+ xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
+ xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
+ xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
+ xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
+ xnn_params.qu8.gemm.mr = 2;
+ xnn_params.qu8.gemm.nr = 8;
+ xnn_params.qu8.gemm.log2_kr = 1;
+ xnn_params.qu8.gemm.log2_sr = 2;
+ }
+ #endif // XNN_ENABLE_ASSEMBLY
+
xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8;
xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
xnn_params.qu8.dwconv[0].channel_tile = 16;