Refactor declarations of microkernel parameters

- Extract declarations of microkernel parameters into microparams.h
- Group and document microkernel parameters
- Rename params-init.h to microparams-init.h accordingly
- Make microkernels depend only on microparams.h and not params.h

PiperOrigin-RevId: 463747649
author: Marat Dukhan <maratek@google.com> 2022-07-27 21:14:38 -0700
committer: XNNPACK Team <xnnpack-github-robot@google.com> 2022-07-27 21:15:38 -0700
commit: c836505ed4498a2ebd1c21050c383a0a60a8defc (patch)
tree: b26a80a0c5d6581794cc953414d7e05df7653ac9 /src/xnnpack
parent: 917e63588c2664a12417beb01e59f9e4a10251bc (diff)
download: XNNPACK-c836505ed4498a2ebd1c21050c383a0a60a8defc.tar.gz
40 files changed, 2506 insertions, 2455 deletions
diff --git a/src/xnnpack/argmaxpool.h b/src/xnnpack/argmaxpool.h
index 3366f0c1a..c900ce1aa 100644
--- a/src/xnnpack/argmaxpool.h
+++ b/src/xnnpack/argmaxpool.h
@@ -8,7 +8,6 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
 
 #ifdef __cplusplus
diff --git a/src/xnnpack/avgpool.h b/src/xnnpack/avgpool.h
index d766ac7b4..366986b79 100644
--- a/src/xnnpack/avgpool.h
+++ b/src/xnnpack/avgpool.h
@@ -11,8 +11,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
+#include <xnnpack/microparams.h>
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/xnnpack/conv.h b/src/xnnpack/conv.h
index 0b02beeaf..02a713417 100644
--- a/src/xnnpack/conv.h
+++ b/src/xnnpack/conv.h
@@ -11,8 +11,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
+#include <xnnpack/microparams.h>
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/xnnpack/depthtospace.h b/src/xnnpack/depthtospace.h
index 285fd3cbe..358b9df4c 100644
--- a/src/xnnpack/depthtospace.h
+++ b/src/xnnpack/depthtospace.h
@@ -8,7 +8,6 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
 
 #ifdef __cplusplus
diff --git a/src/xnnpack/dwconv.h b/src/xnnpack/dwconv.h
index 4b1464e1e..83ef6e13d 100644
--- a/src/xnnpack/dwconv.h
+++ b/src/xnnpack/dwconv.h
@@ -11,8 +11,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
+#include <xnnpack/microparams.h>
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/xnnpack/fill.h b/src/xnnpack/fill.h
index 0b62fea94..97cfd5007 100644
--- a/src/xnnpack/fill.h
+++ b/src/xnnpack/fill.h
@@ -8,7 +8,6 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
 
 #ifdef __cplusplus
diff --git a/src/xnnpack/gavgpool.h b/src/xnnpack/gavgpool.h
index fc5b153a4..bec595df8 100644
--- a/src/xnnpack/gavgpool.h
+++ b/src/xnnpack/gavgpool.h
@@ -11,8 +11,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
+#include <xnnpack/microparams.h>
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index 519f51358..543ebfb10 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -11,8 +11,10 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#include <xnnpack.h>  // For xnn_status
+
 #include <xnnpack/common.h>
-#include <xnnpack/params.h>
+#include <xnnpack/microparams.h>
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/xnnpack/ibilinear.h b/src/xnnpack/ibilinear.h
index 12ecb605c..2744e404b 100644
--- a/src/xnnpack/ibilinear.h
+++ b/src/xnnpack/ibilinear.h
@@ -8,7 +8,6 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
 
 #ifdef __cplusplus
diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h
index 2ddd739f1..a79c3936d 100644
--- a/src/xnnpack/igemm.h
+++ b/src/xnnpack/igemm.h
@@ -11,8 +11,10 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#include <xnnpack.h>  // For xnn_status
+
 #include <xnnpack/common.h>
-#include <xnnpack/params.h>
+#include <xnnpack/microparams.h>
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/xnnpack/lut.h b/src/xnnpack/lut.h
index f11954e01..57d36412b 100644
--- a/src/xnnpack/lut.h
+++ b/src/xnnpack/lut.h
@@ -11,7 +11,6 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
 
 #ifdef __cplusplus
diff --git a/src/xnnpack/maxpool.h b/src/xnnpack/maxpool.h
index 0310e77b7..a47c62531 100644
--- a/src/xnnpack/maxpool.h
+++ b/src/xnnpack/maxpool.h
@@ -11,8 +11,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
+#include <xnnpack/microparams.h>
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/xnnpack/params-init.h b/src/xnnpack/microparams-init.h
index 3e5aa121d..3e5aa121d 100644
--- a/src/xnnpack/params-init.h
+++ b/src/xnnpack/microparams-init.h
diff --git a/src/xnnpack/microparams.h b/src/xnnpack/microparams.h
new file mode 100644
index 000000000..9c6c3bb41
--- /dev/null
+++ b/src/xnnpack/microparams.h
@@ -0,0 +1,2481 @@
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/common.h>
+
+
+// Default: serves to differentiate pointer types for micro-kernels without fused activation.
+
+union xnn_f16_default_params {  // Carries no data: the only member is a placeholder.
+  char _; // Placeholder member so the union is not empty, as the C standard requires
+};
+
+union xnn_f32_default_params {  // Placeholder member plus one x86-only variant.
+  char _; // Placeholder member so the union is not empty on targets without the variant below
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    int32_t mask_table[14];  // NOTE(review): presumably a lane mask for partial vectors -- confirm in the init code
+  } avx;  // Compiled only for x86 / x86-64 targets.
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+};
+
+
+// ReLU: serves to differentiate pointer types for micro-kernels with fused ReLU activation.
+
+union xnn_f32_relu_params {  // Carries no data: the only member is a placeholder.
+  char _; // Placeholder member so the union is not empty, as the C standard requires
+};
+
+
+// Scale+Min+Max: used by AVGPOOL/GAVGPOOL microkernels.
+
+union xnn_f16_scaleminmax_params {  // scale/min/max in one layout per target family.
+  char _; // Placeholder member so the union is not empty when no variant below is compiled in
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  struct {
+    uint16_t scale;  // NOTE(review): 16-bit fields presumably hold raw half-precision bits -- confirm
+    uint16_t min;
+    uint16_t max;
+  } neon;  // Compiled only for ARM / ARM64 targets.
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(32) float scale[8];  // Stored as 8 single-precision values, unlike the 16-bit NEON fields
+    XNN_ALIGN(32) float min[8];
+    XNN_ALIGN(32) float max[8];
+  } avx;  // Compiled only for x86 / x86-64 targets.
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+};
+
+union xnn_f32_scaleminmax_params {  // scale/min/max; the scalar layout is always available.
+  struct {
+    float scale;
+    float min;
+    float max;
+  } scalar;  // Unconditional layout: one value per field.
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) float scale[4];  // 4 floats per field under XNN_ALIGN(16)
+    XNN_ALIGN(16) float min[4];
+    XNN_ALIGN(16) float max[4];
+  } sse;  // Compiled only for x86 / x86-64 targets.
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+};
+
+
+// Min+Max: used by VCLAMP and GEMM/IGEMM/DWCONV/MAXPOOL/etc with MINMAX activation.
+
+union xnn_f16_minmax_params {  // min/max bounds in one layout per target family.
+  char _; // Placeholder member so the union is not empty when no variant below is compiled in
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  struct {
+    uint16_t min;  // NOTE(review): 16-bit fields presumably hold raw half-precision bits -- confirm
+    uint16_t max;
+  } neon;  // Compiled only for ARM / ARM64 targets.
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(32) float min[8];  // Stored as 8 single-precision values, unlike the 16-bit NEON fields
+    XNN_ALIGN(32) float max[8];
+  } avx;  // Compiled only for x86 / x86-64 targets.
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+};
+
+union xnn_f32_minmax_params {  // min/max bounds; the scalar layout is always available.
+  struct {
+    float min;
+    float max;
+  } scalar;  // Unconditional layout: one value per bound.
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) float min[4];
+    XNN_ALIGN(16) float max[4];
+  } sse;  // x86 / x86-64 only: 4 floats per bound.
+  struct {
+    XNN_ALIGN(32) float min[8];
+    XNN_ALIGN(32) float max[8];
+    int32_t mask_table[14];  // NOTE(review): presumably a lane mask for partial vectors -- confirm in the init code
+  } avx;  // x86 / x86-64 only: 8 floats per bound plus a mask table.
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+  struct {
+    XNN_ALIGN(8) float min[2];
+    XNN_ALIGN(8) float max[2];
+  } wasmsimd;  // WAsm SIMD / Relaxed SIMD only: 2 floats per bound.
+#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+};
+
+union xnn_s8_minmax_params {  // min/max bounds; the scalar layout is always available.
+  struct {
+    int32_t min;  // Bounds are widened to 32 bits in the scalar layout
+    int32_t max;
+  } scalar;  // Unconditional layout.
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) uint8_t bias[16];  // NOTE(review): unsigned fields in a signed-8-bit union; presumably a bias maps values to unsigned range -- confirm
+    XNN_ALIGN(16) uint8_t min_with_bias[16];
+    XNN_ALIGN(16) uint8_t max_with_bias[16];
+  } sse2;  // x86 / x86-64 only: biased bounds stored as unsigned bytes.
+  struct {
+    XNN_ALIGN(16) int8_t min[16];
+    XNN_ALIGN(16) int8_t max[16];
+  } sse4;  // x86 / x86-64 only: signed bounds, 16 bytes each.
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  struct {
+    int8_t min;
+    int8_t max;
+  } neon;  // ARM / ARM64 only: one byte per bound.
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+  struct {
+    XNN_ALIGN(8) int8_t min[8];
+    XNN_ALIGN(8) int8_t max[8];
+  } wasmsimd;  // WAsm SIMD / Relaxed SIMD only: 8 bytes per bound.
+#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+};
+
+union xnn_u8_minmax_params {  // min/max bounds; the scalar layout is always available.
+  struct {
+    uint32_t min;  // Bounds are widened to 32 bits in the scalar layout
+    uint32_t max;
+  } scalar;  // Unconditional layout.
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) uint8_t min[16];
+    XNN_ALIGN(16) uint8_t max[16];
+  } sse2;  // x86 / x86-64 only: 16 bytes per bound.
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  struct {
+    uint8_t min;
+    uint8_t max;
+  } neon;  // ARM / ARM64 only: one byte per bound.
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+  struct {
+    XNN_ALIGN(8) uint8_t min[8];
+    XNN_ALIGN(8) uint8_t max[8];
+  } wasmsimd;  // WAsm SIMD / Relaxed SIMD only: 8 bytes per bound.
+#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+};
+
+
+// Conv Min+Max: used by quantized GEMM/IGEMM/DWCONV microkernels with MINMAX activation.
+
+union xnn_qs8_minmax_params {  // Same variant set as xnn_qs8_conv_minmax_params, but no variant carries a `scale` field.
+  struct {
+    float magic_bias;
+    int32_t magic_min;
+    int32_t magic_max;
+    int32_t magic_bias_less_zero_point;
+  } scalar_imagic;  // Unconditional layout: clamping bounds are integers.
+  struct {
+    float output_min_less_zero_point;
+    float output_max_less_zero_point;
+    float magic_bias;
+    int32_t magic_bias_less_output_zero_point;
+  } scalar_fmagic;  // Unconditional layout: clamping bounds are floats.
+  struct {
+    float output_min_less_zero_point;
+    float output_max_less_zero_point;
+    int32_t output_zero_point;
+  } scalar_lrintf;  // Unconditional layout: no magic-bias fields.
+#if XNN_ARCH_ARM
+  struct {
+    float magic_bias;
+    int32_t magic_bias_less_zero_point;
+    uint32_t output_min;
+    uint32_t output_max;
+  } armv6simd;  // 32-bit ARM only (guarded by XNN_ARCH_ARM alone).
+#endif  // XNN_ARCH_ARM
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  struct {
+    float magic_bias;
+    int32_t magic_bias_less_output_zero_point;
+    int8_t output_min;
+    int8_t output_max;
+  } neon;  // ARM / ARM64 only.
+  struct {
+    int16_t output_zero_point;
+    uint8_t output_min;  // NOTE(review): unsigned here, while the sibling `neon` variant uses int8_t -- confirm this is intended
+    uint8_t output_max;
+  } neonv8;  // ARM / ARM64 only.
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) float output_max_less_zero_point[4];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+    XNN_ALIGN(16) int16_t output_min[8];  // 16-bit lower bound; the other x86 variants store it as int8_t
+  } sse2;  // x86 / x86-64 only: 16-byte fields.
+  struct {
+    XNN_ALIGN(16) float output_max_less_zero_point[4];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+    XNN_ALIGN(16) int8_t output_min[16];
+  } sse4;  // x86 / x86-64 only: 16-byte fields.
+  struct {
+    XNN_ALIGN(32) float output_max_less_zero_point[8];
+    XNN_ALIGN(32) int16_t output_zero_point[16];
+    XNN_ALIGN(32) int8_t output_min[32];
+  } avx2;  // x86 / x86-64 only: 32-byte fields.
+  struct {
+    XNN_ALIGN(64) float output_max_less_zero_point[16];
+    XNN_ALIGN(64) int16_t output_zero_point[32];
+    XNN_ALIGN(64) int8_t output_min[64];
+  } avx512;  // x86 / x86-64 only: 64-byte fields.
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+  struct {
+    XNN_ALIGN(8) float magic_bias[2];
+    XNN_ALIGN(8) int32_t magic_min[2];
+    XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
+    XNN_ALIGN(8) int8_t output_max[8];
+  } wasmsimd;  // WAsm SIMD / Relaxed SIMD only: 8-byte fields.
+#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+};
+
+union xnn_qs8_conv_minmax_params {  // Variants are named <requantization scheme>_<target>: fp32_* or rndnu_*.
+  struct {
+    float scale;
+    float output_min_less_zero_point;
+    float output_max_less_zero_point;
+    float magic_bias;
+    int32_t magic_bias_less_output_zero_point;
+  } fp32_scalar_fmagic;  // Unconditional layout: clamping bounds are floats.
+  struct {
+    float scale;
+    float magic_bias;
+    int32_t magic_min;
+    int32_t magic_max;
+    int32_t magic_bias_less_zero_point;
+  } fp32_scalar_imagic;  // Unconditional layout: clamping bounds are integers.
+  struct {
+    float scale;
+    float output_min_less_zero_point;
+    float output_max_less_zero_point;
+    int32_t output_zero_point;
+  } fp32_scalar_lrintf;  // Unconditional layout: no magic-bias fields.
+#if XNN_ARCH_ARM
+  struct {
+    float scale;
+    float magic_bias;
+    int32_t magic_bias_less_zero_point;
+    uint32_t output_min;
+    uint32_t output_max;
+  } fp32_armv6simd;  // 32-bit ARM only (guarded by XNN_ARCH_ARM alone).
+#endif  // XNN_ARCH_ARM
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  struct {
+    float scale;
+    float magic_bias;
+    int32_t magic_bias_less_output_zero_point;
+    int8_t output_min;
+    int8_t output_max;
+  } fp32_neon;  // ARM / ARM64 only.
+  struct {
+    float scale;
+    int16_t output_zero_point;
+    int8_t output_min;
+    int8_t output_max;
+  } fp32_neonv8;  // ARM / ARM64 only: no magic-bias fields.
+  struct {
+    int32_t right_pre_shift;
+    int32_t multiplier;
+    int32_t right_post_shift;
+    int16_t output_zero_point;
+    int8_t output_min;
+    int8_t output_max;
+  } rndnu_neon;  // ARM / ARM64 only: integer multiplier and shifts, no float scale.
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) float scale[4];
+    XNN_ALIGN(16) float output_max_less_zero_point[4];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+    XNN_ALIGN(16) int16_t output_min[8];  // 16-bit lower bound; the other x86 variants store it as int8_t
+  } fp32_sse2;  // x86 / x86-64 only: 16-byte fields.
+  struct {
+    XNN_ALIGN(16) float scale[4];
+    XNN_ALIGN(16) float output_max_less_zero_point[4];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+    XNN_ALIGN(16) int8_t output_min[16];
+  } fp32_sse4;  // x86 / x86-64 only: 16-byte fields.
+  struct {
+    XNN_ALIGN(32) float scale[8];
+    XNN_ALIGN(32) float output_max_less_zero_point[8];
+    XNN_ALIGN(32) int16_t output_zero_point[16];
+    XNN_ALIGN(32) int8_t output_min[32];
+  } fp32_avx2;  // x86 / x86-64 only: 32-byte fields.
+  struct {
+    XNN_ALIGN(64) float scale[16];
+    XNN_ALIGN(64) float output_max_less_zero_point[16];
+    XNN_ALIGN(64) int16_t output_zero_point[32];
+    XNN_ALIGN(64) int8_t output_min[64];
+  } fp32_avx512;  // x86 / x86-64 only: 64-byte fields.
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+  struct {
+    XNN_ALIGN(8) float scale[2];
+    XNN_ALIGN(8) float magic_bias[2];
+    XNN_ALIGN(8) int32_t magic_min[2];
+    XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
+    XNN_ALIGN(8) int8_t output_max[8];
+  } fp32_wasmsimd;  // WAsm SIMD / Relaxed SIMD only: 8-byte fields.
+#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+};
+
+union xnn_qu8_conv_minmax_params {  // Unsigned counterpart of the QS8 conv union; every variant adds a kernel zero point.
+  struct {
+    int32_t kernel_zero_point;
+    float scale;
+    float output_min_less_zero_point;
+    float output_max_less_zero_point;
+    float magic_bias;
+    int32_t magic_bias_less_output_zero_point;
+  } fp32_scalar_fmagic;  // Unconditional layout: clamping bounds are floats.
+  struct {
+    int32_t kernel_zero_point;
+    float scale;
+    float magic_bias;
+    int32_t magic_min;
+    int32_t magic_max;
+    int32_t magic_bias_less_zero_point;
+  } fp32_scalar_imagic;  // Unconditional layout: clamping bounds are integers.
+  struct {
+    int32_t kernel_zero_point;
+    float scale;
+    float output_min_less_zero_point;
+    float output_max_less_zero_point;
+    int32_t output_zero_point;
+  } fp32_scalar_lrintf;  // Unconditional layout: no magic-bias fields.
+#if XNN_ARCH_ARM
+  struct {
+    float scale;
+    float magic_bias;
+    uint32_t minus_kernel_zero_point;  // Only this variant stores the kernel zero point under a `minus_` name
+    int32_t magic_bias_less_zero_point;
+    uint32_t output_min;
+    uint32_t output_max;
+  } fp32_armv6simd;  // 32-bit ARM only (guarded by XNN_ARCH_ARM alone).
+#endif  // XNN_ARCH_ARM
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  struct {
+    uint8_t kernel_zero_point[4];
+    float scale;
+    float magic_bias;
+    int32_t magic_bias_less_output_zero_point;
+    uint8_t output_min;
+    uint8_t output_max;
+  } fp32_neon;  // ARM / ARM64 only.
+  struct {
+    uint8_t kernel_zero_point[4];
+    float scale;
+    int16_t output_zero_point;
+    uint8_t output_min;
+    uint8_t output_max;
+  } fp32_neonv8;  // ARM / ARM64 only: no magic-bias fields.
+  struct {
+    uint8_t kernel_zero_point[4];
+    int32_t right_pre_shift;
+    int32_t multiplier;
+    int32_t right_post_shift;
+    int16_t output_zero_point;
+    uint8_t output_min;
+    uint8_t output_max;
+  } rndnu_neon;  // ARM / ARM64 only: integer multiplier and shifts, no float scale.
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) int16_t kernel_zero_point[8];
+    XNN_ALIGN(16) float scale[4];
+    XNN_ALIGN(16) float output_max_less_zero_point[4];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+    XNN_ALIGN(16) uint8_t output_min[16];
+  } fp32_sse2;  // x86 / x86-64 only: 16-byte fields.
+  struct {
+    XNN_ALIGN(32) int16_t kernel_zero_point[16];
+    XNN_ALIGN(32) float scale[8];
+    XNN_ALIGN(32) float output_max_less_zero_point[8];
+    XNN_ALIGN(32) int16_t output_zero_point[16];
+    XNN_ALIGN(32) uint8_t output_min[32];
+  } fp32_avx2;  // x86 / x86-64 only: 32-byte fields.
+  struct {
+    XNN_ALIGN(64) int16_t kernel_zero_point[32];
+    XNN_ALIGN(64) float scale[16];
+    XNN_ALIGN(64) float output_max_less_zero_point[16];
+    XNN_ALIGN(64) int16_t output_zero_point[32];
+    XNN_ALIGN(64) uint8_t output_min[64];
+  } fp32_avx512;  // x86 / x86-64 only: 64-byte fields.
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+  struct {
+    XNN_ALIGN(8) int16_t kernel_zero_point[4];
+    XNN_ALIGN(8) float scale[2];
+    XNN_ALIGN(8) float magic_bias[2];
+    XNN_ALIGN(8) int32_t magic_min[2];
+    XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
+    XNN_ALIGN(8) int8_t output_max[8];  // NOTE(review): signed bytes in an unsigned-quantized union; other QU8 variants use uint8_t -- confirm
+  } fp32_wasmsimd;  // WAsm SIMD / Relaxed SIMD only: 8-byte fields.
+#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+};
+
+
+// Add/Sub Min+Max: used by quantized VADD[C] microkernels with MINMAX activation.
+
+union xnn_qs8_addsub_minmax_params {  // Per-operand multipliers (a/b), a shift, and output clamping bounds.
+  struct {
+    int32_t bias;
+    int32_t a_multiplier;
+    int32_t b_multiplier;
+    uint32_t shift;
+    int32_t output_min_less_zero_point;
+    int32_t output_max_less_zero_point;
+    int32_t output_zero_point;
+  } scalar;  // Unconditional layout: one value per field.
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  struct {
+    int8_t a_zero_point;  // This variant stores per-operand zero points and has no `bias` field
+    int8_t b_zero_point;
+    int16_t output_zero_point;
+    int32_t a_multiplier;
+    int32_t b_multiplier;
+    int32_t right_shift;
+    int8_t output_min;
+    int8_t output_max;
+  } neon;  // ARM / ARM64 only.
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) int32_t bias[4];
+    XNN_ALIGN(16) uint16_t a_multiplier_lo[8];  // Multipliers are split into 16-bit lo/hi halves in this variant
+    XNN_ALIGN(16) uint16_t a_multiplier_hi[8];
+    XNN_ALIGN(16) uint16_t b_multiplier_lo[8];
+    XNN_ALIGN(16) uint16_t b_multiplier_hi[8];
+    uint32_t shift;
+    uint32_t b_multiplier;  // Unsplit copy of the b multiplier kept alongside the lo/hi halves
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+    XNN_ALIGN(16) int16_t output_min[8];
+    XNN_ALIGN(16) int16_t output_max[8];
+  } sse2;  // x86 / x86-64 only: output bounds are 16-bit.
+  struct {
+    XNN_ALIGN(16) int32_t bias[4];
+    XNN_ALIGN(16) uint16_t a_multiplier_lo[8];
+    XNN_ALIGN(16) uint16_t a_multiplier_hi[8];
+    XNN_ALIGN(16) uint16_t b_multiplier_lo[8];
+    XNN_ALIGN(16) uint16_t b_multiplier_hi[8];
+    uint32_t shift;
+    uint32_t b_multiplier;
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+    XNN_ALIGN(16) int8_t output_min[16];
+    XNN_ALIGN(16) int8_t output_max[16];
+  } sse4_mul16;  // x86 / x86-64 only: same as sse2 except the output bounds are 8-bit.
+  struct {
+    XNN_ALIGN(16) int32_t bias[4];
+    XNN_ALIGN(16) int32_t a_multiplier[4];
+    XNN_ALIGN(16) int32_t b_multiplier[4];
+    XNN_ALIGN(16) uint64_t shift[2];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+    XNN_ALIGN(16) int8_t output_min[16];
+    XNN_ALIGN(16) int8_t output_max[16];
+  } sse4_mul32;  // x86 / x86-64 only: full 32-bit multipliers, no lo/hi split.
+  struct {
+    XNN_ALIGN(32) int32_t bias[8];
+    XNN_ALIGN(32) int32_t a_multiplier[8];
+    XNN_ALIGN(32) int32_t b_multiplier[8];
+    XNN_ALIGN(32) uint64_t shift[4];
+    XNN_ALIGN(32) int16_t output_zero_point[16];
+    XNN_ALIGN(16) int8_t output_min[16];  // Output bounds are 16 bytes here; the other fields are 32 bytes
+    XNN_ALIGN(16) int8_t output_max[16];
+  } avx2;  // x86 / x86-64 only.
+  struct {
+    XNN_ALIGN(64) int32_t bias[16];
+    XNN_ALIGN(64) int32_t a_multiplier[16];
+    XNN_ALIGN(64) int32_t b_multiplier[16];
+    XNN_ALIGN(64) uint64_t shift[8];
+    XNN_ALIGN(64) int16_t output_zero_point[32];
+    XNN_ALIGN(32) int8_t output_min[32];  // Output bounds are 32 bytes here; the other fields are 64 bytes
+    XNN_ALIGN(32) int8_t output_max[32];
+  } avx512;  // x86 / x86-64 only.
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+  struct {
+    XNN_ALIGN(8) int32_t bias[2];
+    XNN_ALIGN(8) int32_t a_multiplier[2];
+    XNN_ALIGN(8) int32_t b_multiplier[2];
+    uint32_t shift;
+    XNN_ALIGN(8) int16_t output_zero_point[4];
+    XNN_ALIGN(8) int8_t output_min[8];
+    XNN_ALIGN(8) int8_t output_max[8];
+  } wasmsimd;  // WAsm SIMD / Relaxed SIMD only: 8-byte fields plus a single shift value.
+#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+};
+
+union xnn_qu8_addsub_minmax_params {  // Unsigned counterpart of xnn_qs8_addsub_minmax_params.
+  struct {
+    int32_t bias;
+    int32_t a_multiplier;
+    int32_t b_multiplier;
+    int32_t rounding;  // Present here but not in the QS8 scalar layout
+    uint32_t shift;
+    int32_t output_min_less_zero_point;
+    int32_t output_max_less_zero_point;
+    int32_t output_zero_point;
+  } scalar;  // Unconditional layout: one value per field.
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  struct {
+    uint8_t a_zero_point;  // This variant stores per-operand zero points and has no `bias` field
+    uint8_t b_zero_point;
+    int16_t output_zero_point;
+    int32_t a_multiplier;
+    int32_t b_multiplier;
+    int32_t right_shift;
+    uint8_t output_min;
+    uint8_t output_max;
+  } neon;  // ARM / ARM64 only.
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) int32_t bias[4];
+    XNN_ALIGN(16) uint16_t a_multiplier_lo[8];  // Multipliers are split into 16-bit lo/hi halves in this variant
+    XNN_ALIGN(16) uint16_t a_multiplier_hi[8];
+    XNN_ALIGN(16) uint16_t b_multiplier_lo[8];
+    XNN_ALIGN(16) uint16_t b_multiplier_hi[8];
+    uint32_t shift;
+    uint32_t b_multiplier;  // Unsplit copy of the b multiplier kept alongside the lo/hi halves
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+    XNN_ALIGN(16) uint8_t output_min[16];
+    XNN_ALIGN(16) uint8_t output_max[16];
+  } sse2;  // x86 / x86-64 only.
+  struct {
+    XNN_ALIGN(16) int32_t bias[4];
+    XNN_ALIGN(16) int32_t a_multiplier[4];
+    XNN_ALIGN(16) int32_t b_multiplier[4];
+    XNN_ALIGN(16) uint64_t shift[2];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+    XNN_ALIGN(16) uint8_t output_min[16];
+    XNN_ALIGN(16) uint8_t output_max[16];
+  } sse4;  // x86 / x86-64 only: full 32-bit multipliers, no lo/hi split.
+  struct {
+    XNN_ALIGN(32) int32_t bias[8];
+    XNN_ALIGN(32) int32_t a_multiplier[8];
+    XNN_ALIGN(32) int32_t b_multiplier[8];
+    XNN_ALIGN(32) uint64_t shift[4];
+    XNN_ALIGN(32) int16_t output_zero_point[16];
+    XNN_ALIGN(16) uint8_t output_min[16];  // Output bounds are 16 bytes here; the other fields are 32 bytes
+    XNN_ALIGN(16) uint8_t output_max[16];
+  } avx2;  // x86 / x86-64 only.
+  struct {
+    XNN_ALIGN(64) int32_t bias[16];
+    XNN_ALIGN(64) int32_t a_multiplier[16];
+    XNN_ALIGN(64) int32_t b_multiplier[16];
+    XNN_ALIGN(64) uint64_t shift[8];
+    XNN_ALIGN(64) int16_t output_zero_point[32];
+    XNN_ALIGN(32) uint8_t output_min[32];  // Output bounds are 32 bytes here; the other fields are 64 bytes
+    XNN_ALIGN(32) uint8_t output_max[32];
+  } avx512;  // x86 / x86-64 only.
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+  struct {
+    XNN_ALIGN(8) int32_t bias[2];
+    XNN_ALIGN(8) int32_t a_multiplier[2];
+    XNN_ALIGN(8) int32_t b_multiplier[2];
+    uint32_t shift;
+    XNN_ALIGN(8) int16_t output_zero_point[4];
+    XNN_ALIGN(8) uint8_t output_min[8];
+    XNN_ALIGN(8) uint8_t output_max[8];
+  } wasmsimd;  // WAsm SIMD / Relaxed SIMD only: 8-byte fields plus a single shift value.
+#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+};
+
+
+// Mul Min+Max: used by quantized VMUL[C] microkernels with MINMAX activation.
+
+union xnn_qs8_mul_minmax_params {  // Per-operand zero points (a/b), requantization fields, and output bounds.
+  struct {
+    int32_t a_zero_point;
+    int32_t b_zero_point;
+    float scale;
+    float output_min_less_zero_point;
+    float output_max_less_zero_point;
+    float magic_bias;
+    int32_t magic_bias_less_output_zero_point;
+  } fp32_scalar;  // Unconditional layout: one value per field.
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  struct {
+    int8_t a_zero_point[2];
+    int8_t b_zero_point[2];
+    float scale;
+    float magic_bias;
+    int32_t magic_bias_less_output_zero_point;
+    int8_t output_min;
+    int8_t output_max;
+  } fp32_neon;  // ARM / ARM64 only.
+  struct {
+    int8_t a_zero_point[2];
+    int8_t b_zero_point[2];
+    float scale;
+    int16_t output_zero_point;
+    int8_t output_min;
+    int8_t output_max;
+  } fp32_neonv8;  // ARM / ARM64 only: no magic-bias fields.
+  struct {
+    int8_t a_zero_point[2];
+    int8_t b_zero_point[2];
+    int32_t left_pre_shift;
+    int32_t multiplier;
+    int32_t left_post_shift;
+    int16_t output_zero_point;
+    int8_t output_min;
+    int8_t output_max;
+  } rndnu_neon;  // ARM / ARM64 only: integer multiplier and shifts, no float scale.
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) int16_t a_zero_point[8];
+    XNN_ALIGN(16) int16_t b_zero_point[8];
+    XNN_ALIGN(16) float scale[4];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+    XNN_ALIGN(16) int16_t output_min[8];
+    XNN_ALIGN(16) int16_t output_max[8];
+  } fp32_sse2;  // x86 / x86-64 only: output bounds are 16-bit.
+  struct {
+    XNN_ALIGN(16) int16_t a_zero_point[8];
+    XNN_ALIGN(16) int16_t b_zero_point[8];
+    XNN_ALIGN(16) float scale[4];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+    XNN_ALIGN(16) int8_t output_min[16];
+    XNN_ALIGN(16) int8_t output_max[16];
+  } fp32_sse4;  // x86 / x86-64 only: same as fp32_sse2 except the output bounds are 8-bit.
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+  struct {
+    XNN_ALIGN(8) int16_t a_zero_point[4];
+    XNN_ALIGN(8) int16_t b_zero_point[4];
+    XNN_ALIGN(8) float scale[2];
+    XNN_ALIGN(8) float magic_bias[2];
+    XNN_ALIGN(8) int32_t magic_min[2];
+    XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
+    XNN_ALIGN(8) int8_t output_max[8];
+  } fp32_wasmsimd;  // WAsm SIMD / Relaxed SIMD only: 8-byte fields.
+#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+};
+
+union xnn_qu8_mul_minmax_params {  // Unsigned counterpart of xnn_qs8_mul_minmax_params.
+  struct {
+    int32_t a_zero_point;
+    int32_t b_zero_point;
+    float scale;
+    float output_min_less_zero_point;
+    float output_max_less_zero_point;
+    float magic_bias;
+    int32_t magic_bias_less_output_zero_point;
+  } fp32_scalar;  // Unconditional layout: one value per field.
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  struct {
+    uint8_t a_zero_point[2];
+    uint8_t b_zero_point[2];
+    float scale;
+    float magic_bias;
+    int32_t magic_bias_less_output_zero_point;
+    uint8_t output_min;
+    uint8_t output_max;
+  } fp32_neon;  // ARM / ARM64 only.
+  struct {
+    uint8_t a_zero_point[2];
+    uint8_t b_zero_point[2];
+    float scale;
+    int16_t output_zero_point;
+    uint8_t output_min;
+    uint8_t output_max;
+  } fp32_neonv8;  // ARM / ARM64 only: no magic-bias fields.
+  struct {
+    uint8_t a_zero_point[2];
+    uint8_t b_zero_point[2];
+    int32_t left_pre_shift;
+    int32_t multiplier;
+    int32_t left_post_shift;
+    int16_t output_zero_point;
+    uint8_t output_min;
+    uint8_t output_max;
+  } rndnu_neon;  // ARM / ARM64 only: integer multiplier and shifts, no float scale.
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) int16_t a_zero_point[8];
+    XNN_ALIGN(16) int16_t b_zero_point[8];
+    XNN_ALIGN(16) float scale[4];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+    XNN_ALIGN(16) uint8_t output_min[16];
+    XNN_ALIGN(16) uint8_t output_max[16];
+  } fp32_sse2;  // x86 / x86-64 only; this union has no separate SSE4 variant.
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+  struct {
+    XNN_ALIGN(8) int16_t a_zero_point[4];
+    XNN_ALIGN(8) int16_t b_zero_point[4];
+    XNN_ALIGN(8) float scale[2];
+    XNN_ALIGN(8) float magic_bias[2];
+    XNN_ALIGN(8) int32_t magic_min[2];
+    XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
+    XNN_ALIGN(8) uint8_t output_max[8];
+  } fp32_wasmsimd;  // WAsm SIMD / Relaxed SIMD only: 8-byte fields.
+#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+};
+
+
+// AvgPool Min+Max: used by quantized GAVGPOOL microkernels with MINMAX activation.
+
+union xnn_qs8_avgpool_minmax_params {  // Every variant starts with an initial bias, then requantization fields and bounds.
+  struct {
+    int32_t init_bias;
+    float scale;
+    float output_min_less_zero_point;
+    float output_max_less_zero_point;
+    float magic_bias;
+    int32_t magic_bias_less_output_zero_point;
+  } fp32_scalar_fmagic;  // Unconditional layout: clamping bounds are floats.
+  struct {
+    int32_t init_bias;
+    float scale;
+    float magic_bias;
+    int32_t magic_min;
+    int32_t magic_max;
+    int32_t magic_bias_less_zero_point;
+  } fp32_scalar_imagic;  // Unconditional layout: clamping bounds are integers.
+  struct {
+    int32_t init_bias;
+    float scale;
+    float output_min_less_zero_point;
+    float output_max_less_zero_point;
+    int32_t output_zero_point;
+  } fp32_scalar_lrintf;  // Unconditional layout: no magic-bias fields.
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  struct {
+    int32_t init_bias;
+    float scale;
+    float magic_bias;
+    int32_t magic_bias_less_output_zero_point;
+    int8_t output_min;
+    int8_t output_max;
+  } fp32_neon;  // ARM / ARM64 only.
+  struct {
+    int32_t init_bias;
+    float scale;
+    int16_t output_zero_point;
+    int8_t output_min;
+    int8_t output_max;
+  } fp32_neonv8;  // ARM / ARM64 only: no magic-bias fields.
+  struct {
+    int32_t init_bias;
+    int32_t left_pre_shift;
+    int32_t multiplier;
+    int32_t left_post_shift;
+    int16_t output_zero_point;
+    int8_t output_min;
+    int8_t output_max;
+  } rndnu_neon;  // ARM / ARM64 only: integer multiplier and shifts, no float scale.
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) int32_t init_bias[4];
+    XNN_ALIGN(16) float scale[4];
+    XNN_ALIGN(16) float output_max_less_zero_point[4];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+    XNN_ALIGN(16) int16_t output_min[8];  // 16-bit lower bound; fp32_sse4 stores it as int8_t
+  } fp32_sse2;  // x86 / x86-64 only: 16-byte fields.
+  struct {
+    XNN_ALIGN(16) int32_t init_bias[4];
+    XNN_ALIGN(16) float scale[4];
+    XNN_ALIGN(16) float output_max_less_zero_point[4];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+    XNN_ALIGN(16) int8_t output_min[16];
+  } fp32_sse4;  // x86 / x86-64 only: 16-byte fields.
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+  struct {
+    XNN_ALIGN(8) int32_t init_bias[2];
+    XNN_ALIGN(8) float scale[2];
+    XNN_ALIGN(8) float magic_bias[2];
+    XNN_ALIGN(8) int32_t magic_min[2];
+    XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
+    XNN_ALIGN(8) int8_t output_max[8];
+  } fp32_wasmsimd;  // WAsm SIMD / Relaxed SIMD only: 8-byte fields.
+#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+};
+
+union xnn_qu8_avgpool_minmax_params {  // Unsigned counterpart of the QS8 avgpool union, plus a set of legacy layouts at the end.
+  struct {
+    int32_t init_bias;
+    float scale;
+    float output_min_less_zero_point;
+    float output_max_less_zero_point;
+    float magic_bias;
+    int32_t magic_bias_less_output_zero_point;
+  } fp32_scalar_fmagic;  // Unconditional layout: clamping bounds are floats.
+  struct {
+    int32_t init_bias;
+    float scale;
+    float magic_bias;
+    int32_t magic_min;
+    int32_t magic_max;
+    int32_t magic_bias_less_zero_point;
+  } fp32_scalar_imagic;  // Unconditional layout: clamping bounds are integers.
+  struct {
+    int32_t init_bias;
+    float scale;
+    float output_min_less_zero_point;
+    float output_max_less_zero_point;
+    int32_t output_zero_point;
+  } fp32_scalar_lrintf;  // Unconditional layout: no magic-bias fields.
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  struct {
+    int32_t init_bias;
+    float scale;
+    float magic_bias;
+    int32_t magic_bias_less_output_zero_point;
+    uint8_t output_min;
+    uint8_t output_max;
+  } fp32_neon;  // ARM / ARM64 only.
+  struct {
+    int32_t init_bias;
+    float scale;
+    int16_t output_zero_point;
+    uint8_t output_min;
+    uint8_t output_max;
+  } fp32_neonv8;  // ARM / ARM64 only: no magic-bias fields.
+  struct {
+    int32_t init_bias;
+    int32_t left_pre_shift;
+    int32_t multiplier;
+    int32_t left_post_shift;
+    int16_t output_zero_point;
+    uint8_t output_min;
+    uint8_t output_max;
+  } rndnu_neon;  // ARM / ARM64 only: integer multiplier and shifts, no float scale.
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) int32_t init_bias[4];
+    XNN_ALIGN(16) float scale[4];
+    XNN_ALIGN(16) float output_max_less_zero_point[4];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+    XNN_ALIGN(16) uint8_t output_min[16];
+  } fp32_sse2;  // x86 / x86-64 only: 16-byte fields.
+  struct {
+    XNN_ALIGN(16) int32_t init_bias[4];
+    XNN_ALIGN(16) float scale[4];
+    XNN_ALIGN(16) float output_max_less_zero_point[4];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+    XNN_ALIGN(16) uint8_t output_min[16];
+  } fp32_sse4;  // x86 / x86-64 only: field-for-field identical to fp32_sse2.
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+  struct {
+    XNN_ALIGN(8) int32_t init_bias[2];
+    XNN_ALIGN(8) float scale[2];
+    XNN_ALIGN(8) float magic_bias[2];
+    XNN_ALIGN(8) int32_t magic_min[2];
+    XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
+    XNN_ALIGN(8) uint8_t output_max[8];
+  } fp32_wasmsimd;  // WAsm SIMD / Relaxed SIMD only: 8-byte fields.
+#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+
+  // Legacy parameters used by QU8 AVGPOOL microkernels (the variants below have no fp32_/rndnu_ prefix)
+  struct {
+    int32_t bias;
+    int32_t multiplier;
+    int64_t rounding;
+    uint32_t right_shift;
+    int32_t output_min_less_zero_point;
+    int32_t output_max_less_zero_point;
+    int32_t output_zero_point;
+  } scalar;  // Unconditional legacy layout: integer multiplier, rounding, and right shift.
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  struct {
+    int32_t bias;
+    int32_t multiplier;
+    int64_t left_shift;  // NOTE(review): named left_shift while the scalar and sse2 legacy layouts use right_shift -- confirm sign convention
+    int16_t output_zero_point;
+    uint8_t output_min;
+    uint8_t output_max;
+  } neon;  // ARM / ARM64 only legacy layout.
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) int32_t bias[4];
+    XNN_ALIGN(16) uint32_t multiplier[4];
+    XNN_ALIGN(16) uint64_t rounding[2];
+    XNN_ALIGN(16) uint64_t right_shift[2];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+    XNN_ALIGN(16) uint8_t output_min[16];
+    XNN_ALIGN(16) uint8_t output_max[16];
+  } sse2;  // x86 / x86-64 only legacy layout: 16-byte fields.
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+};
+
+
+// Abs: used by VABS microkernels.
+
+// Parameters for the f16 Abs microkernels. The union overlays one struct per
+// kernel variant; only the SSE variant is declared, and only on x86 builds.
+union xnn_f16_abs_params {
+  char _; // Dummy member variable to comply with the C standard
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    // 8 x uint16_t, declared with XNN_ALIGN(16).
+    // NOTE(review): presumably the same mask repeated in every element - confirm in the init function.
+    XNN_ALIGN(16) uint16_t nonsign_mask[8];
+  } sse;
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+};
+
+// Parameters for the f32 Abs microkernels. The union overlays one struct per
+// kernel variant; every variant is guarded by an architecture check, so the
+// dummy member keeps the union non-empty when none of them is compiled.
+union xnn_f32_abs_params {
+  char _; // Dummy member variable to comply with the C standard
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) float nonsign_mask[4];
+  } sse;
+  struct {
+    XNN_ALIGN(32) float nonsign_mask[8];
+    // NOTE(review): purpose of mask_table is not visible here; presumably the
+    // source of load/store masks for partial vectors - confirm in the AVX kernels.
+    int32_t mask_table[14];
+  } avx;
+  struct {
+    // Single 32-bit value rather than a per-lane array.
+    uint32_t nonsign_mask;
+  } avx512;
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+  struct {
+    XNN_ALIGN(8) float nonsign_mask[2];
+  } wasmsimd;
+#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+};
+
+
+// Cvt (Convert): used by VCVT microkernels.
+
+// Parameters for the f16 -> f32 Convert microkernels. The union overlays one
+// struct per kernel variant: `scalar` is always declared, the others only on
+// the matching architecture.
+union xnn_f16_f32_cvt_params {
+  // Portable variant: one value per constant.
+  struct {
+    uint32_t sign_mask;
+    uint32_t exp_offset;
+    float exp_scale;
+    uint32_t magic_mask;
+    float magic_bias;
+    uint32_t denorm_cutoff;
+  } scalar;
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  // NEON variant needs only the exponent scale.
+  struct {
+    float exp_scale;
+  } neon;
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  // Variant whose integer constants are 16-bit elements.
+  struct {
+    XNN_ALIGN(16) uint16_t sign_mask[8];
+    XNN_ALIGN(16) uint16_t exp_offset[8];
+    XNN_ALIGN(16) float exp_scale[4];
+    XNN_ALIGN(16) uint16_t magic_mask[8];
+    XNN_ALIGN(16) float magic_bias[4];
+    XNN_ALIGN(16) int16_t denorm_cutoff[8];
+  } sse_int16;
+  // Variant whose integer constants are 32-bit elements; it has no magic_mask
+  // and stores magic_bias as uint32_t rather than float.
+  struct {
+    XNN_ALIGN(16) uint32_t sign_mask[4];
+    XNN_ALIGN(16) uint32_t exp_offset[4];
+    XNN_ALIGN(16) float exp_scale[4];
+    XNN_ALIGN(16) uint32_t magic_bias[4];
+    XNN_ALIGN(16) int32_t denorm_cutoff[4];
+  } sse_int32;
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+  // Same field set as sse_int16, with XNN_ALIGN(8) and half the element count.
+  struct {
+    XNN_ALIGN(8) uint16_t sign_mask[4];
+    XNN_ALIGN(8) uint16_t exp_offset[4];
+    XNN_ALIGN(8) float exp_scale[2];
+    XNN_ALIGN(8) uint16_t magic_mask[4];
+    XNN_ALIGN(8) float magic_bias[2];
+    XNN_ALIGN(8) int16_t denorm_cutoff[4];
+  } wasmsimd_int16;
+  // Same field set as sse_int32, with XNN_ALIGN(8) and half the element count.
+  struct {
+    XNN_ALIGN(8) uint32_t sign_mask[2];
+    XNN_ALIGN(8) uint32_t exp_offset[2];
+    XNN_ALIGN(8) float exp_scale[2];
+    XNN_ALIGN(8) uint32_t magic_bias[2];
+    XNN_ALIGN(8) int32_t denorm_cutoff[2];
+  } wasmsimd_int32;
+#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+};
+
+// Parameters for the f32 -> f16 Convert microkernels. The union overlays one
+// struct per kernel variant: the two scalar variants are always declared, the
+// others only on the matching architecture.
+union xnn_f32_f16_cvt_params {
+  struct {
+    uint32_t nonsign_mask;
+    uint32_t exp_bias;
+    float scale_to_inf;
+    uint32_t expw_max;
+    float scale_to_zero;
+    uint32_t bias_min;
+    uint16_t exph_mask;
+    uint16_t manth_mask;
+    uint16_t nanh;
+  } scalar_bitcast;
+  // Same constants as scalar_bitcast minus nonsign_mask, in a different order.
+  struct {
+    float scale_to_inf;
+    uint32_t exp_bias;
+    float scale_to_zero;
+    uint32_t expw_max;
+    uint32_t bias_min;
+    uint16_t exph_mask;
+    uint16_t manth_mask;
+    uint16_t nanh;
+  } scalar_fabsf;
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  struct {
+    uint32_t exp_bias;
+    float scale_to_inf;
+    uint32_t expw_max;
+    float scale_to_zero;
+  } neon;
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  // Every constant is a 16-byte array declared with XNN_ALIGN(16).
+  struct {
+    XNN_ALIGN(16) uint32_t nonsign_mask[4];
+    XNN_ALIGN(16) uint32_t exp_bias[4];
+    XNN_ALIGN(16) float scale_to_inf[4];
+    XNN_ALIGN(16) uint32_t expw_max[4];
+    XNN_ALIGN(16) float scale_to_zero[4];
+    XNN_ALIGN(16) int16_t bias_min[8];
+    XNN_ALIGN(16) uint32_t manth_mask[4];
+    XNN_ALIGN(16) uint32_t exph_mask[4];
+    XNN_ALIGN(16) uint16_t nanh[8];
+  } sse2;
+  // NOTE(review): no conversion constants here; mask_table is presumably the
+  // source of masks for partial vectors - confirm in the F16C kernels.
+  struct {
+    int32_t mask_table[14];
+  } f16c;
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+  // Same field set as sse2 minus nonsign_mask, with XNN_ALIGN(8) and half the
+  // element count.
+  struct {
+    XNN_ALIGN(8) uint32_t exp_bias[2];
+    XNN_ALIGN(8) float scale_to_inf[2];
+    XNN_ALIGN(8) uint32_t expw_max[2];
+    XNN_ALIGN(8) float scale_to_zero[2];
+    XNN_ALIGN(8) int16_t bias_min[4];
+    XNN_ALIGN(8) uint32_t manth_mask[2];
+    XNN_ALIGN(8) uint32_t exph_mask[2];
+    XNN_ALIGN(8) uint16_t nanh[4];
+  } wasmsimd;
+#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+};
+
+// Parameters for the f32 -> qs8 Convert microkernels. The union overlays one
+// struct per kernel variant: the three scalar variants are always declared,
+// the others only on the matching architecture. Output bounds are signed
+// 8-bit (int8_t) except in the sse2 variant, which stores them as int16_t.
+// NOTE(review): mask_table in the avx/avx2 variants is presumably the source of
+// masks for partial vectors - confirm in the kernels.
+union xnn_f32_qs8_cvt_params {
+  struct {
+    float scale;
+    float output_min_less_zero_point;
+    float output_max_less_zero_point;
+    float magic_bias;
+    int32_t magic_bias_less_zero_point;
+  } scalar_fmagic;
+  struct {
+    float scale;
+    float magic_bias;
+    int32_t magic_min;
+    int32_t magic_max;
+    int32_t magic_bias_less_zero_point;
+  } scalar_imagic;
+  struct {
+    float scale;
+    float output_min_less_zero_point;
+    float output_max_less_zero_point;
+    int32_t output_zero_point;
+  } scalar_lrintf;
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  struct {
+    float scale;
+    float magic_bias;
+    int32_t magic_bias_less_zero_point;
+    int8_t output_min;
+    int8_t output_max;
+  } neon;
+  struct {
+    float scale;
+    int16_t output_zero_point;
+    int8_t output_min;
+    int8_t output_max;
+  } neonv8;
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) float scale[4];
+    XNN_ALIGN(16) float output_max_less_zero_point[4];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+    XNN_ALIGN(16) int16_t output_min[8];
+  } sse2;
+  struct {
+    XNN_ALIGN(16) float scale[4];
+    XNN_ALIGN(16) float output_max_less_zero_point[4];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+    XNN_ALIGN(16) int8_t output_min[16];
+  } sse4;
+  // Float constants are 32-byte arrays; the integer constants stay at 16 bytes.
+  struct {
+    XNN_ALIGN(32) float scale[8];
+    XNN_ALIGN(32) float output_max_less_zero_point[8];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+    XNN_ALIGN(16) int8_t output_min[16];
+    int32_t mask_table[14];
+  } avx;
+  struct {
+    XNN_ALIGN(32) float scale[8];
+    XNN_ALIGN(32) float output_max_less_zero_point[8];
+    XNN_ALIGN(32) int16_t output_zero_point[16];
+    XNN_ALIGN(32) uint32_t shuffle_mask[8];
+    XNN_ALIGN(32) int8_t output_min[32];
+    int32_t mask_table[14];
+  } avx2;
+  struct {
+    XNN_ALIGN(64) float scale[16];
+    XNN_ALIGN(64) float output_max_less_zero_point[16];
+    XNN_ALIGN(64) int16_t output_zero_point[32];
+    XNN_ALIGN(64) int8_t output_min[64];
+    XNN_ALIGN(64) uint32_t shuffle512_mask[16];
+    XNN_ALIGN(32) uint32_t shuffle256_mask[8];
+  } avx512;
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+  struct {
+    XNN_ALIGN(8) float scale[2];
+    XNN_ALIGN(8) int16_t output_zero_point[4];
+    XNN_ALIGN(8) int8_t output_min[8];
+    XNN_ALIGN(8) int8_t output_max[8];
+  } wasmsimd_cvt;
+  struct {
+    XNN_ALIGN(8) float scale[2];
+    XNN_ALIGN(8) float magic_bias[2];
+    XNN_ALIGN(8) int32_t magic_min[2];
+    XNN_ALIGN(8) int32_t magic_bias_less_zero_point[2];
+    XNN_ALIGN(8) int8_t output_max[8];
+  } wasmsimd_magic;
+#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+};
+
+// Parameters for the f32 -> qu8 Convert microkernels. The union overlays one
+// struct per kernel variant: the three scalar variants are always declared,
+// the others only on the matching architecture. Output bounds are unsigned
+// 8-bit (uint8_t). Unlike the qs8 counterpart, there is no sse4 variant.
+// NOTE(review): mask_table in the avx/avx2 variants is presumably the source of
+// masks for partial vectors - confirm in the kernels.
+union xnn_f32_qu8_cvt_params {
+  struct {
+    float scale;
+    float output_min_less_zero_point;
+    float output_max_less_zero_point;
+    float magic_bias;
+    int32_t magic_bias_less_zero_point;
+  } scalar_fmagic;
+  struct {
+    float scale;
+    float magic_bias;
+    int32_t magic_min;
+    int32_t magic_max;
+    int32_t magic_bias_less_zero_point;
+  } scalar_imagic;
+  struct {
+    float scale;
+    float output_min_less_zero_point;
+    float output_max_less_zero_point;
+    int32_t output_zero_point;
+  } scalar_lrintf;
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  struct {
+    float scale;
+    float magic_bias;
+    int32_t magic_bias_less_zero_point;
+    uint8_t output_min;
+    uint8_t output_max;
+  } neon;
+  struct {
+    float scale;
+    int16_t output_zero_point;
+    uint8_t output_min;
+    uint8_t output_max;
+  } neonv8;
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) float scale[4];
+    XNN_ALIGN(16) float output_max_less_zero_point[4];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+    XNN_ALIGN(16) uint8_t output_min[16];
+  } sse2;
+  // Float constants are 32-byte arrays; the integer constants stay at 16 bytes.
+  struct {
+    XNN_ALIGN(32) float scale[8];
+    XNN_ALIGN(32) float output_max_less_zero_point[8];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+    XNN_ALIGN(16) uint8_t output_min[16];
+    int32_t mask_table[14];
+  } avx;
+  struct {
+    XNN_ALIGN(32) float scale[8];
+    XNN_ALIGN(32) float output_max_less_zero_point[8];
+    XNN_ALIGN(32) int16_t output_zero_point[16];
+    XNN_ALIGN(32) uint32_t shuffle_mask[8];
+    XNN_ALIGN(32) uint8_t output_min[32];
+    int32_t mask_table[14];
+  } avx2;
+  struct {
+    XNN_ALIGN(64) float scale[16];
+    XNN_ALIGN(64) float output_max_less_zero_point[16];
+    XNN_ALIGN(64) int16_t output_zero_point[32];
+    XNN_ALIGN(64) uint8_t output_min[64];
+    XNN_ALIGN(64) uint32_t shuffle512_mask[16];
+    XNN_ALIGN(32) uint32_t shuffle256_mask[8];
+  } avx512;
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+  struct {
+    XNN_ALIGN(8) float scale[2];
+    XNN_ALIGN(8) int16_t output_zero_point[4];
+    XNN_ALIGN(8) uint8_t output_min[8];
+    XNN_ALIGN(8) uint8_t output_max[8];
+  } wasmsimd_cvt;
+  struct {
+    XNN_ALIGN(8) float scale[2];
+    XNN_ALIGN(8) float magic_bias[2];
+    XNN_ALIGN(8) int32_t magic_min[2];
+    XNN_ALIGN(8) int32_t magic_bias_less_zero_point[2];
+    XNN_ALIGN(8) uint8_t output_max[8];
+  } wasmsimd_magic;
+#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+};
+
+// Parameters for the qs8 -> qs8 Convert microkernels. The union overlays one
+// struct per kernel variant: `scalar` is always declared, the others only on
+// the matching architecture. Two field sets appear: {bias, multiplier} and
+// {input_zero_point, multiplier, output_zero_point}.
+union xnn_qs8_cvt_params {
+  struct {
+    int32_t bias;
+    int32_t multiplier;
+  } scalar;
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  struct {
+    // NOTE(review): stored as uint32_t although the name suggests a negated
+    // value; presumably a packed bit pattern - confirm in the init function.
+    uint32_t minus_input_zero_point;
+    int32_t multiplier;
+    int32_t bias;
+  } armv6simd;
+  struct {
+    int16_t input_zero_point;
+    int16_t multiplier;
+    int16_t output_zero_point;
+  } neon;
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) int16_t multiplier[8];
+    XNN_ALIGN(16) int32_t bias[4];
+  } sse2;
+  struct {
+    XNN_ALIGN(16) int16_t input_zero_point[8];
+    XNN_ALIGN(16) int16_t multiplier[8];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+  } ssse3;
+  struct {
+    XNN_ALIGN(32) int16_t input_zero_point[16];
+    XNN_ALIGN(32) int16_t multiplier[16];
+    XNN_ALIGN(32) int16_t output_zero_point[16];
+  } avx2;
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+  struct {
+    XNN_ALIGN(8) int16_t input_zero_point[4];
+    XNN_ALIGN(8) int16_t multiplier[4];
+    XNN_ALIGN(8) int16_t output_zero_point[4];
+  } wasmsimd;
+#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+};
+
+// Parameters for the qs8 -> f32 Convert microkernels. The union overlays one
+// struct per kernel variant: `scalar` is always declared, the others only on
+// the matching architecture. Most variants hold a zero point (or its
+// negation) plus a scale; sse2 uses magic_exp/magic_bias constants instead.
+union xnn_qs8_f32_cvt_params {
+  struct {
+    int32_t zero_point;
+    float scale;
+  } scalar;
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  struct {
+    // Two int16_t elements occupy 4 bytes, so `scale` follows without padding.
+    int16_t minus_zero_point[2];
+    float scale;
+  } neon;
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  // Carries a sign_mask that the qu8 counterpart's sse2 variant does not have.
+  struct {
+    XNN_ALIGN(16) uint8_t sign_mask[16];
+    XNN_ALIGN(16) uint16_t magic_exp[8];
+    XNN_ALIGN(16) float magic_bias[4];
+    XNN_ALIGN(16) float scale[4];
+  } sse2;
+  struct {
+    XNN_ALIGN(16) int32_t minus_zero_point[4];
+    XNN_ALIGN(16) float scale[4];
+  } sse4;
+  struct {
+    XNN_ALIGN(32) int32_t minus_zero_point[8];
+    XNN_ALIGN(32) float scale[8];
+  } avx;
+  struct {
+    XNN_ALIGN(64) int32_t minus_zero_point[16];
+    XNN_ALIGN(64) float scale[16];
+  } avx512;
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+  struct {
+    XNN_ALIGN(8) int16_t minus_zero_point[4];
+    XNN_ALIGN(8) float scale[2];
+  } wasmsimd;
+#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+};
+
+// Parameters for the qu8 -> qu8 Convert microkernels. The union overlays one
+// struct per kernel variant: `scalar` is always declared, the others only on
+// the matching architecture. The layout mirrors the qs8 counterpart except
+// that input_zero_point (and the sse2 multiplier) are unsigned 16-bit.
+union xnn_qu8_cvt_params {
+  struct {
+    int32_t bias;
+    int32_t multiplier;
+  } scalar;
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  struct {
+    // NOTE(review): stored as uint32_t although the name suggests a negated
+    // value; presumably a packed bit pattern - confirm in the init function.
+    uint32_t minus_input_zero_point;
+    int32_t multiplier;
+    int32_t bias;
+  } armv6simd;
+  struct {
+    uint16_t input_zero_point;
+    int16_t multiplier;
+    int16_t output_zero_point;
+  } neon;
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) uint16_t multiplier[8];
+    XNN_ALIGN(16) int32_t bias[4];
+  } sse2;
+  struct {
+    XNN_ALIGN(16) uint16_t input_zero_point[8];
+    XNN_ALIGN(16) int16_t multiplier[8];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+  } ssse3;
+  struct {
+    XNN_ALIGN(32) uint16_t input_zero_point[16];
+    XNN_ALIGN(32) int16_t multiplier[16];
+    XNN_ALIGN(32) int16_t output_zero_point[16];
+  } avx2;
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+  struct {
+    XNN_ALIGN(8) uint16_t input_zero_point[4];
+    XNN_ALIGN(8) int16_t multiplier[4];
+    XNN_ALIGN(8) int16_t output_zero_point[4];
+  } wasmsimd;
+#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+};
+
+// Parameters for the qu8 -> f32 Convert microkernels. The union overlays one
+// struct per kernel variant: `scalar` is always declared, the others only on
+// the matching architecture. Most variants hold a zero point (or its
+// negation) plus a scale; sse2 uses magic_exp/magic_bias constants instead.
+union xnn_qu8_f32_cvt_params {
+  struct {
+    int32_t zero_point;
+    float scale;
+  } scalar;
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  struct {
+    // Two int16_t elements occupy 4 bytes, so `scale` follows without padding.
+    int16_t minus_zero_point[2];
+    float scale;
+  } neon;
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  // Unlike the qs8 counterpart, this variant has no sign_mask.
+  struct {
+    XNN_ALIGN(16) uint16_t magic_exp[8];
+    XNN_ALIGN(16) float magic_bias[4];
+    XNN_ALIGN(16) float scale[4];
+  } sse2;
+  struct {
+    XNN_ALIGN(16) int32_t minus_zero_point[4];
+    XNN_ALIGN(16) float scale[4];
+  } sse4;
+  struct {
+    XNN_ALIGN(32) int32_t minus_zero_point[8];
+    XNN_ALIGN(32) float scale[8];
+  } avx;
+  struct {
+    XNN_ALIGN(64) int32_t minus_zero_point[16];
+    XNN_ALIGN(64) float scale[16];
+  } avx512;
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+  struct {
+    XNN_ALIGN(8) int16_t minus_zero_point[4];
+    XNN_ALIGN(8) float scale[2];
+  } wasmsimd;
+#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+};
+
+
+// ELU: used by VELU microkernels.
+
+// Parameters for the f16 ELU microkernels. The union overlays one struct per
+// kernel variant; every variant is guarded by an architecture check, so the
+// dummy member keeps the union non-empty when none of them is compiled.
+union xnn_f16_elu_params {
+  char _; // Dummy member variable to comply with the C standard
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  // All constants are 16-bit words.
+  // NOTE(review): presumably IEEE half-precision bit patterns - confirm in the init function.
+  // This variant stores minus_alpha and has no c1, unlike avx2_rr1_p3.
+  struct {
+    uint16_t prescale;
+    uint16_t sat_cutoff;
+    uint16_t magic_bias;
+    uint16_t log2e;
+    uint16_t minus_ln2;
+    uint16_t c3;
+    uint16_t c2;
+    uint16_t minus_alpha;
+    uint16_t beta;
+  } neonfp16arith_rr1_p3;
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  // Constants are kept as 32-bit floats (8 per field) even though the kernel
+  // family is f16.
+  struct {
+    XNN_ALIGN(32) float prescale[8];
+    XNN_ALIGN(32) float sat_cutoff[8];
+    XNN_ALIGN(32) float magic_bias[8];
+    XNN_ALIGN(32) float log2e[8];
+    XNN_ALIGN(32) float minus_ln2[8];
+    XNN_ALIGN(32) float c3[8];
+    XNN_ALIGN(32) float c2[8];
+    XNN_ALIGN(32) float c1[8];
+    XNN_ALIGN(32) float alpha[8];
+    XNN_ALIGN(32) float beta[8];
+  } avx2_rr1_p3;
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+};
+
+// Parameters for the f32 ELU microkernels. The union overlays one struct per
+// kernel variant: the two scalar variants are always declared, the others
+// only on the matching architecture.
+//
+// Patterns visible in the declarations below:
+// - *_rr2_* variants carry minus_ln2_hi and minus_ln2_lo; *_rr1_* variants
+//   carry a single minus_ln2.
+// - *_p6 variants carry c6..c2, *_p4 variants c4..c2, *_p3 variants c3..c2.
+// - Only the scalar, sse2, avx and wasmsimd variants carry a `one` constant.
+// NOTE(review): the suffixes presumably encode the number of range-reduction
+// steps (rrN), lookup-table size (lutN) and polynomial degree (pN) - confirm
+// against the kernel sources.
+// NOTE(review): mask_table in the avx/avx2 variants is presumably the source of
+// masks for partial vectors - confirm in the kernels.
+union xnn_f32_elu_params {
+  struct {
+    float prescale;
+    float alpha;
+    float beta;
+    float sat_cutoff;
+    float magic_bias;
+    float log2e;
+    float minus_ln2_hi;
+    float minus_ln2_lo;
+    float c3;
+    float c2;
+    float one;
+  } scalar_rr2_lut16_p3;
+  struct {
+    float prescale;
+    float alpha;
+    float beta;
+    float sat_cutoff;
+    float magic_bias;
+    float log2e;
+    float minus_ln2_hi;
+    float minus_ln2_lo;
+    float c6;
+    float c5;
+    float c4;
+    float c3;
+    float c2;
+    float one;
+  } scalar_rr2_p6;
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  // The ARM variants store one float per constant (no per-lane arrays).
+  struct {
+    float prescale;
+    float alpha;
+    float beta;
+    float sat_cutoff;
+    float magic_bias;
+    float log2e;
+    float minus_ln2_hi;
+    float minus_ln2_lo;
+    float c6;
+    float c5;
+    float c4;
+    float c3;
+    float c2;
+  } neon_rr2_p6;
+  struct {
+    float prescale;
+    float alpha;
+    float beta;
+    float sat_cutoff;
+    float magic_bias;
+    float log2e;
+    float minus_ln2_hi;
+    float minus_ln2_lo;
+    float c3;
+    float c2;
+  } neon_rr2_lut16_p3;
+  struct {
+    float prescale;
+    float alpha;
+    float beta;
+    float sat_cutoff;
+    float magic_bias;
+    float log2e;
+    float minus_ln2;
+    float c6;
+    float c5;
+    float c4;
+    float c3;
+    float c2;
+  } neonfma_rr1_p6;
+  struct {
+    float prescale;
+    float alpha;
+    float beta;
+    float sat_cutoff;
+    float magic_bias;
+    float log2e;
+    float minus_ln2;
+    float c3;
+    float c2;
+  } neonfma_rr1_lut16_p3;
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  // SSE2 variants: every constant is a 4-element, XNN_ALIGN(16) array.
+  struct {
+    XNN_ALIGN(16) float prescale[4];
+    XNN_ALIGN(16) float alpha[4];
+    XNN_ALIGN(16) float beta[4];
+    XNN_ALIGN(16) float sat_cutoff[4];
+    XNN_ALIGN(16) float magic_bias[4];
+    XNN_ALIGN(16) float log2e[4];
+    XNN_ALIGN(16) uint32_t index_mask[4];
+    XNN_ALIGN(16) float minus_ln2_hi[4];
+    XNN_ALIGN(16) float minus_ln2_lo[4];
+    XNN_ALIGN(16) float c3[4];
+    XNN_ALIGN(16) float c2[4];
+    XNN_ALIGN(16) float one[4];
+  } sse2_rr2_lut16_p3;
+  struct {
+    XNN_ALIGN(16) float prescale[4];
+    XNN_ALIGN(16) float alpha[4];
+    XNN_ALIGN(16) float beta[4];
+    XNN_ALIGN(16) float sat_cutoff[4];
+    XNN_ALIGN(16) float magic_bias[4];
+    XNN_ALIGN(16) float log2e[4];
+    XNN_ALIGN(16) float minus_ln2_hi[4];
+    XNN_ALIGN(16) float minus_ln2_lo[4];
+    XNN_ALIGN(16) float c6[4];
+    XNN_ALIGN(16) float c5[4];
+    XNN_ALIGN(16) float c4[4];
+    XNN_ALIGN(16) float c3[4];
+    XNN_ALIGN(16) float c2[4];
+    XNN_ALIGN(16) float one[4];
+  } sse2_rr2_p6;
+  // AVX/AVX2 variants: every constant is an 8-element, XNN_ALIGN(32) array,
+  // followed by an unaligned mask_table.
+  struct {
+    XNN_ALIGN(32) float prescale[8];
+    XNN_ALIGN(32) float alpha[8];
+    XNN_ALIGN(32) float beta[8];
+    XNN_ALIGN(32) float sat_cutoff[8];
+    XNN_ALIGN(32) float magic_bias[8];
+    XNN_ALIGN(32) float log2e[8];
+    XNN_ALIGN(32) uint32_t index_mask[8];
+    XNN_ALIGN(32) float minus_ln2_hi[8];
+    XNN_ALIGN(32) float minus_ln2_lo[8];
+    XNN_ALIGN(32) float c3[8];
+    XNN_ALIGN(32) float c2[8];
+    XNN_ALIGN(32) float one[8];
+    int32_t mask_table[14];
+  } avx_rr2_lut16_p3;
+  // Embeds its lookup table in the params as `table` (float elements).
+  struct {
+    XNN_ALIGN(32) float prescale[8];
+    XNN_ALIGN(32) float alpha[8];
+    XNN_ALIGN(32) float beta[8];
+    XNN_ALIGN(32) float sat_cutoff[8];
+    XNN_ALIGN(32) float magic_bias[8];
+    XNN_ALIGN(32) float log2e[8];
+    XNN_ALIGN(32) uint32_t index_mask[8];
+    XNN_ALIGN(32) float table[8];
+    XNN_ALIGN(32) float minus_ln2_hi[8];
+    XNN_ALIGN(32) float minus_ln2_lo[8];
+    XNN_ALIGN(32) float c4[8];
+    XNN_ALIGN(32) float c3[8];
+    XNN_ALIGN(32) float c2[8];
+    XNN_ALIGN(32) float one[8];
+    int32_t mask_table[14];
+  } avx_rr2_lut4_p4;
+  struct {
+    XNN_ALIGN(32) float prescale[8];
+    XNN_ALIGN(32) float alpha[8];
+    XNN_ALIGN(32) float beta[8];
+    XNN_ALIGN(32) float sat_cutoff[8];
+    XNN_ALIGN(32) float magic_bias[8];
+    XNN_ALIGN(32) float log2e[8];
+    XNN_ALIGN(32) float minus_ln2_hi[8];
+    XNN_ALIGN(32) float minus_ln2_lo[8];
+    XNN_ALIGN(32) float c6[8];
+    XNN_ALIGN(32) float c5[8];
+    XNN_ALIGN(32) float c4[8];
+    XNN_ALIGN(32) float c3[8];
+    XNN_ALIGN(32) float c2[8];
+    XNN_ALIGN(32) float one[8];
+    int32_t mask_table[14];
+  } avx_rr2_p6;
+  struct {
+    XNN_ALIGN(32) float prescale[8];
+    XNN_ALIGN(32) float alpha[8];
+    XNN_ALIGN(32) float beta[8];
+    XNN_ALIGN(32) float sat_cutoff[8];
+    XNN_ALIGN(32) float magic_bias[8];
+    XNN_ALIGN(32) float log2e[8];
+    XNN_ALIGN(32) uint32_t index_mask[8];
+    XNN_ALIGN(32) float minus_ln2[8];
+    XNN_ALIGN(32) float c3[8];
+    XNN_ALIGN(32) float c2[8];
+    int32_t mask_table[14];
+  } avx2_rr1_lut16_p3;
+  // Embeds its lookup table as `table` with uint32_t elements (the lut4
+  // variants use float elements).
+  struct {
+    XNN_ALIGN(32) float prescale[8];
+    XNN_ALIGN(32) float alpha[8];
+    XNN_ALIGN(32) float beta[8];
+    XNN_ALIGN(32) float sat_cutoff[8];
+    XNN_ALIGN(32) float magic_bias[8];
+    XNN_ALIGN(32) float log2e[8];
+    XNN_ALIGN(32) uint32_t table[8];
+    XNN_ALIGN(32) float minus_ln2[8];
+    XNN_ALIGN(32) float c4[8];
+    XNN_ALIGN(32) float c3[8];
+    XNN_ALIGN(32) float c2[8];
+    int32_t mask_table[14];
+  } avx2_rr1_lut8_p4;
+  struct {
+    XNN_ALIGN(32) float prescale[8];
+    XNN_ALIGN(32) float alpha[8];
+    XNN_ALIGN(32) float beta[8];
+    XNN_ALIGN(32) float sat_cutoff[8];
+    XNN_ALIGN(32) float magic_bias[8];
+    XNN_ALIGN(32) float log2e[8];
+    XNN_ALIGN(32) float table[8];
+    XNN_ALIGN(32) float minus_ln2[8];
+    XNN_ALIGN(32) float c4[8];
+    XNN_ALIGN(32) float c3[8];
+    XNN_ALIGN(32) float c2[8];
+    int32_t mask_table[14];
+  } avx2_rr1_lut4_p4;
+  struct {
+    XNN_ALIGN(32) float prescale[8];
+    XNN_ALIGN(32) float alpha[8];
+    XNN_ALIGN(32) float beta[8];
+    XNN_ALIGN(32) float sat_cutoff[8];
+    XNN_ALIGN(32) float magic_bias[8];
+    XNN_ALIGN(32) float log2e[8];
+    XNN_ALIGN(32) float minus_ln2[8];
+    XNN_ALIGN(32) float c6[8];
+    XNN_ALIGN(32) float c5[8];
+    XNN_ALIGN(32) float c4[8];
+    XNN_ALIGN(32) float c3[8];
+    XNN_ALIGN(32) float c2[8];
+    int32_t mask_table[14];
+  } avx2_rr1_p6;
+  // AVX512 variants store one float per constant; only the 16-entry table is
+  // a vector-sized, XNN_ALIGN(64) array.
+  struct {
+    float prescale;
+    float alpha;
+    float beta;
+    float sat_cutoff;
+    float magic_bias;
+    float log2e;
+    float minus_ln2;
+    float c3;
+    float c2;
+    XNN_ALIGN(64) uint32_t table[16];
+  } avx512_rr1_lut16_p3;
+  struct {
+    float prescale;
+    float alpha;
+    float beta;
+    float sat_cutoff;
+    float magic_bias;
+    float log2e;
+    float minus_ln2;
+    float c6;
+    float c5;
+    float c4;
+    float c3;
+    float c2;
+  } avx512_rr1_p6;
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+  // WAsm SIMD variants: every constant is a 2-element, XNN_ALIGN(8) array.
+  struct {
+    XNN_ALIGN(8) float prescale[2];
+    XNN_ALIGN(8) float alpha[2];
+    XNN_ALIGN(8) float beta[2];
+    XNN_ALIGN(8) float sat_cutoff[2];
+    XNN_ALIGN(8) float magic_bias[2];
+    XNN_ALIGN(8) float log2e[2];
+    XNN_ALIGN(8) uint32_t index_mask[2];
+    XNN_ALIGN(8) float minus_ln2_hi[2];
+    XNN_ALIGN(8) float minus_ln2_lo[2];
+    XNN_ALIGN(8) float c3[2];
+    XNN_ALIGN(8) float c2[2];
+    XNN_ALIGN(8) float one[2];
+  } wasmsimd_rr2_lut16_p3;
+  struct {
+    XNN_ALIGN(8) float prescale[2];
+    XNN_ALIGN(8) float alpha[2];
+    XNN_ALIGN(8) float beta[2];
+    XNN_ALIGN(8) float sat_cutoff[2];
+    XNN_ALIGN(8) float magic_bias[2];
+    XNN_ALIGN(8) float log2e[2];
+    XNN_ALIGN(8) float minus_ln2_hi[2];
+    XNN_ALIGN(8) float minus_ln2_lo[2];
+    XNN_ALIGN(8) float c6[2];
+    XNN_ALIGN(8) float c5[2];
+    XNN_ALIGN(8) float c4[2];
+    XNN_ALIGN(8) float c3[2];
+    XNN_ALIGN(8) float c2[2];
+    XNN_ALIGN(8) float one[2];
+  } wasmsimd_rr2_p6;
+#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+};
+
+
+// ExpMinus: used by RADDEXPMINUSMAX microkernels.
+
+// Parameters for the f16 ExpMinus (RADDEXPMINUSMAX) microkernels. The union
+// overlays one struct per kernel variant; every variant is guarded by an
+// architecture check, so the dummy member keeps the union non-empty when none
+// of them is compiled.
+union xnn_f16_expminus_params {
+  char _; // Dummy member variable to comply with the C standard
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  // All constants are 16-bit words.
+  // NOTE(review): presumably IEEE half-precision bit patterns - confirm in the init function.
+  struct {
+    uint16_t magic_bias;
+    uint16_t log2e;
+    uint16_t minus_ln2_hi;
+    uint16_t minus_ln2_lo;
+    uint16_t c2;
+    uint16_t c1;
+    uint16_t denorm_cutoff;
+  } neonfp16arith_rr2_p2;
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  // Constants are kept as 32-bit floats (8 per field) even though the kernel
+  // family is f16; a single minus_ln2 replaces the hi/lo pair.
+  struct {
+    XNN_ALIGN(32) float magic_bias[8];
+    XNN_ALIGN(32) float log2e[8];
+    XNN_ALIGN(32) float minus_ln2[8];
+    XNN_ALIGN(32) float c2[8];
+    XNN_ALIGN(32) float c1[8];
+    XNN_ALIGN(32) float denorm_cutoff[8];
+  } avx2_rr1_p2;
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+};
+
+// Parameters for the f32 ExpMinus (RADDEXPMINUSMAX) microkernels. The union
+// overlays one struct per kernel variant: the two scalar variants are always
+// declared, the others only on the matching architecture.
+//
+// Patterns visible in the declarations below:
+// - *_rr2_* variants carry minus_ln2_hi and minus_ln2_lo; *_rr1_* variants
+//   carry a single minus_ln2.
+// - *_p5 variants carry c5..c1; *_lut64_p2 variants carry only c2.
+// NOTE(review): the suffixes presumably encode the number of range-reduction
+// steps (rrN), lookup-table size (lutN) and polynomial degree (pN) - confirm
+// against the kernel sources.
+union xnn_f32_expminus_params {
+  struct {
+    float log2e;
+    float magic_bias;
+    float minus_ln2_hi;
+    float minus_ln2_lo;
+    float c5;
+    float c4;
+    float c3;
+    float c2;
+    float c1;
+    float denorm_cutoff;
+  } scalar_rr2_p5;
+  struct {
+    float log2e;
+    float magic_bias;
+    float minus_ln2_hi;
+    float minus_ln2_lo;
+    float c2;
+    float denorm_cutoff;
+  } scalar_rr2_lut64_p2;
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  // The ARM variants store one float per constant (no per-lane arrays).
+  struct {
+    float log2e;
+    float magic_bias;
+    float minus_ln2_hi;
+    float minus_ln2_lo;
+    float c5;
+    float c4;
+    float c3;
+    float c2;
+    float c1;
+    float denorm_cutoff;
+  } neon_rr2_p5;
+  struct {
+    float log2e;
+    float magic_bias;
+    float minus_ln2_hi;
+    float minus_ln2_lo;
+    float c2;
+    float denorm_cutoff;
+  } neon_rr2_lut64_p2;
+  struct {
+    float log2e;
+    float magic_bias;
+    float minus_ln2;
+    float c5;
+    float c4;
+    float c3;
+    float c2;
+    float c1;
+    float denorm_cutoff;
+  } neonfma_rr1_p5;
+  struct {
+    float log2e;
+    float magic_bias;
+    float minus_ln2;
+    float c2;
+    float denorm_cutoff;
+  } neonfma_rr1_lut64_p2;
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) float log2e[4];
+    XNN_ALIGN(16) float magic_bias[4];
+    XNN_ALIGN(16) float minus_ln2_hi[4];
+    XNN_ALIGN(16) float minus_ln2_lo[4];
+    XNN_ALIGN(16) float c5[4];
+    XNN_ALIGN(16) float c4[4];
+    XNN_ALIGN(16) float c3[4];
+    XNN_ALIGN(16) float c2[4];
+    XNN_ALIGN(16) float c1[4];
+    XNN_ALIGN(16) float denorm_cutoff[4];
+  } sse2_rr2_p5;
+  // NOTE(review): mask_table is presumably the source of masks for partial
+  // vectors - confirm in the AVX2 kernels.
+  struct {
+    XNN_ALIGN(32) float log2e[8];
+    XNN_ALIGN(32) float magic_bias[8];
+    XNN_ALIGN(32) float minus_ln2[8];
+    XNN_ALIGN(32) float c5[8];
+    XNN_ALIGN(32) float c4[8];
+    XNN_ALIGN(32) float c3[8];
+    XNN_ALIGN(32) float c2[8];
+    XNN_ALIGN(32) float c1[8];
+    XNN_ALIGN(32) float denorm_cutoff[8];
+    int32_t mask_table[14];
+  } avx2_rr1_p5;
+  // One float per constant. Unlike the other p5 variants, this one has no
+  // magic_bias or denorm_cutoff and adds a c0 coefficient.
+  struct {
+    float log2e;
+    float minus_ln2;
+    float c5;
+    float c4;
+    float c3;
+    float c2;
+    float c1;
+    float c0;
+  } avx512_rr1_p5;
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+  struct {
+    XNN_ALIGN(8) float log2e[2];
+    XNN_ALIGN(8) float magic_bias[2];
+    XNN_ALIGN(8) float minus_ln2_hi[2];
+    XNN_ALIGN(8) float minus_ln2_lo[2];
+    XNN_ALIGN(8) float c5[2];
+    XNN_ALIGN(8) float c4[2];
+    XNN_ALIGN(8) float c3[2];
+    XNN_ALIGN(8) float c2[2];
+    XNN_ALIGN(8) float c1[2];
+    XNN_ALIGN(8) float denorm_cutoff[2];
+  } wasmsimd_rr2_p5;
+#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+};
+
+
+// HSwish: used by VHSWISH microkernels.
+
+// Parameters for the f16 HSwish microkernels. The union overlays one struct
+// per kernel variant; every variant is guarded by an architecture check, so
+// the dummy member keeps the union non-empty (and therefore valid standard C)
+// on targets that are neither ARM nor x86.
+union xnn_f16_hswish_params {
+  char _; // Dummy member variable to comply with the C standard
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  // All constants are 16-bit words.
+  // NOTE(review): presumably IEEE half-precision bit patterns - confirm in the init function.
+  struct {
+    uint16_t sixth;
+    uint16_t three;
+    uint16_t six;
+    uint16_t pad;  // pad to 8 bytes for neonfp16arith assembly.
+  } neon;
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  // sixth/three are 32-bit floats (8 per field); six is stored as eight 16-bit
+  // words with XNN_ALIGN(16).
+  struct {
+    XNN_ALIGN(32) float sixth[8];
+    XNN_ALIGN(32) float three[8];
+    XNN_ALIGN(16) uint16_t six[8];
+  } avx;
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+};
+
+// Parameters for the f32 HSwish microkernels. The union overlays one struct
+// per kernel variant: `scalar` is always declared, the others only on the
+// matching architecture. Two constant sets appear: {sixth, three, six} in the
+// scalar and wasmsimd variants, {sixth, half, one} in the x86 variants.
+union xnn_f32_hswish_params {
+  struct {
+    float sixth;
+    float three;
+    float six;
+  } scalar;
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) float sixth[4];
+    XNN_ALIGN(16) float half[4];
+    XNN_ALIGN(16) float one[4];
+  } sse;
+  // NOTE(review): mask_table is presumably the source of masks for partial
+  // vectors - confirm in the AVX kernels.
+  struct {
+    XNN_ALIGN(32) float sixth[8];
+    XNN_ALIGN(32) float half[8];
+    XNN_ALIGN(32) float one[8];
+    int32_t mask_table[14];
+  } avx;
+  // One float per constant rather than per-lane arrays.
+  struct {
+    float sixth;
+    float half;
+    float one;
+  } avx512;
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+  struct {
+    XNN_ALIGN(8) float sixth[2];
+    XNN_ALIGN(8) float three[2];
+    XNN_ALIGN(8) float six[2];
+  } wasmsimd;
+#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+};
+
+
+// LReLU (Leaky ReLU): used by VLRELU microkernels.
+
+// Parameters for the f16 LReLU microkernels. The union overlays one struct
+// per kernel variant; every variant is guarded by an architecture check, so
+// the dummy member keeps the union non-empty (and therefore valid standard C)
+// on targets that are neither ARM nor x86.
+union xnn_f16_lrelu_params {
+  char _; // Dummy member variable to comply with the C standard
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  // Slope stored as a single 16-bit word.
+  // NOTE(review): presumably an IEEE half-precision bit pattern - confirm in the init function.
+  struct {
+    uint16_t slope;
+  } neon;
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  // Slope kept as 32-bit floats (8 elements) even though the kernel family is f16.
+  struct {
+    XNN_ALIGN(32) float slope[8];
+  } avx;
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+};
+
+// Parameters for the f32 LReLU microkernels. The union overlays one struct
+// per kernel variant: `scalar` is always declared, the others only on the
+// matching architecture. Each variant holds just the slope.
+union xnn_f32_lrelu_params {
+  struct {
+    float slope;
+  } scalar;
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) float slope[4];
+  } sse;
+  // NOTE(review): mask_table is presumably the source of masks for partial
+  // vectors - confirm in the AVX kernels.
+  struct {
+    XNN_ALIGN(32) float slope[8];
+    int32_t mask_table[14];
+  } avx;
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+  struct {
+    XNN_ALIGN(8) float slope[2];
+  } wasmsimd;
+#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+};
+
+// Parameters for the qs8 LReLU microkernels. The union overlays one struct
+// per kernel variant: the two scalar variants are always declared, the others
+// only on the matching architecture. Two multiplier encodings appear:
+// {positive_multiplier, negative_multiplier} and
+// {multiplier_diff, multiplier_base}.
+union xnn_qs8_lrelu_params {
+  struct {
+    int32_t input_zero_point;
+    int32_t positive_multiplier;
+    int32_t negative_multiplier;
+    int32_t bias;
+  } scalar_select;
+  // Note the member order: multiplier_diff precedes multiplier_base here,
+  // the reverse of xnn_qu8_lrelu_params.scalar_andxor.
+  struct {
+    int32_t input_zero_point;
+    int32_t multiplier_diff;
+    int32_t multiplier_base;
+    int32_t bias;
+  } scalar_andxor;
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  // NOTE(review): zero point and multipliers are uint32_t here, presumably
+  // packed bit patterns - confirm in the init function.
+  struct {
+    uint32_t input_zero_point;
+    uint32_t positive_multiplier;
+    uint32_t negative_multiplier;
+    int32_t bias;
+  } armv6simd;
+  struct {
+    int16_t input_zero_point;
+    int16_t positive_multiplier;
+    int16_t negative_multiplier;
+    int16_t output_zero_point;
+  } neon;
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) int16_t input_zero_point[8];
+    XNN_ALIGN(16) int16_t multiplier_diff[8];
+    XNN_ALIGN(16) int16_t multiplier_base[8];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+  } sse2;
+  struct {
+    XNN_ALIGN(16) int16_t input_zero_point[8];
+    XNN_ALIGN(16) int16_t positive_multiplier[8];
+    XNN_ALIGN(16) int16_t negative_multiplier[8];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+  } avx;
+  struct {
+    XNN_ALIGN(32) int16_t input_zero_point[16];
+    XNN_ALIGN(32) int16_t positive_multiplier[16];
+    XNN_ALIGN(32) int16_t negative_multiplier[16];
+    XNN_ALIGN(32) int16_t output_zero_point[16];
+  } avx2;
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+  // Same field set as the x86 `avx` variant, with XNN_ALIGN(8) arrays.
+  struct {
+    XNN_ALIGN(8) int16_t input_zero_point[4];
+    XNN_ALIGN(8) int16_t positive_multiplier[4];
+    XNN_ALIGN(8) int16_t negative_multiplier[4];
+    XNN_ALIGN(8) int16_t output_zero_point[4];
+  } wasmsimd_arm;
+  // Same field set as the x86 `sse2` variant, with XNN_ALIGN(8) arrays.
+  struct {
+    XNN_ALIGN(8) int16_t input_zero_point[4];
+    XNN_ALIGN(8) int16_t multiplier_diff[4];
+    XNN_ALIGN(8) int16_t multiplier_base[4];
+    XNN_ALIGN(8) int16_t output_zero_point[4];
+  } wasmsimd_x86;
+#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+};
+
+// Parameters for the qu8 LReLU microkernels. The union overlays one struct
+// per kernel variant: the two scalar variants are always declared, the others
+// only on the matching architecture. Two multiplier encodings appear:
+// {positive_multiplier, negative_multiplier} and
+// {multiplier_diff, multiplier_base}.
+union xnn_qu8_lrelu_params {
+  struct {
+    int32_t input_zero_point;
+    int32_t positive_multiplier;
+    int32_t negative_multiplier;
+    int32_t bias;
+  } scalar_select;
+  // Note the member order: multiplier_base precedes multiplier_diff here,
+  // the reverse of xnn_qs8_lrelu_params.scalar_andxor.
+  struct {
+    int32_t input_zero_point;
+    int32_t multiplier_base;
+    int32_t multiplier_diff;
+    int32_t bias;
+  } scalar_andxor;
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  // NOTE(review): zero point and multipliers are uint32_t here, presumably
+  // packed bit patterns - confirm in the init function.
+  struct {
+    uint32_t input_zero_point;
+    uint32_t positive_multiplier;
+    uint32_t negative_multiplier;
+    int32_t bias;
+  } armv6simd;
+  // input_zero_point is unsigned here, while the x86 and wasmsimd variants
+  // below declare it as int16_t.
+  struct {
+    uint16_t input_zero_point;
+    int16_t positive_multiplier;
+    int16_t negative_multiplier;
+    int16_t output_zero_point;
+  } neon;
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) int16_t input_zero_point[8];
+    XNN_ALIGN(16) int16_t multiplier_diff[8];
+    XNN_ALIGN(16) int16_t multiplier_base[8];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+  } sse2;
+  struct {
+    XNN_ALIGN(16) int16_t input_zero_point[8];
+    XNN_ALIGN(16) int16_t positive_multiplier[8];
+    XNN_ALIGN(16) int16_t negative_multiplier[8];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+  } avx;
+  struct {
+    XNN_ALIGN(32) int16_t input_zero_point[16];
+    XNN_ALIGN(32) int16_t positive_multiplier[16];
+    XNN_ALIGN(32) int16_t negative_multiplier[16];
+    XNN_ALIGN(32) int16_t output_zero_point[16];
+  } avx2;
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+  // Same field set as the x86 `avx` variant, with XNN_ALIGN(8) arrays.
+  struct {
+    XNN_ALIGN(8) int16_t input_zero_point[4];
+    XNN_ALIGN(8) int16_t positive_multiplier[4];
+    XNN_ALIGN(8) int16_t negative_multiplier[4];
+    XNN_ALIGN(8) int16_t output_zero_point[4];
+  } wasmsimd_arm;
+  // Same field set as the x86 `sse2` variant, with XNN_ALIGN(8) arrays.
+  struct {
+    XNN_ALIGN(8) int16_t input_zero_point[4];
+    XNN_ALIGN(8) int16_t multiplier_diff[4];
+    XNN_ALIGN(8) int16_t multiplier_base[4];
+    XNN_ALIGN(8) int16_t output_zero_point[4];
+  } wasmsimd_x86;
+#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+};
+
+
+// Neg: used by VNEG microkernels.
+
+union xnn_f16_neg_params {  // F16 flavor of the Neg parameters; only x86 builds add a payload.
+  char _; // Dummy member: C does not allow a union with no members, and the #if below may be false
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) uint16_t sign_mask[8];  // 8 x 16-bit entries (16 bytes of data)
+  } sse;
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+};
+
+union xnn_f32_neg_params {  // F32 flavor of the Neg parameters; payloads exist only for x86 and WAsm SIMD builds.
+  char _; // Dummy member: C does not allow a union with no members, and both #if blocks below may be false
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) float sign_mask[4];
+  } sse;
+  struct {
+    XNN_ALIGN(32) float sign_mask[8];
+    int32_t mask_table[14];  // NOTE(review): role not shown in this header; confirm against the AVX kernels/init code
+  } avx;
+  struct {
+    uint32_t sign_mask;  // a single 32-bit integer here, not a float array as in sse/avx
+  } avx512;
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+  struct {
+    XNN_ALIGN(8) float sign_mask[2];
+  } wasmsimd;
+#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+};
+
+
+// Rnd (Round): used by VRNDNE/VRNDU/VRNDD/VRNDZ microkernels.
+
+union xnn_f16_rnd_params {  // F16 flavor of the Rnd parameters; no target defines any payload.
+  char _; // Dummy member: C does not allow a union with no members
+};
+
+union xnn_f32_rnd_params {  // F32 flavor of the Rnd parameters; only x86 builds add a payload.
+  char _; // Dummy member: C does not allow a union with no members, and the #if below may be false
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) float sign_mask[4];
+    XNN_ALIGN(16) float one[4];
+  } sse2;
+  struct {
+    int32_t mask_table[14];  // NOTE(review): role not shown in this header; confirm against the AVX kernels/init code
+  } avx;
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+};
+
+
+// Sigmoid: used by VSIGMOID microkernels.
+
+union xnn_f16_sigmoid_params {  // F16 flavor of the Sigmoid parameters; payloads exist only for ARM and x86 builds.
+  char _; // Dummy member: C does not allow a union with no members, and both #if blocks below may be false
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  struct {
+    uint16_t magic_bias;  // NOTE(review): uint16_t fields presumably carry half-precision bit patterns; confirm in init code
+    uint16_t minus_log2e;
+    uint16_t ln2_hi;
+    uint16_t ln2_lo;
+    uint16_t c2;
+    uint16_t c1;
+    uint16_t denorm_cutoff;
+  } neonfp16arith_rr2_p2;  // stores minus_log2e with ln2_hi/ln2_lo
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(32) float sign_mask[8];  // note: this f16 variant stores its constants as 32-bit floats
+    XNN_ALIGN(32) float magic_bias[8];
+    XNN_ALIGN(32) float log2e[8];
+    XNN_ALIGN(32) float minus_ln2[8];
+    XNN_ALIGN(32) float c2[8];
+    XNN_ALIGN(32) float c1[8];
+    XNN_ALIGN(32) float one[8];
+    XNN_ALIGN(32) float denorm_cutoff[8];
+  } avx2_rr1_p2;  // stores log2e with a single minus_ln2 (signs opposite to the NEON layout)
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+};
+
+union xnn_f32_sigmoid_params {  // F32 Sigmoid parameters. NOTE(review): member names appear to encode rrN = count of ln2 constants, lutN = table size, pN = highest cK coefficient (inferred from the field sets; confirm).
+  struct {
+    float magic_bias;
+    float minus_log2e;
+    float ln2_hi;
+    float ln2_lo;
+    float c1;
+    float one;
+    float denorm_cutoff;
+  } scalar_rr2_lut2048_p1;  // always compiled in
+  struct {
+    float magic_bias;
+    float minus_log2e;
+    float ln2_hi;
+    float ln2_lo;
+    float c2;
+    float one;
+    float denorm_cutoff;
+  } scalar_rr2_lut64_p2;  // always compiled in
+  struct {
+    float magic_bias;
+    float minus_log2e;
+    float ln2_hi;
+    float ln2_lo;
+    float c5;
+    float c4;
+    float c3;
+    float c2;
+    float c1;
+    float one;
+    float denorm_cutoff;
+  } scalar_rr2_p5;  // always compiled in
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  struct {
+    float magic_bias;
+    float minus_log2e;
+    float ln2_hi;
+    float ln2_lo;
+    float c1;
+    float denorm_cutoff;
+  } neon_rr2_lut2048_p1;  // same fields as scalar_rr2_lut2048_p1 minus `one`
+  struct {
+    float magic_bias;
+    float minus_log2e;
+    float ln2_hi;
+    float ln2_lo;
+    float c2;
+    float denorm_cutoff;
+  } neon_rr2_lut64_p2;  // same fields as scalar_rr2_lut64_p2 minus `one`
+  struct {
+    float magic_bias;
+    float minus_log2e;
+    float ln2_hi;
+    float ln2_lo;
+    float c5;
+    float c4;
+    float c3;
+    float c2;
+    float c1;
+    float denorm_cutoff;
+  } neon_rr2_p5;  // same fields as scalar_rr2_p5 minus `one`
+  struct {
+    float magic_bias;
+    float minus_log2e;
+    float ln2;  // single ln2 constant instead of the ln2_hi/ln2_lo pair
+    float c1;
+    float denorm_cutoff;
+  } neonfma_rr1_lut2048_p1;
+  struct {
+    float magic_bias;
+    float minus_log2e;
+    float ln2;  // single ln2 constant instead of the ln2_hi/ln2_lo pair
+    float c2;
+    float denorm_cutoff;
+  } neonfma_rr1_lut64_p2;
+  struct {
+    float magic_bias;
+    float minus_log2e;
+    float ln2;  // single ln2 constant instead of the ln2_hi/ln2_lo pair
+    float c5;
+    float c4;
+    float c3;
+    float c2;
+    float c1;
+    float denorm_cutoff;
+  } neonfma_rr1_p5;
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) float sign_mask[4];  // x86 layouts add sign_mask and store log2e/minus_ln2* rather than minus_log2e/ln2*
+    XNN_ALIGN(16) float magic_bias[4];
+    XNN_ALIGN(16) float log2e[4];
+    XNN_ALIGN(16) uint32_t index_mask[4];
+    XNN_ALIGN(16) float minus_ln2_hi[4];
+    XNN_ALIGN(16) float minus_ln2_lo[4];
+    XNN_ALIGN(16) float c2[4];
+    XNN_ALIGN(16) float one[4];
+    XNN_ALIGN(16) float denorm_cutoff[4];
+  } sse2_rr2_lut64_p2;
+  struct {
+    XNN_ALIGN(16) float sign_mask[4];
+    XNN_ALIGN(16) float magic_bias[4];
+    XNN_ALIGN(16) float log2e[4];
+    XNN_ALIGN(16) float minus_ln2_hi[4];
+    XNN_ALIGN(16) float minus_ln2_lo[4];
+    XNN_ALIGN(16) float c5[4];
+    XNN_ALIGN(16) float c4[4];
+    XNN_ALIGN(16) float c3[4];
+    XNN_ALIGN(16) float c2[4];
+    XNN_ALIGN(16) float c1[4];
+    XNN_ALIGN(16) float one[4];
+    XNN_ALIGN(16) float denorm_cutoff[4];
+  } sse2_rr2_p5;
+  struct {
+    XNN_ALIGN(32) float sign_mask[8];
+    XNN_ALIGN(32) float magic_bias[8];
+    XNN_ALIGN(32) float log2e[8];
+    XNN_ALIGN(32) float minus_ln2_hi[8];
+    XNN_ALIGN(32) float minus_ln2_lo[8];
+    XNN_ALIGN(32) float c5[8];
+    XNN_ALIGN(32) float c4[8];
+    XNN_ALIGN(32) float c3[8];
+    XNN_ALIGN(32) float c2[8];
+    XNN_ALIGN(32) float c1[8];
+    XNN_ALIGN(32) float one[8];
+    XNN_ALIGN(32) float two[8];  // only this layout carries a `two` constant
+    XNN_ALIGN(32) float denorm_cutoff[8];
+    int32_t mask_table[14];  // NOTE(review): role not shown in this header; confirm against the AVX kernels/init code
+  } avx_rr2_p5;
+  struct {
+    XNN_ALIGN(32) float sign_mask[8];
+    XNN_ALIGN(32) float magic_bias[8];
+    XNN_ALIGN(32) float log2e[8];
+    XNN_ALIGN(32) float minus_ln2[8];
+    XNN_ALIGN(32) float c5[8];
+    XNN_ALIGN(32) float c4[8];
+    XNN_ALIGN(32) float c3[8];
+    XNN_ALIGN(32) float c2[8];
+    XNN_ALIGN(32) float c1[8];
+    XNN_ALIGN(32) float one[8];
+    XNN_ALIGN(32) float denorm_cutoff[8];
+    int32_t mask_table[14];  // NOTE(review): role not shown in this header; confirm against the AVX2 kernels/init code
+  } avx2_rr1_p5;
+  struct {
+    uint32_t sign_mask;  // AVX512 layouts keep constants as scalars; only the tables are arrays
+    float magic_bias;
+    float log2e;
+    float minus_ln2;
+    float c3;
+    float c2;
+    float one;
+    XNN_ALIGN(64) float table[16];
+  } avx512_rr1_lut16_p3;
+  struct {
+    uint32_t sign_mask;
+    float magic_bias;
+    float log2e;
+    float minus_ln2_hi;
+    float minus_ln2_lo;
+    float c2;
+    float c1;
+    float one;
+    XNN_ALIGN(64) float table_lo[16];  // table_lo + table_hi: 2 x 16 entries
+    XNN_ALIGN(64) float table_hi[16];
+  } avx512_rr2_lut32_p2;
+  struct {
+    uint32_t sign_mask;
+    float log2e;  // note: this layout has no magic_bias or denorm_cutoff field
+    float minus_ln2;
+    float c5;
+    float c4;
+    float c3;
+    float c2;
+    float c1;
+    float one;
+  } avx512_rr1_p5;
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+  struct {
+    XNN_ALIGN(8) float magic_bias[2];  // WAsm layouts mirror the scalar field names, with 2 entries per field
+    XNN_ALIGN(8) float minus_log2e[2];
+    XNN_ALIGN(8) uint32_t index_mask[2];
+    XNN_ALIGN(8) float ln2_hi[2];
+    XNN_ALIGN(8) float ln2_lo[2];
+    XNN_ALIGN(8) float c2[2];
+    XNN_ALIGN(8) float one[2];
+    XNN_ALIGN(8) float denorm_cutoff[2];
+  } wasmsimd_rr2_lut64_p2;
+  struct {
+    XNN_ALIGN(8) float magic_bias[2];
+    XNN_ALIGN(8) float minus_log2e[2];
+    XNN_ALIGN(8) float ln2_hi[2];
+    XNN_ALIGN(8) float ln2_lo[2];
+    XNN_ALIGN(8) float c5[2];
+    XNN_ALIGN(8) float c4[2];
+    XNN_ALIGN(8) float c3[2];
+    XNN_ALIGN(8) float c2[2];
+    XNN_ALIGN(8) float c1[2];
+    XNN_ALIGN(8) float one[2];
+    XNN_ALIGN(8) float denorm_cutoff[2];
+  } wasmsimd_rr2_p5;
+#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+};
+
+
+// Sqrt (Square Root): used by VSQRT microkernels.
+
+union xnn_f16_sqrt_params {  // F16 flavor of the Sqrt parameters; no target defines any payload.
+  char _; // Dummy member: C does not allow a union with no members
+};
+
+union xnn_f32_sqrt_params {  // F32 flavor of the Sqrt parameters; only x86 builds add a payload.
+  char _; // Dummy member: C does not allow a union with no members, and the #if below may be false
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    int32_t mask_table[14];  // NOTE(review): role not shown in this header; confirm against the AVX kernels/init code
+  } avx;
+  struct {
+    XNN_ALIGN(32) float half[8];
+    int32_t mask_table[14];  // NOTE(review): same unexplained 14-entry table as in avx; confirm usage
+  } fma;
+  struct {
+    float half;  // scalar here, versus the 8-entry array in fma
+  } avx512;
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+};
+
+
+// CHW: used by CONV/DWCONV microkernels in CHW layout with Min+Max parameters.
+
+union xnn_f16_chw_params {  // F16 flavor of the CHW Min+Max parameters; only ARM builds add a payload.
+  char _; // Dummy member: C does not allow a union with no members, and the #if below may be false
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  struct {
+    uint16_t min;  // NOTE(review): uint16_t presumably carries a half-precision bit pattern; confirm in init code
+    uint16_t max;
+    XNN_ALIGN(8) uint16_t mask_even[4]; // used by stride 2 kernels
+    XNN_ALIGN(8) uint16_t mask_odd[4];  // used by stride 2 kernels
+    XNN_ALIGN(8) uint16_t mask[4]; // used by stride 1 kernels
+    XNN_ALIGN(16) uint16_t maskx8[8]; // used by stride 1 x8 kernels
+  } neonfp16arith;
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+};
+
+union xnn_f32_chw_params {  // F32 flavor of the CHW Min+Max parameters; `scalar` is always present, neon/sse are arch-gated.
+  struct {
+    XNN_ALIGN(16) int32_t mask_even[4]; // used by stride 2 kernels
+    XNN_ALIGN(16) int32_t mask_odd[4];  // used by stride 2 kernels
+    XNN_ALIGN(16) int32_t mask[4]; // used by stride 1 kernels
+    float min;
+    float max;
+  } scalar;  // note: masks come first and are signed int32_t, unlike the neon/sse layouts
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  struct {
+    float min;
+    float max;
+    XNN_ALIGN(16) uint32_t mask_even[4]; // used by stride 2 kernels
+    XNN_ALIGN(16) uint32_t mask_odd[4];  // used by stride 2 kernels
+    XNN_ALIGN(16) uint32_t mask[4]; // used by stride 1 kernels
+  } neon;  // scalar min/max followed by unsigned masks
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) float min[4];
+    XNN_ALIGN(16) float max[4];
+    XNN_ALIGN(16) uint32_t mask_even[4]; // used by stride 2 kernels
+    XNN_ALIGN(16) uint32_t mask_odd[4];  // used by stride 2 kernels
+    XNN_ALIGN(16) uint32_t mask[4]; // used by stride 1 kernels
+  } sse;  // min/max stored as 4-entry arrays rather than scalars
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+};
+
+
+// GAvgPool (Global Average Pool): used by GAVGPOOL microkernels in CHW layout with Scale+Min+Max parameters.
+
+union xnn_f16_gavgpool_params {  // F16 flavor of the GAvgPool parameters; only ARM builds add a payload.
+  char _; // Dummy member: C does not allow a union with no members, and the #if below may be false
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  struct {
+    XNN_ALIGN(16) uint16_t mask[8];
+    uint16_t multiplier;
+    uint16_t output_min;
+    uint16_t output_max;
+  } neonfp16arith;
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+};
+
+union xnn_f32_gavgpool_params {  // F32 flavor of the GAvgPool parameters; `scalar` is always present, sse/neon are arch-gated.
+  struct {
+    XNN_ALIGN(16) int32_t mask[4];  // signed here; the sse/neon layouts use uint32_t
+    float multiplier;
+    float output_min;
+    float output_max;
+  } scalar;
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  struct {
+    XNN_ALIGN(16) float multiplier[4];
+    XNN_ALIGN(16) float output_min[4];
+    XNN_ALIGN(16) float output_max[4];
+    XNN_ALIGN(16) uint32_t mask[4];  // note: mask is the last member here, but the first in scalar/neon
+  } sse;
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  struct {
+    XNN_ALIGN(16) uint32_t mask[4];
+    float multiplier;
+    float output_min;
+    float output_max;
+  } neon;
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+};
+
+// Forward declare for use in microkernel headers for JIT generator functions.
+struct xnn_code_buffer;
+
+// JIT GEMM: used by GEMM/IGEMM microkernel generators.
+
+struct jit_gemm_params {  // A struct rather than a union, and without the xnn_ prefix used by the other parameter types in this header.
+  struct {
+    float min;
+    float max;
+  } f32_minmax;  // the only member defined: a float min/max pair
+};
diff --git a/src/xnnpack/pad.h b/src/xnnpack/pad.h
index 835c38cc4..c4ea02578 100644
--- a/src/xnnpack/pad.h
+++ b/src/xnnpack/pad.h
@@ -8,7 +8,6 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
 
 #ifdef __cplusplus
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index 5216d2ca9..a596b9b24 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -14,2413 +14,9 @@
 
 #include <xnnpack.h>
 #include <xnnpack/common.h>
+#include <xnnpack/microparams.h>
 
 
-union xnn_f16_default_params {
-  // Empty; serves to differentiate pointer types for micro-kernels without fused activation.
-  char _; // Dummy member variable to comply with the C standard
-};
-
-// scaleminmax is used for avgpool ukernels.
-union xnn_f16_scaleminmax_params {
-  // Empty; serves to differentiate pointer types for micro-kernels without fused activation.
-  char _; // Dummy member variable to comply with the C standard
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    uint16_t scale;
-    uint16_t min;
-    uint16_t max;
-  } neon;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(32) float scale[8];
-    XNN_ALIGN(32) float min[8];
-    XNN_ALIGN(32) float max[8];
-  } avx;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-};
-
-union xnn_f16_minmax_params {
-  // Empty; serves to differentiate pointer types for micro-kernels without fused activation.
-  char _; // Dummy member variable to comply with the C standard
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    uint16_t min;
-    uint16_t max;
-  } neon;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(32) float min[8];
-    XNN_ALIGN(32) float max[8];
-  } avx;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-};
-
-union xnn_f32_default_params {
-  // Empty; serves to differentiate pointer types for micro-kernels without fused activation.
-  char _; // Dummy member variable to comply with the C standard
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    int32_t mask_table[14];
-  } avx;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-};
-
-union xnn_f32_relu_params {
-  // Empty; serves to differentiate pointer types for micro-kernels with different fused activations.
-  char _; // Dummy member variable to comply with the C standard
-};
-
-union xnn_f32_minmax_params {
-  struct {
-    float min;
-    float max;
-  } scalar;
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) float min[4];
-    XNN_ALIGN(16) float max[4];
-  } sse;
-  struct {
-    XNN_ALIGN(32) float min[8];
-    XNN_ALIGN(32) float max[8];
-    int32_t mask_table[14];
-  } avx;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  struct {
-    XNN_ALIGN(8) float min[2];
-    XNN_ALIGN(8) float max[2];
-  } wasmsimd;
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-};
-
-union xnn_f32_abs_params {
-  char _; // Dummy member variable to comply with the C standard
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) float nonsign_mask[4];
-  } sse;
-  struct {
-    XNN_ALIGN(32) float nonsign_mask[8];
-    int32_t mask_table[14];
-  } avx;
-  struct {
-    uint32_t nonsign_mask;
-  } avx512;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  struct {
-    XNN_ALIGN(8) float nonsign_mask[2];
-  } wasmsimd;
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-};
-
-union xnn_f32_neg_params {
-  char _; // Dummy member variable to comply with the C standard
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) float sign_mask[4];
-  } sse;
-  struct {
-    XNN_ALIGN(32) float sign_mask[8];
-    int32_t mask_table[14];
-  } avx;
-  struct {
-    uint32_t sign_mask;
-  } avx512;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  struct {
-    XNN_ALIGN(8) float sign_mask[2];
-  } wasmsimd;
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-};
-
-union xnn_f16_abs_params {
-  char _; // Dummy member variable to comply with the C standard
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) uint16_t nonsign_mask[8];
-  } sse;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-};
-
-union xnn_f16_neg_params {
-  char _; // Dummy member variable to comply with the C standard
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) uint16_t sign_mask[8];
-  } sse;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-};
-
-union xnn_f16_rnd_params {
-  char _; // Dummy member variable to comply with the C standard
-};
-
-union xnn_f32_rnd_params {
-  char _; // Dummy member variable to comply with the C standard
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) float sign_mask[4];
-    XNN_ALIGN(16) float one[4];
-  } sse2;
-  struct {
-    int32_t mask_table[14];
-  } avx;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-};
-
-union xnn_f16_elu_params {
-  char _; // Dummy member variable to comply with the C standard
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    uint16_t prescale;
-    uint16_t sat_cutoff;
-    uint16_t magic_bias;
-    uint16_t log2e;
-    uint16_t minus_ln2;
-    uint16_t c3;
-    uint16_t c2;
-    uint16_t minus_alpha;
-    uint16_t beta;
-  } neonfp16arith_rr1_p3;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(32) float prescale[8];
-    XNN_ALIGN(32) float sat_cutoff[8];
-    XNN_ALIGN(32) float magic_bias[8];
-    XNN_ALIGN(32) float log2e[8];
-    XNN_ALIGN(32) float minus_ln2[8];
-    XNN_ALIGN(32) float c3[8];
-    XNN_ALIGN(32) float c2[8];
-    XNN_ALIGN(32) float c1[8];
-    XNN_ALIGN(32) float alpha[8];
-    XNN_ALIGN(32) float beta[8];
-  } avx2_rr1_p3;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-};
-
-union xnn_f32_elu_params {
-  struct {
-    float prescale;
-    float alpha;
-    float beta;
-    float sat_cutoff;
-    float magic_bias;
-    float log2e;
-    float minus_ln2_hi;
-    float minus_ln2_lo;
-    float c3;
-    float c2;
-    float one;
-  } scalar_rr2_lut16_p3;
-  struct {
-    float prescale;
-    float alpha;
-    float beta;
-    float sat_cutoff;
-    float magic_bias;
-    float log2e;
-    float minus_ln2_hi;
-    float minus_ln2_lo;
-    float c6;
-    float c5;
-    float c4;
-    float c3;
-    float c2;
-    float one;
-  } scalar_rr2_p6;
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    float prescale;
-    float alpha;
-    float beta;
-    float sat_cutoff;
-    float magic_bias;
-    float log2e;
-    float minus_ln2_hi;
-    float minus_ln2_lo;
-    float c6;
-    float c5;
-    float c4;
-    float c3;
-    float c2;
-  } neon_rr2_p6;
-  struct {
-    float prescale;
-    float alpha;
-    float beta;
-    float sat_cutoff;
-    float magic_bias;
-    float log2e;
-    float minus_ln2_hi;
-    float minus_ln2_lo;
-    float c3;
-    float c2;
-  } neon_rr2_lut16_p3;
-  struct {
-    float prescale;
-    float alpha;
-    float beta;
-    float sat_cutoff;
-    float magic_bias;
-    float log2e;
-    float minus_ln2;
-    float c6;
-    float c5;
-    float c4;
-    float c3;
-    float c2;
-  } neonfma_rr1_p6;
-  struct {
-    float prescale;
-    float alpha;
-    float beta;
-    float sat_cutoff;
-    float magic_bias;
-    float log2e;
-    float minus_ln2;
-    float c3;
-    float c2;
-  } neonfma_rr1_lut16_p3;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) float prescale[4];
-    XNN_ALIGN(16) float alpha[4];
-    XNN_ALIGN(16) float beta[4];
-    XNN_ALIGN(16) float sat_cutoff[4];
-    XNN_ALIGN(16) float magic_bias[4];
-    XNN_ALIGN(16) float log2e[4];
-    XNN_ALIGN(16) uint32_t index_mask[4];
-    XNN_ALIGN(16) float minus_ln2_hi[4];
-    XNN_ALIGN(16) float minus_ln2_lo[4];
-    XNN_ALIGN(16) float c3[4];
-    XNN_ALIGN(16) float c2[4];
-    XNN_ALIGN(16) float one[4];
-  } sse2_rr2_lut16_p3;
-  struct {
-    XNN_ALIGN(16) float prescale[4];
-    XNN_ALIGN(16) float alpha[4];
-    XNN_ALIGN(16) float beta[4];
-    XNN_ALIGN(16) float sat_cutoff[4];
-    XNN_ALIGN(16) float magic_bias[4];
-    XNN_ALIGN(16) float log2e[4];
-    XNN_ALIGN(16) float minus_ln2_hi[4];
-    XNN_ALIGN(16) float minus_ln2_lo[4];
-    XNN_ALIGN(16) float c6[4];
-    XNN_ALIGN(16) float c5[4];
-    XNN_ALIGN(16) float c4[4];
-    XNN_ALIGN(16) float c3[4];
-    XNN_ALIGN(16) float c2[4];
-    XNN_ALIGN(16) float one[4];
-  } sse2_rr2_p6;
-  struct {
-    XNN_ALIGN(32) float prescale[8];
-    XNN_ALIGN(32) float alpha[8];
-    XNN_ALIGN(32) float beta[8];
-    XNN_ALIGN(32) float sat_cutoff[8];
-    XNN_ALIGN(32) float magic_bias[8];
-    XNN_ALIGN(32) float log2e[8];
-    XNN_ALIGN(32) uint32_t index_mask[8];
-    XNN_ALIGN(32) float minus_ln2_hi[8];
-    XNN_ALIGN(32) float minus_ln2_lo[8];
-    XNN_ALIGN(32) float c3[8];
-    XNN_ALIGN(32) float c2[8];
-    XNN_ALIGN(32) float one[8];
-    int32_t mask_table[14];
-  } avx_rr2_lut16_p3;
-  struct {
-    XNN_ALIGN(32) float prescale[8];
-    XNN_ALIGN(32) float alpha[8];
-    XNN_ALIGN(32) float beta[8];
-    XNN_ALIGN(32) float sat_cutoff[8];
-    XNN_ALIGN(32) float magic_bias[8];
-    XNN_ALIGN(32) float log2e[8];
-    XNN_ALIGN(32) uint32_t index_mask[8];
-    XNN_ALIGN(32) float table[8];
-    XNN_ALIGN(32) float minus_ln2_hi[8];
-    XNN_ALIGN(32) float minus_ln2_lo[8];
-    XNN_ALIGN(32) float c4[8];
-    XNN_ALIGN(32) float c3[8];
-    XNN_ALIGN(32) float c2[8];
-    XNN_ALIGN(32) float one[8];
-    int32_t mask_table[14];
-  } avx_rr2_lut4_p4;
-  struct {
-    XNN_ALIGN(32) float prescale[8];
-    XNN_ALIGN(32) float alpha[8];
-    XNN_ALIGN(32) float beta[8];
-    XNN_ALIGN(32) float sat_cutoff[8];
-    XNN_ALIGN(32) float magic_bias[8];
-    XNN_ALIGN(32) float log2e[8];
-    XNN_ALIGN(32) float minus_ln2_hi[8];
-    XNN_ALIGN(32) float minus_ln2_lo[8];
-    XNN_ALIGN(32) float c6[8];
-    XNN_ALIGN(32) float c5[8];
-    XNN_ALIGN(32) float c4[8];
-    XNN_ALIGN(32) float c3[8];
-    XNN_ALIGN(32) float c2[8];
-    XNN_ALIGN(32) float one[8];
-    int32_t mask_table[14];
-  } avx_rr2_p6;
-  struct {
-    XNN_ALIGN(32) float prescale[8];
-    XNN_ALIGN(32) float alpha[8];
-    XNN_ALIGN(32) float beta[8];
-    XNN_ALIGN(32) float sat_cutoff[8];
-    XNN_ALIGN(32) float magic_bias[8];
-    XNN_ALIGN(32) float log2e[8];
-    XNN_ALIGN(32) uint32_t index_mask[8];
-    XNN_ALIGN(32) float minus_ln2[8];
-    XNN_ALIGN(32) float c3[8];
-    XNN_ALIGN(32) float c2[8];
-    int32_t mask_table[14];
-  } avx2_rr1_lut16_p3;
-  struct {
-    XNN_ALIGN(32) float prescale[8];
-    XNN_ALIGN(32) float alpha[8];
-    XNN_ALIGN(32) float beta[8];
-    XNN_ALIGN(32) float sat_cutoff[8];
-    XNN_ALIGN(32) float magic_bias[8];
-    XNN_ALIGN(32) float log2e[8];
-    XNN_ALIGN(32) uint32_t table[8];
-    XNN_ALIGN(32) float minus_ln2[8];
-    XNN_ALIGN(32) float c4[8];
-    XNN_ALIGN(32) float c3[8];
-    XNN_ALIGN(32) float c2[8];
-    int32_t mask_table[14];
-  } avx2_rr1_lut8_p4;
-  struct {
-    XNN_ALIGN(32) float prescale[8];
-    XNN_ALIGN(32) float alpha[8];
-    XNN_ALIGN(32) float beta[8];
-    XNN_ALIGN(32) float sat_cutoff[8];
-    XNN_ALIGN(32) float magic_bias[8];
-    XNN_ALIGN(32) float log2e[8];
-    XNN_ALIGN(32) float table[8];
-    XNN_ALIGN(32) float minus_ln2[8];
-    XNN_ALIGN(32) float c4[8];
-    XNN_ALIGN(32) float c3[8];
-    XNN_ALIGN(32) float c2[8];
-    int32_t mask_table[14];
-  } avx2_rr1_lut4_p4;
-  struct {
-    XNN_ALIGN(32) float prescale[8];
-    XNN_ALIGN(32) float alpha[8];
-    XNN_ALIGN(32) float beta[8];
-    XNN_ALIGN(32) float sat_cutoff[8];
-    XNN_ALIGN(32) float magic_bias[8];
-    XNN_ALIGN(32) float log2e[8];
-    XNN_ALIGN(32) float minus_ln2[8];
-    XNN_ALIGN(32) float c6[8];
-    XNN_ALIGN(32) float c5[8];
-    XNN_ALIGN(32) float c4[8];
-    XNN_ALIGN(32) float c3[8];
-    XNN_ALIGN(32) float c2[8];
-    int32_t mask_table[14];
-  } avx2_rr1_p6;
-  struct {
-    float prescale;
-    float alpha;
-    float beta;
-    float sat_cutoff;
-    float magic_bias;
-    float log2e;
-    float minus_ln2;
-    float c3;
-    float c2;
-    XNN_ALIGN(64) uint32_t table[16];
-  } avx512_rr1_lut16_p3;
-  struct {
-    float prescale;
-    float alpha;
-    float beta;
-    float sat_cutoff;
-    float magic_bias;
-    float log2e;
-    float minus_ln2;
-    float c6;
-    float c5;
-    float c4;
-    float c3;
-    float c2;
-  } avx512_rr1_p6;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  struct {
-    XNN_ALIGN(8) float prescale[2];
-    XNN_ALIGN(8) float alpha[2];
-    XNN_ALIGN(8) float beta[2];
-    XNN_ALIGN(8) float sat_cutoff[2];
-    XNN_ALIGN(8) float magic_bias[2];
-    XNN_ALIGN(8) float log2e[2];
-    XNN_ALIGN(8) uint32_t index_mask[2];
-    XNN_ALIGN(8) float minus_ln2_hi[2];
-    XNN_ALIGN(8) float minus_ln2_lo[2];
-    XNN_ALIGN(8) float c3[2];
-    XNN_ALIGN(8) float c2[2];
-    XNN_ALIGN(8) float one[2];
-  } wasmsimd_rr2_lut16_p3;
-  struct {
-    XNN_ALIGN(8) float prescale[2];
-    XNN_ALIGN(8) float alpha[2];
-    XNN_ALIGN(8) float beta[2];
-    XNN_ALIGN(8) float sat_cutoff[2];
-    XNN_ALIGN(8) float magic_bias[2];
-    XNN_ALIGN(8) float log2e[2];
-    XNN_ALIGN(8) float minus_ln2_hi[2];
-    XNN_ALIGN(8) float minus_ln2_lo[2];
-    XNN_ALIGN(8) float c6[2];
-    XNN_ALIGN(8) float c5[2];
-    XNN_ALIGN(8) float c4[2];
-    XNN_ALIGN(8) float c3[2];
-    XNN_ALIGN(8) float c2[2];
-    XNN_ALIGN(8) float one[2];
-  } wasmsimd_rr2_p6;
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-};
-
-union xnn_f16_expminus_params {
-  char _; // Dummy member variable to comply with the C standard
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    uint16_t magic_bias;
-    uint16_t log2e;
-    uint16_t minus_ln2_hi;
-    uint16_t minus_ln2_lo;
-    uint16_t c2;
-    uint16_t c1;
-    uint16_t denorm_cutoff;
-  } neonfp16arith_rr2_p2;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(32) float magic_bias[8];
-    XNN_ALIGN(32) float log2e[8];
-    XNN_ALIGN(32) float minus_ln2[8];
-    XNN_ALIGN(32) float c2[8];
-    XNN_ALIGN(32) float c1[8];
-    XNN_ALIGN(32) float denorm_cutoff[8];
-  } avx2_rr1_p2;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-};
-
-union xnn_f32_expminus_params {
-  struct {
-    float log2e;
-    float magic_bias;
-    float minus_ln2_hi;
-    float minus_ln2_lo;
-    float c5;
-    float c4;
-    float c3;
-    float c2;
-    float c1;
-    float denorm_cutoff;
-  } scalar_rr2_p5;
-  struct {
-    float log2e;
-    float magic_bias;
-    float minus_ln2_hi;
-    float minus_ln2_lo;
-    float c2;
-    float denorm_cutoff;
-  } scalar_rr2_lut64_p2;
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    float log2e;
-    float magic_bias;
-    float minus_ln2_hi;
-    float minus_ln2_lo;
-    float c5;
-    float c4;
-    float c3;
-    float c2;
-    float c1;
-    float denorm_cutoff;
-  } neon_rr2_p5;
-  struct {
-    float log2e;
-    float magic_bias;
-    float minus_ln2_hi;
-    float minus_ln2_lo;
-    float c2;
-    float denorm_cutoff;
-  } neon_rr2_lut64_p2;
-  struct {
-    float log2e;
-    float magic_bias;
-    float minus_ln2;
-    float c5;
-    float c4;
-    float c3;
-    float c2;
-    float c1;
-    float denorm_cutoff;
-  } neonfma_rr1_p5;
-  struct {
-    float log2e;
-    float magic_bias;
-    float minus_ln2;
-    float c2;
-    float denorm_cutoff;
-  } neonfma_rr1_lut64_p2;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) float log2e[4];
-    XNN_ALIGN(16) float magic_bias[4];
-    XNN_ALIGN(16) float minus_ln2_hi[4];
-    XNN_ALIGN(16) float minus_ln2_lo[4];
-    XNN_ALIGN(16) float c5[4];
-    XNN_ALIGN(16) float c4[4];
-    XNN_ALIGN(16) float c3[4];
-    XNN_ALIGN(16) float c2[4];
-    XNN_ALIGN(16) float c1[4];
-    XNN_ALIGN(16) float denorm_cutoff[4];
-  } sse2_rr2_p5;
-  struct {
-    XNN_ALIGN(32) float log2e[8];
-    XNN_ALIGN(32) float magic_bias[8];
-    XNN_ALIGN(32) float minus_ln2[8];
-    XNN_ALIGN(32) float c5[8];
-    XNN_ALIGN(32) float c4[8];
-    XNN_ALIGN(32) float c3[8];
-    XNN_ALIGN(32) float c2[8];
-    XNN_ALIGN(32) float c1[8];
-    XNN_ALIGN(32) float denorm_cutoff[8];
-    int32_t mask_table[14];
-  } avx2_rr1_p5;
-  struct {
-    float log2e;
-    float minus_ln2;
-    float c5;
-    float c4;
-    float c3;
-    float c2;
-    float c1;
-    float c0;
-  } avx512_rr1_p5;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  struct {
-    XNN_ALIGN(8) float log2e[2];
-    XNN_ALIGN(8) float magic_bias[2];
-    XNN_ALIGN(8) float minus_ln2_hi[2];
-    XNN_ALIGN(8) float minus_ln2_lo[2];
-    XNN_ALIGN(8) float c5[2];
-    XNN_ALIGN(8) float c4[2];
-    XNN_ALIGN(8) float c3[2];
-    XNN_ALIGN(8) float c2[2];
-    XNN_ALIGN(8) float c1[2];
-    XNN_ALIGN(8) float denorm_cutoff[2];
-  } wasmsimd_rr2_p5;
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-};
-
-union xnn_f16_lrelu_params {
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    uint16_t slope;
-  } neon;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(32) float slope[8];
-  } avx;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-};
-
-union xnn_f32_lrelu_params {
-  struct {
-    float slope;
-  } scalar;
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) float slope[4];
-  } sse;
-  struct {
-    XNN_ALIGN(32) float slope[8];
-    int32_t mask_table[14];
-  } avx;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  struct {
-    XNN_ALIGN(8) float slope[2];
-  } wasmsimd;
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-};
-
-union xnn_f16_sigmoid_params {
-  char _; // Dummy member variable to comply with the C standard
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    uint16_t magic_bias;
-    uint16_t minus_log2e;
-    uint16_t ln2_hi;
-    uint16_t ln2_lo;
-    uint16_t c2;
-    uint16_t c1;
-    uint16_t denorm_cutoff;
-  } neonfp16arith_rr2_p2;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(32) float sign_mask[8];
-    XNN_ALIGN(32) float magic_bias[8];
-    XNN_ALIGN(32) float log2e[8];
-    XNN_ALIGN(32) float minus_ln2[8];
-    XNN_ALIGN(32) float c2[8];
-    XNN_ALIGN(32) float c1[8];
-    XNN_ALIGN(32) float one[8];
-    XNN_ALIGN(32) float denorm_cutoff[8];
-  } avx2_rr1_p2;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-};
-
-union xnn_f32_sigmoid_params {
-  struct {
-    float magic_bias;
-    float minus_log2e;
-    float ln2_hi;
-    float ln2_lo;
-    float c1;
-    float one;
-    float denorm_cutoff;
-  } scalar_rr2_lut2048_p1;
-  struct {
-    float magic_bias;
-    float minus_log2e;
-    float ln2_hi;
-    float ln2_lo;
-    float c2;
-    float one;
-    float denorm_cutoff;
-  } scalar_rr2_lut64_p2;
-  struct {
-    float magic_bias;
-    float minus_log2e;
-    float ln2_hi;
-    float ln2_lo;
-    float c5;
-    float c4;
-    float c3;
-    float c2;
-    float c1;
-    float one;
-    float denorm_cutoff;
-  } scalar_rr2_p5;
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    float magic_bias;
-    float minus_log2e;
-    float ln2_hi;
-    float ln2_lo;
-    float c1;
-    float denorm_cutoff;
-  } neon_rr2_lut2048_p1;
-  struct {
-    float magic_bias;
-    float minus_log2e;
-    float ln2_hi;
-    float ln2_lo;
-    float c2;
-    float denorm_cutoff;
-  } neon_rr2_lut64_p2;
-  struct {
-    float magic_bias;
-    float minus_log2e;
-    float ln2_hi;
-    float ln2_lo;
-    float c5;
-    float c4;
-    float c3;
-    float c2;
-    float c1;
-    float denorm_cutoff;
-  } neon_rr2_p5;
-  struct {
-    float magic_bias;
-    float minus_log2e;
-    float ln2;
-    float c1;
-    float denorm_cutoff;
-  } neonfma_rr1_lut2048_p1;
-  struct {
-    float magic_bias;
-    float minus_log2e;
-    float ln2;
-    float c2;
-    float denorm_cutoff;
-  } neonfma_rr1_lut64_p2;
-  struct {
-    float magic_bias;
-    float minus_log2e;
-    float ln2;
-    float c5;
-    float c4;
-    float c3;
-    float c2;
-    float c1;
-    float denorm_cutoff;
-  } neonfma_rr1_p5;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) float sign_mask[4];
-    XNN_ALIGN(16) float magic_bias[4];
-    XNN_ALIGN(16) float log2e[4];
-    XNN_ALIGN(16) uint32_t index_mask[4];
-    XNN_ALIGN(16) float minus_ln2_hi[4];
-    XNN_ALIGN(16) float minus_ln2_lo[4];
-    XNN_ALIGN(16) float c2[4];
-    XNN_ALIGN(16) float one[4];
-    XNN_ALIGN(16) float denorm_cutoff[4];
-  } sse2_rr2_lut64_p2;
-  struct {
-    XNN_ALIGN(16) float sign_mask[4];
-    XNN_ALIGN(16) float magic_bias[4];
-    XNN_ALIGN(16) float log2e[4];
-    XNN_ALIGN(16) float minus_ln2_hi[4];
-    XNN_ALIGN(16) float minus_ln2_lo[4];
-    XNN_ALIGN(16) float c5[4];
-    XNN_ALIGN(16) float c4[4];
-    XNN_ALIGN(16) float c3[4];
-    XNN_ALIGN(16) float c2[4];
-    XNN_ALIGN(16) float c1[4];
-    XNN_ALIGN(16) float one[4];
-    XNN_ALIGN(16) float denorm_cutoff[4];
-  } sse2_rr2_p5;
-  struct {
-    XNN_ALIGN(32) float sign_mask[8];
-    XNN_ALIGN(32) float magic_bias[8];
-    XNN_ALIGN(32) float log2e[8];
-    XNN_ALIGN(32) float minus_ln2_hi[8];
-    XNN_ALIGN(32) float minus_ln2_lo[8];
-    XNN_ALIGN(32) float c5[8];
-    XNN_ALIGN(32) float c4[8];
-    XNN_ALIGN(32) float c3[8];
-    XNN_ALIGN(32) float c2[8];
-    XNN_ALIGN(32) float c1[8];
-    XNN_ALIGN(32) float one[8];
-    XNN_ALIGN(32) float two[8];
-    XNN_ALIGN(32) float denorm_cutoff[8];
-    int32_t mask_table[14];
-  } avx_rr2_p5;
-  struct {
-    XNN_ALIGN(32) float sign_mask[8];
-    XNN_ALIGN(32) float magic_bias[8];
-    XNN_ALIGN(32) float log2e[8];
-    XNN_ALIGN(32) float minus_ln2[8];
-    XNN_ALIGN(32) float c5[8];
-    XNN_ALIGN(32) float c4[8];
-    XNN_ALIGN(32) float c3[8];
-    XNN_ALIGN(32) float c2[8];
-    XNN_ALIGN(32) float c1[8];
-    XNN_ALIGN(32) float one[8];
-    XNN_ALIGN(32) float denorm_cutoff[8];
-    int32_t mask_table[14];
-  } avx2_rr1_p5;
-  struct {
-    uint32_t sign_mask;
-    float magic_bias;
-    float log2e;
-    float minus_ln2;
-    float c3;
-    float c2;
-    float one;
-    XNN_ALIGN(64) float table[16];
-  } avx512_rr1_lut16_p3;
-  struct {
-    uint32_t sign_mask;
-    float magic_bias;
-    float log2e;
-    float minus_ln2_hi;
-    float minus_ln2_lo;
-    float c2;
-    float c1;
-    float one;
-    XNN_ALIGN(64) float table_lo[16];
-    XNN_ALIGN(64) float table_hi[16];
-  } avx512_rr2_lut32_p2;
-  struct {
-    uint32_t sign_mask;
-    float log2e;
-    float minus_ln2;
-    float c5;
-    float c4;
-    float c3;
-    float c2;
-    float c1;
-    float one;
-  } avx512_rr1_p5;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  struct {
-    XNN_ALIGN(8) float magic_bias[2];
-    XNN_ALIGN(8) float minus_log2e[2];
-    XNN_ALIGN(8) uint32_t index_mask[2];
-    XNN_ALIGN(8) float ln2_hi[2];
-    XNN_ALIGN(8) float ln2_lo[2];
-    XNN_ALIGN(8) float c2[2];
-    XNN_ALIGN(8) float one[2];
-    XNN_ALIGN(8) float denorm_cutoff[2];
-  } wasmsimd_rr2_lut64_p2;
-  struct {
-    XNN_ALIGN(8) float magic_bias[2];
-    XNN_ALIGN(8) float minus_log2e[2];
-    XNN_ALIGN(8) float ln2_hi[2];
-    XNN_ALIGN(8) float ln2_lo[2];
-    XNN_ALIGN(8) float c5[2];
-    XNN_ALIGN(8) float c4[2];
-    XNN_ALIGN(8) float c3[2];
-    XNN_ALIGN(8) float c2[2];
-    XNN_ALIGN(8) float c1[2];
-    XNN_ALIGN(8) float one[2];
-    XNN_ALIGN(8) float denorm_cutoff[2];
-  } wasmsimd_rr2_p5;
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-};
-
-union xnn_f16_sqrt_params {
-  char _; // Dummy member variable to comply with the C standard
-};
-
-union xnn_f32_sqrt_params {
-  char _; // Dummy member variable to comply with the C standard
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    int32_t mask_table[14];
-  } avx;
-  struct {
-    XNN_ALIGN(32) float half[8];
-    int32_t mask_table[14];
-  } fma;
-  struct {
-    float half;
-  } avx512;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-};
-
-union xnn_f32_chw_params {
-  struct {
-    XNN_ALIGN(16) int32_t mask_even[4]; // used by stride 2 kernels
-    XNN_ALIGN(16) int32_t mask_odd[4];  // used by stride 2 kernels
-    XNN_ALIGN(16) int32_t mask[4]; // used by stride 1 kernels
-    float min;
-    float max;
-  } scalar;
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    float min;
-    float max;
-    XNN_ALIGN(16) uint32_t mask_even[4]; // used by stride 2 kernels
-    XNN_ALIGN(16) uint32_t mask_odd[4];  // used by stride 2 kernels
-    XNN_ALIGN(16) uint32_t mask[4]; // used by stride 1 kernels
-  } neon;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) float min[4];
-    XNN_ALIGN(16) float max[4];
-    XNN_ALIGN(16) uint32_t mask_even[4]; // used by stride 2 kernels
-    XNN_ALIGN(16) uint32_t mask_odd[4];  // used by stride 2 kernels
-    XNN_ALIGN(16) uint32_t mask[4]; // used by stride 1 kernels
-  } sse;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-};
-
-union xnn_f16_chw_params {
-  char _; // Dummy member variable to comply with the C standard
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    uint16_t min;
-    uint16_t max;
-    XNN_ALIGN(8) uint16_t mask_even[4]; // used by stride 2 kernels
-    XNN_ALIGN(8) uint16_t mask_odd[4];  // used by stride 2 kernels
-    XNN_ALIGN(8) uint16_t mask[4]; // used by stride 1 kernels
-    XNN_ALIGN(16) uint16_t maskx8[8]; // used by stride 1 x8 kernels
-  } neonfp16arith;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-};
-
-union xnn_s8_minmax_params {
-  struct {
-    int32_t min;
-    int32_t max;
-  } scalar;
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) uint8_t bias[16];
-    XNN_ALIGN(16) uint8_t min_with_bias[16];
-    XNN_ALIGN(16) uint8_t max_with_bias[16];
-  } sse2;
-  struct {
-    XNN_ALIGN(16) int8_t min[16];
-    XNN_ALIGN(16) int8_t max[16];
-  } sse4;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    int8_t min;
-    int8_t max;
-  } neon;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  struct {
-    XNN_ALIGN(8) int8_t min[8];
-    XNN_ALIGN(8) int8_t max[8];
-  } wasmsimd;
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-};
-
-union xnn_u8_minmax_params {
-  struct {
-    uint32_t min;
-    uint32_t max;
-  } scalar;
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) uint8_t min[16];
-    XNN_ALIGN(16) uint8_t max[16];
-  } sse2;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    uint8_t min;
-    uint8_t max;
-  } neon;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  struct {
-    XNN_ALIGN(8) uint8_t min[8];
-    XNN_ALIGN(8) uint8_t max[8];
-  } wasmsimd;
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-};
-
-union xnn_f32_scaleminmax_params {
-  struct {
-    float scale;
-    float min;
-    float max;
-  } scalar;
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) float scale[4];
-    XNN_ALIGN(16) float min[4];
-    XNN_ALIGN(16) float max[4];
-  } sse;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-};
-
-union xnn_f32_gavgpool_params {
-  struct {
-    XNN_ALIGN(16) int32_t mask[4];
-    float multiplier;
-    float output_min;
-    float output_max;
-  } scalar;
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) float multiplier[4];
-    XNN_ALIGN(16) float output_min[4];
-    XNN_ALIGN(16) float output_max[4];
-    XNN_ALIGN(16) uint32_t mask[4];
-  } sse;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    XNN_ALIGN(16) uint32_t mask[4];
-    float multiplier;
-    float output_min;
-    float output_max;
-  } neon;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64 */
-};
-
-union xnn_f16_gavgpool_params {
-  // Empty; serves to differentiate pointer types for micro-kernels without fused activation.
-  char _; // Dummy member variable to comply with the C standard
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    XNN_ALIGN(16) uint16_t mask[8];
-    uint16_t multiplier;
-    uint16_t output_min;
-    uint16_t output_max;
-  } neonfp16arith;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64 */
-};
-
-union xnn_f16_hswish_params {
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    uint16_t sixth;
-    uint16_t three;
-    uint16_t six;
-    uint16_t pad;  // pad to 8 bytes for neonfp16arith assembly.
-  } neon;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64 */
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(32) float sixth[8];
-    XNN_ALIGN(32) float three[8];
-    XNN_ALIGN(16) uint16_t six[8];
-  } avx;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-};
-
-union xnn_f32_hswish_params {
-  struct {
-    float sixth;
-    float three;
-    float six;
-  } scalar;
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) float sixth[4];
-    XNN_ALIGN(16) float half[4];
-    XNN_ALIGN(16) float one[4];
-  } sse;
-  struct {
-    XNN_ALIGN(32) float sixth[8];
-    XNN_ALIGN(32) float half[8];
-    XNN_ALIGN(32) float one[8];
-    int32_t mask_table[14];
-  } avx;
-  struct {
-    float sixth;
-    float half;
-    float one;
-  } avx512;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  struct {
-    XNN_ALIGN(8) float sixth[2];
-    XNN_ALIGN(8) float three[2];
-    XNN_ALIGN(8) float six[2];
-  } wasmsimd;
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-};
-
-union xnn_qu8_conv_minmax_params {
-  struct {
-    int32_t kernel_zero_point;
-    float scale;
-    float output_min_less_zero_point;
-    float output_max_less_zero_point;
-    float magic_bias;
-    int32_t magic_bias_less_output_zero_point;
-  } fp32_scalar_fmagic;
-  struct {
-    int32_t kernel_zero_point;
-    float scale;
-    float magic_bias;
-    int32_t magic_min;
-    int32_t magic_max;
-    int32_t magic_bias_less_zero_point;
-  } fp32_scalar_imagic;
-  struct {
-    int32_t kernel_zero_point;
-    float scale;
-    float output_min_less_zero_point;
-    float output_max_less_zero_point;
-    int32_t output_zero_point;
-  } fp32_scalar_lrintf;
-#if XNN_ARCH_ARM
-  struct {
-    float scale;
-    float magic_bias;
-    uint32_t minus_kernel_zero_point;
-    int32_t magic_bias_less_zero_point;
-    uint32_t output_min;
-    uint32_t output_max;
-  } fp32_armv6simd;
-#endif  // XNN_ARCH_ARM
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    uint8_t kernel_zero_point[4];
-    float scale;
-    float magic_bias;
-    int32_t magic_bias_less_output_zero_point;
-    uint8_t output_min;
-    uint8_t output_max;
-  } fp32_neon;
-  struct {
-    uint8_t kernel_zero_point[4];
-    float scale;
-    int16_t output_zero_point;
-    uint8_t output_min;
-    uint8_t output_max;
-  } fp32_neonv8;
-  struct {
-    uint8_t kernel_zero_point[4];
-    int32_t right_pre_shift;
-    int32_t multiplier;
-    int32_t right_post_shift;
-    int16_t output_zero_point;
-    uint8_t output_min;
-    uint8_t output_max;
-  } rndnu_neon;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) int16_t kernel_zero_point[8];
-    XNN_ALIGN(16) float scale[4];
-    XNN_ALIGN(16) float output_max_less_zero_point[4];
-    XNN_ALIGN(16) int16_t output_zero_point[8];
-    XNN_ALIGN(16) uint8_t output_min[16];
-  } fp32_sse2;
-  struct {
-    XNN_ALIGN(32) int16_t kernel_zero_point[16];
-    XNN_ALIGN(32) float scale[8];
-    XNN_ALIGN(32) float output_max_less_zero_point[8];
-    XNN_ALIGN(32) int16_t output_zero_point[16];
-    XNN_ALIGN(32) uint8_t output_min[32];
-  } fp32_avx2;
-  struct {
-    XNN_ALIGN(64) int16_t kernel_zero_point[32];
-    XNN_ALIGN(64) float scale[16];
-    XNN_ALIGN(64) float output_max_less_zero_point[16];
-    XNN_ALIGN(64) int16_t output_zero_point[32];
-    XNN_ALIGN(64) uint8_t output_min[64];
-  } fp32_avx512;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  struct {
-    XNN_ALIGN(8) int16_t kernel_zero_point[4];
-    XNN_ALIGN(8) float scale[2];
-    XNN_ALIGN(8) float magic_bias[2];
-    XNN_ALIGN(8) int32_t magic_min[2];
-    XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
-    XNN_ALIGN(8) int8_t output_max[8];
-  } fp32_wasmsimd;
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-};
-
-union xnn_qs8_minmax_params {
-  struct {
-    float magic_bias;
-    int32_t magic_min;
-    int32_t magic_max;
-    int32_t magic_bias_less_zero_point;
-  } scalar_imagic;
-  struct {
-    float output_min_less_zero_point;
-    float output_max_less_zero_point;
-    float magic_bias;
-    int32_t magic_bias_less_output_zero_point;
-  } scalar_fmagic;
-  struct {
-    float output_min_less_zero_point;
-    float output_max_less_zero_point;
-    int32_t output_zero_point;
-  } scalar_lrintf;
-#if XNN_ARCH_ARM
-  struct {
-    float magic_bias;
-    int32_t magic_bias_less_zero_point;
-    uint32_t output_min;
-    uint32_t output_max;
-  } armv6simd;
-#endif  // XNN_ARCH_ARM
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    float magic_bias;
-    int32_t magic_bias_less_output_zero_point;
-    int8_t output_min;
-    int8_t output_max;
-  } neon;
-  struct {
-    int16_t output_zero_point;
-    uint8_t output_min;
-    uint8_t output_max;
-  } neonv8;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) float output_max_less_zero_point[4];
-    XNN_ALIGN(16) int16_t output_zero_point[8];
-    XNN_ALIGN(16) int16_t output_min[8];
-  } sse2;
-  struct {
-    XNN_ALIGN(16) float output_max_less_zero_point[4];
-    XNN_ALIGN(16) int16_t output_zero_point[8];
-    XNN_ALIGN(16) int8_t output_min[16];
-  } sse4;
-  struct {
-    XNN_ALIGN(32) float output_max_less_zero_point[8];
-    XNN_ALIGN(32) int16_t output_zero_point[16];
-    XNN_ALIGN(32) int8_t output_min[32];
-  } avx2;
-  struct {
-    XNN_ALIGN(64) float output_max_less_zero_point[16];
-    XNN_ALIGN(64) int16_t output_zero_point[32];
-    XNN_ALIGN(64) int8_t output_min[64];
-  } avx512;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  struct {
-    XNN_ALIGN(8) float magic_bias[2];
-    XNN_ALIGN(8) int32_t magic_min[2];
-    XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
-    XNN_ALIGN(8) int8_t output_max[8];
-  } wasmsimd;
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-};
-
-union xnn_qs8_conv_minmax_params {
-  struct {
-    float scale;
-    float output_min_less_zero_point;
-    float output_max_less_zero_point;
-    float magic_bias;
-    int32_t magic_bias_less_output_zero_point;
-  } fp32_scalar_fmagic;
-  struct {
-    float scale;
-    float magic_bias;
-    int32_t magic_min;
-    int32_t magic_max;
-    int32_t magic_bias_less_zero_point;
-  } fp32_scalar_imagic;
-  struct {
-    float scale;
-    float output_min_less_zero_point;
-    float output_max_less_zero_point;
-    int32_t output_zero_point;
-  } fp32_scalar_lrintf;
-#if XNN_ARCH_ARM
-  struct {
-    float scale;
-    float magic_bias;
-    int32_t magic_bias_less_zero_point;
-    uint32_t output_min;
-    uint32_t output_max;
-  } fp32_armv6simd;
-#endif  // XNN_ARCH_ARM
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    float scale;
-    float magic_bias;
-    int32_t magic_bias_less_output_zero_point;
-    int8_t output_min;
-    int8_t output_max;
-  } fp32_neon;
-  struct {
-    float scale;
-    int16_t output_zero_point;
-    int8_t output_min;
-    int8_t output_max;
-  } fp32_neonv8;
-  struct {
-    int32_t right_pre_shift;
-    int32_t multiplier;
-    int32_t right_post_shift;
-    int16_t output_zero_point;
-    int8_t output_min;
-    int8_t output_max;
-  } rndnu_neon;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) float scale[4];
-    XNN_ALIGN(16) float output_max_less_zero_point[4];
-    XNN_ALIGN(16) int16_t output_zero_point[8];
-    XNN_ALIGN(16) int16_t output_min[8];
-  } fp32_sse2;
-  struct {
-    XNN_ALIGN(16) float scale[4];
-    XNN_ALIGN(16) float output_max_less_zero_point[4];
-    XNN_ALIGN(16) int16_t output_zero_point[8];
-    XNN_ALIGN(16) int8_t output_min[16];
-  } fp32_sse4;
-  struct {
-    XNN_ALIGN(32) float scale[8];
-    XNN_ALIGN(32) float output_max_less_zero_point[8];
-    XNN_ALIGN(32) int16_t output_zero_point[16];
-    XNN_ALIGN(32) int8_t output_min[32];
-  } fp32_avx2;
-  struct {
-    XNN_ALIGN(64) float scale[16];
-    XNN_ALIGN(64) float output_max_less_zero_point[16];
-    XNN_ALIGN(64) int16_t output_zero_point[32];
-    XNN_ALIGN(64) int8_t output_min[64];
-  } fp32_avx512;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  struct {
-    XNN_ALIGN(8) float scale[2];
-    XNN_ALIGN(8) float magic_bias[2];
-    XNN_ALIGN(8) int32_t magic_min[2];
-    XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
-    XNN_ALIGN(8) int8_t output_max[8];
-  } fp32_wasmsimd;
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-};
-
-union xnn_qu8_addsub_minmax_params {
-  struct {
-    int32_t bias;
-    int32_t a_multiplier;
-    int32_t b_multiplier;
-    int32_t rounding;
-    uint32_t shift;
-    int32_t output_min_less_zero_point;
-    int32_t output_max_less_zero_point;
-    int32_t output_zero_point;
-  } scalar;
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    uint8_t a_zero_point;
-    uint8_t b_zero_point;
-    int16_t output_zero_point;
-    int32_t a_multiplier;
-    int32_t b_multiplier;
-    int32_t right_shift;
-    uint8_t output_min;
-    uint8_t output_max;
-  } neon;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) int32_t bias[4];
-    XNN_ALIGN(16) uint16_t a_multiplier_lo[8];
-    XNN_ALIGN(16) uint16_t a_multiplier_hi[8];
-    XNN_ALIGN(16) uint16_t b_multiplier_lo[8];
-    XNN_ALIGN(16) uint16_t b_multiplier_hi[8];
-    uint32_t shift;
-    uint32_t b_multiplier;
-    XNN_ALIGN(16) int16_t output_zero_point[8];
-    XNN_ALIGN(16) uint8_t output_min[16];
-    XNN_ALIGN(16) uint8_t output_max[16];
-  } sse2;
-  struct {
-    XNN_ALIGN(16) int32_t bias[4];
-    XNN_ALIGN(16) int32_t a_multiplier[4];
-    XNN_ALIGN(16) int32_t b_multiplier[4];
-    XNN_ALIGN(16) uint64_t shift[2];
-    XNN_ALIGN(16) int16_t output_zero_point[8];
-    XNN_ALIGN(16) uint8_t output_min[16];
-    XNN_ALIGN(16) uint8_t output_max[16];
-  } sse4;
-  struct {
-    XNN_ALIGN(32) int32_t bias[8];
-    XNN_ALIGN(32) int32_t a_multiplier[8];
-    XNN_ALIGN(32) int32_t b_multiplier[8];
-    XNN_ALIGN(32) uint64_t shift[4];
-    XNN_ALIGN(32) int16_t output_zero_point[16];
-    XNN_ALIGN(16) uint8_t output_min[16];
-    XNN_ALIGN(16) uint8_t output_max[16];
-  } avx2;
-  struct {
-    XNN_ALIGN(64) int32_t bias[16];
-    XNN_ALIGN(64) int32_t a_multiplier[16];
-    XNN_ALIGN(64) int32_t b_multiplier[16];
-    XNN_ALIGN(64) uint64_t shift[8];
-    XNN_ALIGN(64) int16_t output_zero_point[32];
-    XNN_ALIGN(32) uint8_t output_min[32];
-    XNN_ALIGN(32) uint8_t output_max[32];
-  } avx512;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  struct {
-    XNN_ALIGN(8) int32_t bias[2];
-    XNN_ALIGN(8) int32_t a_multiplier[2];
-    XNN_ALIGN(8) int32_t b_multiplier[2];
-    uint32_t shift;
-    XNN_ALIGN(8) int16_t output_zero_point[4];
-    XNN_ALIGN(8) uint8_t output_min[8];
-    XNN_ALIGN(8) uint8_t output_max[8];
-  } wasmsimd;
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-};
-
-union xnn_qs8_addsub_minmax_params {
-  struct {
-    int32_t bias;
-    int32_t a_multiplier;
-    int32_t b_multiplier;
-    uint32_t shift;
-    int32_t output_min_less_zero_point;
-    int32_t output_max_less_zero_point;
-    int32_t output_zero_point;
-  } scalar;
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    int8_t a_zero_point;
-    int8_t b_zero_point;
-    int16_t output_zero_point;
-    int32_t a_multiplier;
-    int32_t b_multiplier;
-    int32_t right_shift;
-    int8_t output_min;
-    int8_t output_max;
-  } neon;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) int32_t bias[4];
-    XNN_ALIGN(16) uint16_t a_multiplier_lo[8];
-    XNN_ALIGN(16) uint16_t a_multiplier_hi[8];
-    XNN_ALIGN(16) uint16_t b_multiplier_lo[8];
-    XNN_ALIGN(16) uint16_t b_multiplier_hi[8];
-    uint32_t shift;
-    uint32_t b_multiplier;
-    XNN_ALIGN(16) int16_t output_zero_point[8];
-    XNN_ALIGN(16) int16_t output_min[8];
-    XNN_ALIGN(16) int16_t output_max[8];
-  } sse2;
-  struct {
-    XNN_ALIGN(16) int32_t bias[4];
-    XNN_ALIGN(16) uint16_t a_multiplier_lo[8];
-    XNN_ALIGN(16) uint16_t a_multiplier_hi[8];
-    XNN_ALIGN(16) uint16_t b_multiplier_lo[8];
-    XNN_ALIGN(16) uint16_t b_multiplier_hi[8];
-    uint32_t shift;
-    uint32_t b_multiplier;
-    XNN_ALIGN(16) int16_t output_zero_point[8];
-    XNN_ALIGN(16) int8_t output_min[16];
-    XNN_ALIGN(16) int8_t output_max[16];
-  } sse4_mul16;
-  struct {
-    XNN_ALIGN(16) int32_t bias[4];
-    XNN_ALIGN(16) int32_t a_multiplier[4];
-    XNN_ALIGN(16) int32_t b_multiplier[4];
-    XNN_ALIGN(16) uint64_t shift[2];
-    XNN_ALIGN(16) int16_t output_zero_point[8];
-    XNN_ALIGN(16) int8_t output_min[16];
-    XNN_ALIGN(16) int8_t output_max[16];
-  } sse4_mul32;
-  struct {
-    XNN_ALIGN(32) int32_t bias[8];
-    XNN_ALIGN(32) int32_t a_multiplier[8];
-    XNN_ALIGN(32) int32_t b_multiplier[8];
-    XNN_ALIGN(32) uint64_t shift[4];
-    XNN_ALIGN(32) int16_t output_zero_point[16];
-    XNN_ALIGN(16) int8_t output_min[16];
-    XNN_ALIGN(16) int8_t output_max[16];
-  } avx2;
-  struct {
-    XNN_ALIGN(64) int32_t bias[16];
-    XNN_ALIGN(64) int32_t a_multiplier[16];
-    XNN_ALIGN(64) int32_t b_multiplier[16];
-    XNN_ALIGN(64) uint64_t shift[8];
-    XNN_ALIGN(64) int16_t output_zero_point[32];
-    XNN_ALIGN(32) int8_t output_min[32];
-    XNN_ALIGN(32) int8_t output_max[32];
-  } avx512;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  struct {
-    XNN_ALIGN(8) int32_t bias[2];
-    XNN_ALIGN(8) int32_t a_multiplier[2];
-    XNN_ALIGN(8) int32_t b_multiplier[2];
-    uint32_t shift;
-    XNN_ALIGN(8) int16_t output_zero_point[4];
-    XNN_ALIGN(8) int8_t output_min[8];
-    XNN_ALIGN(8) int8_t output_max[8];
-  } wasmsimd;
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-};
-
-union xnn_qu8_mul_minmax_params {
-  struct {
-    int32_t a_zero_point;
-    int32_t b_zero_point;
-    float scale;
-    float output_min_less_zero_point;
-    float output_max_less_zero_point;
-    float magic_bias;
-    int32_t magic_bias_less_output_zero_point;
-  } fp32_scalar;
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    uint8_t a_zero_point[2];
-    uint8_t b_zero_point[2];
-    float scale;
-    float magic_bias;
-    int32_t magic_bias_less_output_zero_point;
-    uint8_t output_min;
-    uint8_t output_max;
-  } fp32_neon;
-  struct {
-    uint8_t a_zero_point[2];
-    uint8_t b_zero_point[2];
-    float scale;
-    int16_t output_zero_point;
-    uint8_t output_min;
-    uint8_t output_max;
-  } fp32_neonv8;
-  struct {
-    uint8_t a_zero_point[2];
-    uint8_t b_zero_point[2];
-    int32_t left_pre_shift;
-    int32_t multiplier;
-    int32_t left_post_shift;
-    int16_t output_zero_point;
-    uint8_t output_min;
-    uint8_t output_max;
-  } rndnu_neon;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) int16_t a_zero_point[8];
-    XNN_ALIGN(16) int16_t b_zero_point[8];
-    XNN_ALIGN(16) float scale[4];
-    XNN_ALIGN(16) int16_t output_zero_point[8];
-    XNN_ALIGN(16) uint8_t output_min[16];
-    XNN_ALIGN(16) uint8_t output_max[16];
-  } fp32_sse2;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  struct {
-    XNN_ALIGN(8) int16_t a_zero_point[4];
-    XNN_ALIGN(8) int16_t b_zero_point[4];
-    XNN_ALIGN(8) float scale[2];
-    XNN_ALIGN(8) float magic_bias[2];
-    XNN_ALIGN(8) int32_t magic_min[2];
-    XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
-    XNN_ALIGN(8) uint8_t output_max[8];
-  } fp32_wasmsimd;
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-};
-
-union xnn_qs8_mul_minmax_params {
-  struct {
-    int32_t a_zero_point;
-    int32_t b_zero_point;
-    float scale;
-    float output_min_less_zero_point;
-    float output_max_less_zero_point;
-    float magic_bias;
-    int32_t magic_bias_less_output_zero_point;
-  } fp32_scalar;
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    int8_t a_zero_point[2];
-    int8_t b_zero_point[2];
-    float scale;
-    float magic_bias;
-    int32_t magic_bias_less_output_zero_point;
-    int8_t output_min;
-    int8_t output_max;
-  } fp32_neon;
-  struct {
-    int8_t a_zero_point[2];
-    int8_t b_zero_point[2];
-    float scale;
-    int16_t output_zero_point;
-    int8_t output_min;
-    int8_t output_max;
-  } fp32_neonv8;
-  struct {
-    int8_t a_zero_point[2];
-    int8_t b_zero_point[2];
-    int32_t left_pre_shift;
-    int32_t multiplier;
-    int32_t left_post_shift;
-    int16_t output_zero_point;
-    int8_t output_min;
-    int8_t output_max;
-  } rndnu_neon;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) int16_t a_zero_point[8];
-    XNN_ALIGN(16) int16_t b_zero_point[8];
-    XNN_ALIGN(16) float scale[4];
-    XNN_ALIGN(16) int16_t output_zero_point[8];
-    XNN_ALIGN(16) int16_t output_min[8];
-    XNN_ALIGN(16) int16_t output_max[8];
-  } fp32_sse2;
-  struct {
-    XNN_ALIGN(16) int16_t a_zero_point[8];
-    XNN_ALIGN(16) int16_t b_zero_point[8];
-    XNN_ALIGN(16) float scale[4];
-    XNN_ALIGN(16) int16_t output_zero_point[8];
-    XNN_ALIGN(16) int8_t output_min[16];
-    XNN_ALIGN(16) int8_t output_max[16];
-  } fp32_sse4;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  struct {
-    XNN_ALIGN(8) int16_t a_zero_point[4];
-    XNN_ALIGN(8) int16_t b_zero_point[4];
-    XNN_ALIGN(8) float scale[2];
-    XNN_ALIGN(8) float magic_bias[2];
-    XNN_ALIGN(8) int32_t magic_min[2];
-    XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
-    XNN_ALIGN(8) int8_t output_max[8];
-  } fp32_wasmsimd;
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-};
-
-union xnn_qs8_lrelu_params {
-  struct {
-    int32_t input_zero_point;
-    int32_t positive_multiplier;
-    int32_t negative_multiplier;
-    int32_t bias;
-  } scalar_select;
-  struct {
-    int32_t input_zero_point;
-    int32_t multiplier_diff;
-    int32_t multiplier_base;
-    int32_t bias;
-  } scalar_andxor;
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    uint32_t input_zero_point;
-    uint32_t positive_multiplier;
-    uint32_t negative_multiplier;
-    int32_t bias;
-  } armv6simd;
-  struct {
-    int16_t input_zero_point;
-    int16_t positive_multiplier;
-    int16_t negative_multiplier;
-    int16_t output_zero_point;
-  } neon;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) int16_t input_zero_point[8];
-    XNN_ALIGN(16) int16_t multiplier_diff[8];
-    XNN_ALIGN(16) int16_t multiplier_base[8];
-    XNN_ALIGN(16) int16_t output_zero_point[8];
-  } sse2;
-  struct {
-    XNN_ALIGN(16) int16_t input_zero_point[8];
-    XNN_ALIGN(16) int16_t positive_multiplier[8];
-    XNN_ALIGN(16) int16_t negative_multiplier[8];
-    XNN_ALIGN(16) int16_t output_zero_point[8];
-  } avx;
-  struct {
-    XNN_ALIGN(32) int16_t input_zero_point[16];
-    XNN_ALIGN(32) int16_t positive_multiplier[16];
-    XNN_ALIGN(32) int16_t negative_multiplier[16];
-    XNN_ALIGN(32) int16_t output_zero_point[16];
-  } avx2;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  struct {
-    XNN_ALIGN(8) int16_t input_zero_point[4];
-    XNN_ALIGN(8) int16_t positive_multiplier[4];
-    XNN_ALIGN(8) int16_t negative_multiplier[4];
-    XNN_ALIGN(8) int16_t output_zero_point[4];
-  } wasmsimd_arm;
-  struct {
-    XNN_ALIGN(8) int16_t input_zero_point[4];
-    XNN_ALIGN(8) int16_t multiplier_diff[4];
-    XNN_ALIGN(8) int16_t multiplier_base[4];
-    XNN_ALIGN(8) int16_t output_zero_point[4];
-  } wasmsimd_x86;
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-};
-
-union xnn_qu8_lrelu_params {
-  struct {
-    int32_t input_zero_point;
-    int32_t positive_multiplier;
-    int32_t negative_multiplier;
-    int32_t bias;
-  } scalar_select;
-  struct {
-    int32_t input_zero_point;
-    int32_t multiplier_base;
-    int32_t multiplier_diff;
-    int32_t bias;
-  } scalar_andxor;
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    uint32_t input_zero_point;
-    uint32_t positive_multiplier;
-    uint32_t negative_multiplier;
-    int32_t bias;
-  } armv6simd;
-  struct {
-    uint16_t input_zero_point;
-    int16_t positive_multiplier;
-    int16_t negative_multiplier;
-    int16_t output_zero_point;
-  } neon;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) int16_t input_zero_point[8];
-    XNN_ALIGN(16) int16_t multiplier_diff[8];
-    XNN_ALIGN(16) int16_t multiplier_base[8];
-    XNN_ALIGN(16) int16_t output_zero_point[8];
-  } sse2;
-  struct {
-    XNN_ALIGN(16) int16_t input_zero_point[8];
-    XNN_ALIGN(16) int16_t positive_multiplier[8];
-    XNN_ALIGN(16) int16_t negative_multiplier[8];
-    XNN_ALIGN(16) int16_t output_zero_point[8];
-  } avx;
-  struct {
-    XNN_ALIGN(32) int16_t input_zero_point[16];
-    XNN_ALIGN(32) int16_t positive_multiplier[16];
-    XNN_ALIGN(32) int16_t negative_multiplier[16];
-    XNN_ALIGN(32) int16_t output_zero_point[16];
-  } avx2;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  struct {
-    XNN_ALIGN(8) int16_t input_zero_point[4];
-    XNN_ALIGN(8) int16_t positive_multiplier[4];
-    XNN_ALIGN(8) int16_t negative_multiplier[4];
-    XNN_ALIGN(8) int16_t output_zero_point[4];
-  } wasmsimd_arm;
-  struct {
-    XNN_ALIGN(8) int16_t input_zero_point[4];
-    XNN_ALIGN(8) int16_t multiplier_diff[4];
-    XNN_ALIGN(8) int16_t multiplier_base[4];
-    XNN_ALIGN(8) int16_t output_zero_point[4];
-  } wasmsimd_x86;
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-};
-
-union xnn_qu8_avgpool_minmax_params {
-  struct {
-    int32_t init_bias;
-    float scale;
-    float output_min_less_zero_point;
-    float output_max_less_zero_point;
-    float magic_bias;
-    int32_t magic_bias_less_output_zero_point;
-  } fp32_scalar_fmagic;
-  struct {
-    int32_t init_bias;
-    float scale;
-    float magic_bias;
-    int32_t magic_min;
-    int32_t magic_max;
-    int32_t magic_bias_less_zero_point;
-  } fp32_scalar_imagic;
-  struct {
-    int32_t init_bias;
-    float scale;
-    float output_min_less_zero_point;
-    float output_max_less_zero_point;
-    int32_t output_zero_point;
-  } fp32_scalar_lrintf;
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    int32_t init_bias;
-    float scale;
-    float magic_bias;
-    int32_t magic_bias_less_output_zero_point;
-    uint8_t output_min;
-    uint8_t output_max;
-  } fp32_neon;
-  struct {
-    int32_t init_bias;
-    float scale;
-    int16_t output_zero_point;
-    uint8_t output_min;
-    uint8_t output_max;
-  } fp32_neonv8;
-  struct {
-    int32_t init_bias;
-    int32_t left_pre_shift;
-    int32_t multiplier;
-    int32_t left_post_shift;
-    int16_t output_zero_point;
-    uint8_t output_min;
-    uint8_t output_max;
-  } rndnu_neon;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) int32_t init_bias[4];
-    XNN_ALIGN(16) float scale[4];
-    XNN_ALIGN(16) float output_max_less_zero_point[4];
-    XNN_ALIGN(16) int16_t output_zero_point[8];
-    XNN_ALIGN(16) uint8_t output_min[16];
-  } fp32_sse2;
-  struct {
-    XNN_ALIGN(16) int32_t init_bias[4];
-    XNN_ALIGN(16) float scale[4];
-    XNN_ALIGN(16) float output_max_less_zero_point[4];
-    XNN_ALIGN(16) int16_t output_zero_point[8];
-    XNN_ALIGN(16) uint8_t output_min[16];
-  } fp32_sse4;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  struct {
-    XNN_ALIGN(8) int32_t init_bias[2];
-    XNN_ALIGN(8) float scale[2];
-    XNN_ALIGN(8) float magic_bias[2];
-    XNN_ALIGN(8) int32_t magic_min[2];
-    XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
-    XNN_ALIGN(8) uint8_t output_max[8];
-  } fp32_wasmsimd;
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-
-  // Legacy parameters used by QU8 AVGPOOL microkernels
-  struct {
-    int32_t bias;
-    int32_t multiplier;
-    int64_t rounding;
-    uint32_t right_shift;
-    int32_t output_min_less_zero_point;
-    int32_t output_max_less_zero_point;
-    int32_t output_zero_point;
-  } scalar;
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    int32_t bias;
-    int32_t multiplier;
-    int64_t left_shift;
-    int16_t output_zero_point;
-    uint8_t output_min;
-    uint8_t output_max;
-  } neon;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) int32_t bias[4];
-    XNN_ALIGN(16) uint32_t multiplier[4];
-    XNN_ALIGN(16) uint64_t rounding[2];
-    XNN_ALIGN(16) uint64_t right_shift[2];
-    XNN_ALIGN(16) int16_t output_zero_point[8];
-    XNN_ALIGN(16) uint8_t output_min[16];
-    XNN_ALIGN(16) uint8_t output_max[16];
-  } sse2;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-};
-
-union xnn_qs8_avgpool_minmax_params {
-  struct {
-    int32_t init_bias;
-    float scale;
-    float output_min_less_zero_point;
-    float output_max_less_zero_point;
-    float magic_bias;
-    int32_t magic_bias_less_output_zero_point;
-  } fp32_scalar_fmagic;
-  struct {
-    int32_t init_bias;
-    float scale;
-    float magic_bias;
-    int32_t magic_min;
-    int32_t magic_max;
-    int32_t magic_bias_less_zero_point;
-  } fp32_scalar_imagic;
-  struct {
-    int32_t init_bias;
-    float scale;
-    float output_min_less_zero_point;
-    float output_max_less_zero_point;
-    int32_t output_zero_point;
-  } fp32_scalar_lrintf;
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    int32_t init_bias;
-    float scale;
-    float magic_bias;
-    int32_t magic_bias_less_output_zero_point;
-    int8_t output_min;
-    int8_t output_max;
-  } fp32_neon;
-  struct {
-    int32_t init_bias;
-    float scale;
-    int16_t output_zero_point;
-    int8_t output_min;
-    int8_t output_max;
-  } fp32_neonv8;
-  struct {
-    int32_t init_bias;
-    int32_t left_pre_shift;
-    int32_t multiplier;
-    int32_t left_post_shift;
-    int16_t output_zero_point;
-    int8_t output_min;
-    int8_t output_max;
-  } rndnu_neon;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) int32_t init_bias[4];
-    XNN_ALIGN(16) float scale[4];
-    XNN_ALIGN(16) float output_max_less_zero_point[4];
-    XNN_ALIGN(16) int16_t output_zero_point[8];
-    XNN_ALIGN(16) int16_t output_min[8];
-  } fp32_sse2;
-  struct {
-    XNN_ALIGN(16) int32_t init_bias[4];
-    XNN_ALIGN(16) float scale[4];
-    XNN_ALIGN(16) float output_max_less_zero_point[4];
-    XNN_ALIGN(16) int16_t output_zero_point[8];
-    XNN_ALIGN(16) int8_t output_min[16];
-  } fp32_sse4;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  struct {
-    XNN_ALIGN(8) int32_t init_bias[2];
-    XNN_ALIGN(8) float scale[2];
-    XNN_ALIGN(8) float magic_bias[2];
-    XNN_ALIGN(8) int32_t magic_min[2];
-    XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
-    XNN_ALIGN(8) int8_t output_max[8];
-  } fp32_wasmsimd;
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-};
-
-union xnn_f16_f32_cvt_params {
-  struct {
-    uint32_t sign_mask;
-    uint32_t exp_offset;
-    float exp_scale;
-    uint32_t magic_mask;
-    float magic_bias;
-    uint32_t denorm_cutoff;
-  } scalar;
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    float exp_scale;
-  } neon;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) uint16_t sign_mask[8];
-    XNN_ALIGN(16) uint16_t exp_offset[8];
-    XNN_ALIGN(16) float exp_scale[4];
-    XNN_ALIGN(16) uint16_t magic_mask[8];
-    XNN_ALIGN(16) float magic_bias[4];
-    XNN_ALIGN(16) int16_t denorm_cutoff[8];
-  } sse_int16;
-  struct {
-    XNN_ALIGN(16) uint32_t sign_mask[4];
-    XNN_ALIGN(16) uint32_t exp_offset[4];
-    XNN_ALIGN(16) float exp_scale[4];
-    XNN_ALIGN(16) uint32_t magic_bias[4];
-    XNN_ALIGN(16) int32_t denorm_cutoff[4];
-  } sse_int32;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  struct {
-    XNN_ALIGN(8) uint16_t sign_mask[4];
-    XNN_ALIGN(8) uint16_t exp_offset[4];
-    XNN_ALIGN(8) float exp_scale[2];
-    XNN_ALIGN(8) uint16_t magic_mask[4];
-    XNN_ALIGN(8) float magic_bias[2];
-    XNN_ALIGN(8) int16_t denorm_cutoff[4];
-  } wasmsimd_int16;
-  struct {
-    XNN_ALIGN(8) uint32_t sign_mask[2];
-    XNN_ALIGN(8) uint32_t exp_offset[2];
-    XNN_ALIGN(8) float exp_scale[2];
-    XNN_ALIGN(8) uint32_t magic_bias[2];
-    XNN_ALIGN(8) int32_t denorm_cutoff[2];
-  } wasmsimd_int32;
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-};
-
-union xnn_f32_f16_cvt_params {
-  struct {
-    uint32_t nonsign_mask;
-    uint32_t exp_bias;
-    float scale_to_inf;
-    uint32_t expw_max;
-    float scale_to_zero;
-    uint32_t bias_min;
-    uint16_t exph_mask;
-    uint16_t manth_mask;
-    uint16_t nanh;
-  } scalar_bitcast;
-  struct {
-    float scale_to_inf;
-    uint32_t exp_bias;
-    float scale_to_zero;
-    uint32_t expw_max;
-    uint32_t bias_min;
-    uint16_t exph_mask;
-    uint16_t manth_mask;
-    uint16_t nanh;
-  } scalar_fabsf;
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    uint32_t exp_bias;
-    float scale_to_inf;
-    uint32_t expw_max;
-    float scale_to_zero;
-  } neon;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) uint32_t nonsign_mask[4];
-    XNN_ALIGN(16) uint32_t exp_bias[4];
-    XNN_ALIGN(16) float scale_to_inf[4];
-    XNN_ALIGN(16) uint32_t expw_max[4];
-    XNN_ALIGN(16) float scale_to_zero[4];
-    XNN_ALIGN(16) int16_t bias_min[8];
-    XNN_ALIGN(16) uint32_t manth_mask[4];
-    XNN_ALIGN(16) uint32_t exph_mask[4];
-    XNN_ALIGN(16) uint16_t nanh[8];
-  } sse2;
-  struct {
-    int32_t mask_table[14];
-  } f16c;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  struct {
-    XNN_ALIGN(8) uint32_t exp_bias[2];
-    XNN_ALIGN(8) float scale_to_inf[2];
-    XNN_ALIGN(8) uint32_t expw_max[2];
-    XNN_ALIGN(8) float scale_to_zero[2];
-    XNN_ALIGN(8) int16_t bias_min[4];
-    XNN_ALIGN(8) uint32_t manth_mask[2];
-    XNN_ALIGN(8) uint32_t exph_mask[2];
-    XNN_ALIGN(8) uint16_t nanh[4];
-  } wasmsimd;
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-};
-
-union xnn_f32_qs8_cvt_params {
-  struct {
-    float scale;
-    float output_min_less_zero_point;
-    float output_max_less_zero_point;
-    float magic_bias;
-    int32_t magic_bias_less_zero_point;
-  } scalar_fmagic;
-  struct {
-    float scale;
-    float magic_bias;
-    int32_t magic_min;
-    int32_t magic_max;
-    int32_t magic_bias_less_zero_point;
-  } scalar_imagic;
-  struct {
-    float scale;
-    float output_min_less_zero_point;
-    float output_max_less_zero_point;
-    int32_t output_zero_point;
-  } scalar_lrintf;
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    float scale;
-    float magic_bias;
-    int32_t magic_bias_less_zero_point;
-    int8_t output_min;
-    int8_t output_max;
-  } neon;
-  struct {
-    float scale;
-    int16_t output_zero_point;
-    int8_t output_min;
-    int8_t output_max;
-  } neonv8;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) float scale[4];
-    XNN_ALIGN(16) float output_max_less_zero_point[4];
-    XNN_ALIGN(16) int16_t output_zero_point[8];
-    XNN_ALIGN(16) int16_t output_min[8];
-  } sse2;
-  struct {
-    XNN_ALIGN(16) float scale[4];
-    XNN_ALIGN(16) float output_max_less_zero_point[4];
-    XNN_ALIGN(16) int16_t output_zero_point[8];
-    XNN_ALIGN(16) int8_t output_min[16];
-  } sse4;
-  struct {
-    XNN_ALIGN(32) float scale[8];
-    XNN_ALIGN(32) float output_max_less_zero_point[8];
-    XNN_ALIGN(16) int16_t output_zero_point[8];
-    XNN_ALIGN(16) int8_t output_min[16];
-    int32_t mask_table[14];
-  } avx;
-  struct {
-    XNN_ALIGN(32) float scale[8];
-    XNN_ALIGN(32) float output_max_less_zero_point[8];
-    XNN_ALIGN(32) int16_t output_zero_point[16];
-    XNN_ALIGN(32) uint32_t shuffle_mask[8];
-    XNN_ALIGN(32) int8_t output_min[32];
-    int32_t mask_table[14];
-  } avx2;
-  struct {
-    XNN_ALIGN(64) float scale[16];
-    XNN_ALIGN(64) float output_max_less_zero_point[16];
-    XNN_ALIGN(64) int16_t output_zero_point[32];
-    XNN_ALIGN(64) int8_t output_min[64];
-    XNN_ALIGN(64) uint32_t shuffle512_mask[16];
-    XNN_ALIGN(32) uint32_t shuffle256_mask[8];
-  } avx512;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  struct {
-    XNN_ALIGN(8) float scale[2];
-    XNN_ALIGN(8) int16_t output_zero_point[4];
-    XNN_ALIGN(8) int8_t output_min[8];
-    XNN_ALIGN(8) int8_t output_max[8];
-  } wasmsimd_cvt;
-  struct {
-    XNN_ALIGN(8) float scale[2];
-    XNN_ALIGN(8) float magic_bias[2];
-    XNN_ALIGN(8) int32_t magic_min[2];
-    XNN_ALIGN(8) int32_t magic_bias_less_zero_point[2];
-    XNN_ALIGN(8) int8_t output_max[8];
-  } wasmsimd_magic;
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-};
-
-union xnn_f32_qu8_cvt_params {
-  struct {
-    float scale;
-    float output_min_less_zero_point;
-    float output_max_less_zero_point;
-    float magic_bias;
-    int32_t magic_bias_less_zero_point;
-  } scalar_fmagic;
-  struct {
-    float scale;
-    float magic_bias;
-    int32_t magic_min;
-    int32_t magic_max;
-    int32_t magic_bias_less_zero_point;
-  } scalar_imagic;
-  struct {
-    float scale;
-    float output_min_less_zero_point;
-    float output_max_less_zero_point;
-    int32_t output_zero_point;
-  } scalar_lrintf;
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    float scale;
-    float magic_bias;
-    int32_t magic_bias_less_zero_point;
-    uint8_t output_min;
-    uint8_t output_max;
-  } neon;
-  struct {
-    float scale;
-    int16_t output_zero_point;
-    uint8_t output_min;
-    uint8_t output_max;
-  } neonv8;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) float scale[4];
-    XNN_ALIGN(16) float output_max_less_zero_point[4];
-    XNN_ALIGN(16) int16_t output_zero_point[8];
-    XNN_ALIGN(16) uint8_t output_min[16];
-  } sse2;
-  struct {
-    XNN_ALIGN(32) float scale[8];
-    XNN_ALIGN(32) float output_max_less_zero_point[8];
-    XNN_ALIGN(16) int16_t output_zero_point[8];
-    XNN_ALIGN(16) uint8_t output_min[16];
-    int32_t mask_table[14];
-  } avx;
-  struct {
-    XNN_ALIGN(32) float scale[8];
-    XNN_ALIGN(32) float output_max_less_zero_point[8];
-    XNN_ALIGN(32) int16_t output_zero_point[16];
-    XNN_ALIGN(32) uint32_t shuffle_mask[8];
-    XNN_ALIGN(32) uint8_t output_min[32];
-    int32_t mask_table[14];
-  } avx2;
-  struct {
-    XNN_ALIGN(64) float scale[16];
-    XNN_ALIGN(64) float output_max_less_zero_point[16];
-    XNN_ALIGN(64) int16_t output_zero_point[32];
-    XNN_ALIGN(64) uint8_t output_min[64];
-    XNN_ALIGN(64) uint32_t shuffle512_mask[16];
-    XNN_ALIGN(32) uint32_t shuffle256_mask[8];
-  } avx512;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  struct {
-    XNN_ALIGN(8) float scale[2];
-    XNN_ALIGN(8) int16_t output_zero_point[4];
-    XNN_ALIGN(8) uint8_t output_min[8];
-    XNN_ALIGN(8) uint8_t output_max[8];
-  } wasmsimd_cvt;
-  struct {
-    XNN_ALIGN(8) float scale[2];
-    XNN_ALIGN(8) float magic_bias[2];
-    XNN_ALIGN(8) int32_t magic_min[2];
-    XNN_ALIGN(8) int32_t magic_bias_less_zero_point[2];
-    XNN_ALIGN(8) uint8_t output_max[8];
-  } wasmsimd_magic;
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-};
-
-union xnn_qs8_cvt_params {
-  struct {
-    int32_t bias;
-    int32_t multiplier;
-  } scalar;
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    uint32_t minus_input_zero_point;
-    int32_t multiplier;
-    int32_t bias;
-  } armv6simd;
-  struct {
-    int16_t input_zero_point;
-    int16_t multiplier;
-    int16_t output_zero_point;
-  } neon;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) int16_t multiplier[8];
-    XNN_ALIGN(16) int32_t bias[4];
-  } sse2;
-  struct {
-    XNN_ALIGN(16) int16_t input_zero_point[8];
-    XNN_ALIGN(16) int16_t multiplier[8];
-    XNN_ALIGN(16) int16_t output_zero_point[8];
-  } ssse3;
-  struct {
-    XNN_ALIGN(32) int16_t input_zero_point[16];
-    XNN_ALIGN(32) int16_t multiplier[16];
-    XNN_ALIGN(32) int16_t output_zero_point[16];
-  } avx2;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  struct {
-    XNN_ALIGN(8) int16_t input_zero_point[4];
-    XNN_ALIGN(8) int16_t multiplier[4];
-    XNN_ALIGN(8) int16_t output_zero_point[4];
-  } wasmsimd;
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-};
-
-union xnn_qs8_f32_cvt_params {
-  struct {
-    int32_t zero_point;
-    float scale;
-  } scalar;
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    int16_t minus_zero_point[2];
-    float scale;
-  } neon;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) uint8_t sign_mask[16];
-    XNN_ALIGN(16) uint16_t magic_exp[8];
-    XNN_ALIGN(16) float magic_bias[4];
-    XNN_ALIGN(16) float scale[4];
-  } sse2;
-  struct {
-    XNN_ALIGN(16) int32_t minus_zero_point[4];
-    XNN_ALIGN(16) float scale[4];
-  } sse4;
-  struct {
-    XNN_ALIGN(32) int32_t minus_zero_point[8];
-    XNN_ALIGN(32) float scale[8];
-  } avx;
-  struct {
-    XNN_ALIGN(64) int32_t minus_zero_point[16];
-    XNN_ALIGN(64) float scale[16];
-  } avx512;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  struct {
-    XNN_ALIGN(8) int16_t minus_zero_point[4];
-    XNN_ALIGN(8) float scale[2];
-  } wasmsimd;
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-};
-
-union xnn_qu8_cvt_params {
-  struct {
-    int32_t bias;
-    int32_t multiplier;
-  } scalar;
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    uint32_t minus_input_zero_point;
-    int32_t multiplier;
-    int32_t bias;
-  } armv6simd;
-  struct {
-    uint16_t input_zero_point;
-    int16_t multiplier;
-    int16_t output_zero_point;
-  } neon;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) uint16_t multiplier[8];
-    XNN_ALIGN(16) int32_t bias[4];
-  } sse2;
-  struct {
-    XNN_ALIGN(16) uint16_t input_zero_point[8];
-    XNN_ALIGN(16) int16_t multiplier[8];
-    XNN_ALIGN(16) int16_t output_zero_point[8];
-  } ssse3;
-  struct {
-    XNN_ALIGN(32) uint16_t input_zero_point[16];
-    XNN_ALIGN(32) int16_t multiplier[16];
-    XNN_ALIGN(32) int16_t output_zero_point[16];
-  } avx2;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  struct {
-    XNN_ALIGN(8) uint16_t input_zero_point[4];
-    XNN_ALIGN(8) int16_t multiplier[4];
-    XNN_ALIGN(8) int16_t output_zero_point[4];
-  } wasmsimd;
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-};
-
-union xnn_qu8_f32_cvt_params {
-  struct {
-    int32_t zero_point;
-    float scale;
-  } scalar;
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  struct {
-    int16_t minus_zero_point[2];
-    float scale;
-  } neon;
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  struct {
-    XNN_ALIGN(16) uint16_t magic_exp[8];
-    XNN_ALIGN(16) float magic_bias[4];
-    XNN_ALIGN(16) float scale[4];
-  } sse2;
-  struct {
-    XNN_ALIGN(16) int32_t minus_zero_point[4];
-    XNN_ALIGN(16) float scale[4];
-  } sse4;
-  struct {
-    XNN_ALIGN(32) int32_t minus_zero_point[8];
-    XNN_ALIGN(32) float scale[8];
-  } avx;
-  struct {
-    XNN_ALIGN(64) int32_t minus_zero_point[16];
-    XNN_ALIGN(64) float scale[16];
-  } avx512;
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  struct {
-    XNN_ALIGN(8) int16_t minus_zero_point[4];
-    XNN_ALIGN(8) float scale[2];
-  } wasmsimd;
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-};
-
 typedef void (*xnn_ppmm_ukernel_function)(
     size_t mr,
     size_t nc,
@@ -4284,16 +1880,6 @@ typedef void (*xnn_init_qc8_scale_params_fn)(
   const float scale[XNN_MIN_ELEMENTS(1)],
   void* packed_w);
 
-// Forward declare to avoid circular includes between this and allocator.h.
-struct xnn_code_buffer;
-
-struct jit_gemm_params {
-  struct {
-    float min;
-    float max;
-  } f32_minmax;
-};
-
 typedef enum xnn_status (*xnn_jit_gemm_code_generator_function)(
     struct xnn_code_buffer *code, size_t max_mr, size_t nc, size_t kc, const void *params);
 typedef enum xnn_status (*xnn_jit_igemm_code_generator_function)(
diff --git a/src/xnnpack/pavgpool.h b/src/xnnpack/pavgpool.h
index 828cf35bd..6c4d6ab75 100644
--- a/src/xnnpack/pavgpool.h
+++ b/src/xnnpack/pavgpool.h
@@ -8,8 +8,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
+#include <xnnpack/microparams.h>
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/xnnpack/ppmm.h b/src/xnnpack/ppmm.h
index cec14872e..8379edfb9 100644
--- a/src/xnnpack/ppmm.h
+++ b/src/xnnpack/ppmm.h
@@ -8,8 +8,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
+#include <xnnpack/microparams.h>
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/xnnpack/prelu.h b/src/xnnpack/prelu.h
index f3d999793..bb0d63706 100644
--- a/src/xnnpack/prelu.h
+++ b/src/xnnpack/prelu.h
@@ -8,7 +8,6 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
 
 #ifdef __cplusplus
diff --git a/src/xnnpack/raddexpminusmax.h b/src/xnnpack/raddexpminusmax.h
index 8f73d1939..ffc6407ed 100644
--- a/src/xnnpack/raddexpminusmax.h
+++ b/src/xnnpack/raddexpminusmax.h
@@ -8,7 +8,6 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
 
 #ifdef __cplusplus
diff --git a/src/xnnpack/raddextexp.h b/src/xnnpack/raddextexp.h
index 14381d1ab..f16e90be8 100644
--- a/src/xnnpack/raddextexp.h
+++ b/src/xnnpack/raddextexp.h
@@ -8,7 +8,6 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
 
 #ifdef __cplusplus
diff --git a/src/xnnpack/raddstoreexpminusmax.h b/src/xnnpack/raddstoreexpminusmax.h
index 62aecbad0..d1c487cb5 100644
--- a/src/xnnpack/raddstoreexpminusmax.h
+++ b/src/xnnpack/raddstoreexpminusmax.h
@@ -8,8 +8,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
+#include <xnnpack/microparams.h>
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/xnnpack/requantization-stubs.h b/src/xnnpack/requantization-stubs.h
index 5da9597f7..39c3597c3 100644
--- a/src/xnnpack/requantization-stubs.h
+++ b/src/xnnpack/requantization-stubs.h
@@ -11,8 +11,6 @@
 #include <stdint.h>
 #include <stddef.h>
 
-#include <xnnpack/params.h>
-
 #ifdef __cplusplus
 extern "C" {
 #endif
diff --git a/src/xnnpack/requantization.h b/src/xnnpack/requantization.h
index a17a1b1b8..4fe12aedd 100644
--- a/src/xnnpack/requantization.h
+++ b/src/xnnpack/requantization.h
@@ -13,11 +13,9 @@
 #include <assert.h>
 #include <math.h>
 
-#include <fp16.h>
-
 #include <xnnpack/common.h>
 #include <xnnpack/math.h>
-#include <xnnpack/params.h>
+#include <xnnpack/microparams.h>
 
 
 typedef int8_t (*xnn_qs8_requantize_fn)(
diff --git a/src/xnnpack/rmaxabs.h b/src/xnnpack/rmaxabs.h
index e69f53a07..400bc43b8 100644
--- a/src/xnnpack/rmaxabs.h
+++ b/src/xnnpack/rmaxabs.h
@@ -8,7 +8,6 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
 
 #ifdef __cplusplus
diff --git a/src/xnnpack/spmm.h b/src/xnnpack/spmm.h
index 7cccab23a..dfce62b4c 100644
--- a/src/xnnpack/spmm.h
+++ b/src/xnnpack/spmm.h
@@ -8,8 +8,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
+#include <xnnpack/microparams.h>
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/xnnpack/transpose.h b/src/xnnpack/transpose.h
index 78a7dcc68..3634f6c8d 100644
--- a/src/xnnpack/transpose.h
+++ b/src/xnnpack/transpose.h
@@ -7,8 +7,8 @@
 
 #include <stddef.h>
 #include <stdint.h>
+
 #include <xnnpack/common.h>
-#include <xnnpack/params.h>
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/xnnpack/unpool.h b/src/xnnpack/unpool.h
index 875c54fdd..7aef9989e 100644
--- a/src/xnnpack/unpool.h
+++ b/src/xnnpack/unpool.h
@@ -8,7 +8,6 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
 
 #ifdef __cplusplus
diff --git a/src/xnnpack/vaddsub.h b/src/xnnpack/vaddsub.h
index c0c819507..c287522d8 100644
--- a/src/xnnpack/vaddsub.h
+++ b/src/xnnpack/vaddsub.h
@@ -11,8 +11,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
+#include <xnnpack/microparams.h>
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/xnnpack/vbinary.h b/src/xnnpack/vbinary.h
index 8dbd4e938..930344f9c 100644
--- a/src/xnnpack/vbinary.h
+++ b/src/xnnpack/vbinary.h
@@ -11,8 +11,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
+#include <xnnpack/microparams.h>
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/xnnpack/vcvt.h b/src/xnnpack/vcvt.h
index a6a0c500f..3ca9d53ac 100644
--- a/src/xnnpack/vcvt.h
+++ b/src/xnnpack/vcvt.h
@@ -8,8 +8,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
+#include <xnnpack/microparams.h>
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/xnnpack/vlrelu.h b/src/xnnpack/vlrelu.h
index 35cfe5031..4c83e8e3b 100644
--- a/src/xnnpack/vlrelu.h
+++ b/src/xnnpack/vlrelu.h
@@ -8,8 +8,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
+#include <xnnpack/microparams.h>
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/xnnpack/vlshift.h b/src/xnnpack/vlshift.h
index 271e28663..23481e514 100644
--- a/src/xnnpack/vlshift.h
+++ b/src/xnnpack/vlshift.h
@@ -8,7 +8,6 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
 
 #ifdef __cplusplus
diff --git a/src/xnnpack/vmul.h b/src/xnnpack/vmul.h
index c01a2d69e..3441a8cfe 100644
--- a/src/xnnpack/vmul.h
+++ b/src/xnnpack/vmul.h
@@ -8,8 +8,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
+#include <xnnpack/microparams.h>
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/xnnpack/vmulcaddc.h b/src/xnnpack/vmulcaddc.h
index 05480f21d..0be7cf50d 100644
--- a/src/xnnpack/vmulcaddc.h
+++ b/src/xnnpack/vmulcaddc.h
@@ -8,8 +8,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
+#include <xnnpack/microparams.h>
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/xnnpack/vscaleexpminusmax.h b/src/xnnpack/vscaleexpminusmax.h
index 2b0a407ce..afb864b79 100644
--- a/src/xnnpack/vscaleexpminusmax.h
+++ b/src/xnnpack/vscaleexpminusmax.h
@@ -8,7 +8,6 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
 
 #ifdef __cplusplus
diff --git a/src/xnnpack/vscaleextexp.h b/src/xnnpack/vscaleextexp.h
index 6433ed69a..c84365e69 100644
--- a/src/xnnpack/vscaleextexp.h
+++ b/src/xnnpack/vscaleextexp.h
@@ -8,7 +8,6 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
 
 #ifdef __cplusplus
diff --git a/src/xnnpack/vunary.h b/src/xnnpack/vunary.h
index 9847f9ca7..aee3576d9 100644
--- a/src/xnnpack/vunary.h
+++ b/src/xnnpack/vunary.h
@@ -8,8 +8,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
+#include <xnnpack/microparams.h>
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/xnnpack/window.h b/src/xnnpack/window.h
index 58fd0f3ad..dea724691 100644
--- a/src/xnnpack/window.h
+++ b/src/xnnpack/window.h
@@ -8,7 +8,6 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
 
 #ifdef __cplusplus
diff --git a/src/xnnpack/zip.h b/src/xnnpack/zip.h
index 52a2fffe5..e617e69c6 100644
--- a/src/xnnpack/zip.h
+++ b/src/xnnpack/zip.h
@@ -11,7 +11,6 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <xnnpack/params.h>
 #include <xnnpack/common.h>
 
 #ifdef __cplusplus
author	Marat Dukhan <maratek@google.com>	2022-07-27 21:14:38 -0700
committer	XNNPACK Team <xnnpack-github-robot@google.com>	2022-07-27 21:15:38 -0700
commit	c836505ed4498a2ebd1c21050c383a0a60a8defc (patch)
tree	b26a80a0c5d6581794cc953414d7e05df7653ac9 /src/xnnpack
parent	917e63588c2664a12417beb01e59f9e4a10251bc (diff)
download	XNNPACK-c836505ed4498a2ebd1c21050c383a0a60a8defc.tar.gz