Rename asr_s32/asr_s64 to math_asr_s32/math_asr_s64

PiperOrigin-RevId: 463131507
author: Marat Dukhan <maratek@google.com> 2022-07-25 11:12:42 -0700
committer: XNNPACK Team <xnnpack-github-robot@google.com> 2022-07-25 11:14:01 -0700
commit: 2247560904f5366d6d370bb080cfc2dbe9f57598 (patch)
tree: 447a97d6d3f25016ab6b6cd5c3f7a30a8a188795
parent: c7cb3c177fbcd277c29d0ead9eaf1390255591f0 (diff)
download: XNNPACK-2247560904f5366d6d370bb080cfc2dbe9f57598.tar.gz
69 files changed, 305 insertions, 305 deletions
diff --git a/src/qs8-requantization/gemmlowp-scalar.c b/src/qs8-requantization/gemmlowp-scalar.c
index 119e81edb..3bd4f42f1 100644
--- a/src/qs8-requantization/gemmlowp-scalar.c
+++ b/src/qs8-requantization/gemmlowp-scalar.c
@@ -100,10 +100,10 @@ void xnn_qs8_requantize_gemmlowp__scalar(
     const int32_t z_remainder = (z_q31product & remainder_mask) - (int32_t) (z_q31product < 0);
     const int32_t w_remainder = (w_q31product & remainder_mask) - (int32_t) (w_q31product < 0);
 
-    const int32_t x_scaled = asr_s32(x_q31product, shift) + (int32_t) (x_remainder > threshold);
-    const int32_t y_scaled = asr_s32(y_q31product, shift) + (int32_t) (y_remainder > threshold);
-    const int32_t z_scaled = asr_s32(z_q31product, shift) + (int32_t) (z_remainder > threshold);
-    const int32_t w_scaled = asr_s32(w_q31product, shift) + (int32_t) (w_remainder > threshold);
+    const int32_t x_scaled = math_asr_s32(x_q31product, shift) + (int32_t) (x_remainder > threshold);
+    const int32_t y_scaled = math_asr_s32(y_q31product, shift) + (int32_t) (y_remainder > threshold);
+    const int32_t z_scaled = math_asr_s32(z_q31product, shift) + (int32_t) (z_remainder > threshold);
+    const int32_t w_scaled = math_asr_s32(w_q31product, shift) + (int32_t) (w_remainder > threshold);
 
     // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point).
     const int32_t x_clamped = math_min_s32(math_max_s32(x_scaled, smin), smax);
diff --git a/src/qs8-requantization/rndna-scalar-signed64.c b/src/qs8-requantization/rndna-scalar-signed64.c
index d04dc795a..e3d3d29db 100644
--- a/src/qs8-requantization/rndna-scalar-signed64.c
+++ b/src/qs8-requantization/rndna-scalar-signed64.c
@@ -65,10 +65,10 @@ void xnn_qs8_requantize_rndna__scalar_signed64(
     // Note that although rounding is precomputed, it is dependent on shift value, and on processors with 64-bit
     // "right shift with rounding" instruction each line below can be represented by just one such instruction
     // (e.g. VRSHL.S64 on ARM NEON, SRSHL in ARM64 Advanced SIMD).
-    const int32_t x_scaled = (int32_t) asr_s64(x_adjusted_product + rounding, shift);
-    const int32_t y_scaled = (int32_t) asr_s64(y_adjusted_product + rounding, shift);
-    const int32_t z_scaled = (int32_t) asr_s64(z_adjusted_product + rounding, shift);
-    const int32_t w_scaled = (int32_t) asr_s64(w_adjusted_product + rounding, shift);
+    const int32_t x_scaled = (int32_t) math_asr_s64(x_adjusted_product + rounding, shift);
+    const int32_t y_scaled = (int32_t) math_asr_s64(y_adjusted_product + rounding, shift);
+    const int32_t z_scaled = (int32_t) math_asr_s64(z_adjusted_product + rounding, shift);
+    const int32_t w_scaled = (int32_t) math_asr_s64(w_adjusted_product + rounding, shift);
 
     // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point).
     const int32_t x_clamped = math_min_s32(math_max_s32(x_scaled, smin), smax);
diff --git a/src/qs8-requantization/rndnu-scalar.c b/src/qs8-requantization/rndnu-scalar.c
index eafc7e768..84df119be 100644
--- a/src/qs8-requantization/rndnu-scalar.c
+++ b/src/qs8-requantization/rndnu-scalar.c
@@ -59,10 +59,10 @@ void xnn_qs8_requantize_rndnu__scalar(
     // Note that although rounding is precomputed, it is dependent on shift value, and on processors with 64-bit
     // "right shift with rounding" instruction each line below can be represented by just one such instruction
     // (e.g. VRSHL.S64 on ARM NEON, SRSHL in ARM64 Advanced SIMD).
-    const int32_t x_scaled = (int32_t) asr_s64(x_product + rounding, shift);
-    const int32_t y_scaled = (int32_t) asr_s64(y_product + rounding, shift);
-    const int32_t z_scaled = (int32_t) asr_s64(z_product + rounding, shift);
-    const int32_t w_scaled = (int32_t) asr_s64(w_product + rounding, shift);
+    const int32_t x_scaled = (int32_t) math_asr_s64(x_product + rounding, shift);
+    const int32_t y_scaled = (int32_t) math_asr_s64(y_product + rounding, shift);
+    const int32_t z_scaled = (int32_t) math_asr_s64(z_product + rounding, shift);
+    const int32_t w_scaled = (int32_t) math_asr_s64(w_product + rounding, shift);
 
     // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point).
     const int32_t x_clamped = math_min_s32(math_max_s32(x_scaled, smin), smax);
diff --git a/src/qs8-vadd/gen/minmax-scalar-x1.c b/src/qs8-vadd/gen/minmax-scalar-x1.c
index e390b7ae4..7cd7b2c7a 100644
--- a/src/qs8-vadd/gen/minmax-scalar-x1.c
+++ b/src/qs8-vadd/gen/minmax-scalar-x1.c
@@ -33,7 +33,7 @@ void xnn_qs8_vadd_minmax_ukernel__scalar_x1(
     const int32_t vb = *input_b++;
     const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier;
 
-    int32_t vout = asr_s32(vacc, vshift);
+    int32_t vout = math_asr_s32(vacc, vshift);
     vout = math_max_s32(vout, voutput_min_less_zero_point);
     vout = math_min_s32(vout, voutput_max_less_zero_point);
     *output++ = (int8_t) (vout + voutput_zero_point);
diff --git a/src/qs8-vadd/gen/minmax-scalar-x2.c b/src/qs8-vadd/gen/minmax-scalar-x2.c
index bb55d42b3..fac2cee4c 100644
--- a/src/qs8-vadd/gen/minmax-scalar-x2.c
+++ b/src/qs8-vadd/gen/minmax-scalar-x2.c
@@ -42,8 +42,8 @@ void xnn_qs8_vadd_minmax_ukernel__scalar_x2(
     vacc0 += vb0 * vb_multiplier;
     vacc1 += vb1 * vb_multiplier;
 
-    int32_t vout0 = asr_s32(vacc0, vshift);
-    int32_t vout1 = asr_s32(vacc1, vshift);
+    int32_t vout0 = math_asr_s32(vacc0, vshift);
+    int32_t vout1 = math_asr_s32(vacc1, vshift);
 
     vout0 = math_max_s32(vout0, voutput_min_less_zero_point);
     vout1 = math_max_s32(vout1, voutput_min_less_zero_point);
@@ -63,7 +63,7 @@ void xnn_qs8_vadd_minmax_ukernel__scalar_x2(
     const int32_t vb = *input_b;
     const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier;
 
-    int32_t vout = asr_s32(vacc, vshift);
+    int32_t vout = math_asr_s32(vacc, vshift);
     vout = math_max_s32(vout, voutput_min_less_zero_point);
     vout = math_min_s32(vout, voutput_max_less_zero_point);
     *output++ = (int8_t) (vout + voutput_zero_point);
diff --git a/src/qs8-vadd/gen/minmax-scalar-x4.c b/src/qs8-vadd/gen/minmax-scalar-x4.c
index cb3e69590..dc613a7bb 100644
--- a/src/qs8-vadd/gen/minmax-scalar-x4.c
+++ b/src/qs8-vadd/gen/minmax-scalar-x4.c
@@ -50,10 +50,10 @@ void xnn_qs8_vadd_minmax_ukernel__scalar_x4(
     vacc2 += vb2 * vb_multiplier;
     vacc3 += vb3 * vb_multiplier;
 
-    int32_t vout0 = asr_s32(vacc0, vshift);
-    int32_t vout1 = asr_s32(vacc1, vshift);
-    int32_t vout2 = asr_s32(vacc2, vshift);
-    int32_t vout3 = asr_s32(vacc3, vshift);
+    int32_t vout0 = math_asr_s32(vacc0, vshift);
+    int32_t vout1 = math_asr_s32(vacc1, vshift);
+    int32_t vout2 = math_asr_s32(vacc2, vshift);
+    int32_t vout3 = math_asr_s32(vacc3, vshift);
 
     vout0 = math_max_s32(vout0, voutput_min_less_zero_point);
     vout1 = math_max_s32(vout1, voutput_min_less_zero_point);
@@ -82,7 +82,7 @@ void xnn_qs8_vadd_minmax_ukernel__scalar_x4(
       const int32_t vb = *input_b++;
       const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier;
 
-      int32_t vout = asr_s32(vacc, vshift);
+      int32_t vout = math_asr_s32(vacc, vshift);
       vout = math_max_s32(vout, voutput_min_less_zero_point);
       vout = math_min_s32(vout, voutput_max_less_zero_point);
       *output++ = (int8_t) (vout + voutput_zero_point);
diff --git a/src/qs8-vadd/scalar.c.in b/src/qs8-vadd/scalar.c.in
index 65ff4aa77..3cd4b8dcd 100644
--- a/src/qs8-vadd/scalar.c.in
+++ b/src/qs8-vadd/scalar.c.in
@@ -33,7 +33,7 @@ void xnn_${DATATYPE.lower()}_vadd_minmax_ukernel__scalar_x${BATCH_TILE}(
       const int32_t vb = *input_b++;
       const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier;
 
-      int32_t vout = asr_s32(vacc, vshift);
+      int32_t vout = math_asr_s32(vacc, vshift);
       vout = math_max_s32(vout, voutput_min_less_zero_point);
       vout = math_min_s32(vout, voutput_max_less_zero_point);
       *output++ = (${XINT8_T}) (vout + voutput_zero_point);
@@ -55,7 +55,7 @@ void xnn_${DATATYPE.lower()}_vadd_minmax_ukernel__scalar_x${BATCH_TILE}(
         vacc${N} += vb${N} * vb_multiplier;
 
       $for N in range(BATCH_TILE):
-        int32_t vout${N} = asr_s32(vacc${N}, vshift);
+        int32_t vout${N} = math_asr_s32(vacc${N}, vshift);
 
       $for N in range(BATCH_TILE):
         vout${N} = math_max_s32(vout${N}, voutput_min_less_zero_point);
@@ -76,7 +76,7 @@ void xnn_${DATATYPE.lower()}_vadd_minmax_ukernel__scalar_x${BATCH_TILE}(
         const int32_t vb = *input_b;
         const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier;
 
-        int32_t vout = asr_s32(vacc, vshift);
+        int32_t vout = math_asr_s32(vacc, vshift);
         vout = math_max_s32(vout, voutput_min_less_zero_point);
         vout = math_min_s32(vout, voutput_max_less_zero_point);
         *output++ = (${XINT8_T}) (vout + voutput_zero_point);
@@ -86,7 +86,7 @@ void xnn_${DATATYPE.lower()}_vadd_minmax_ukernel__scalar_x${BATCH_TILE}(
           const int32_t vb = *input_b++;
           const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier;
 
-          int32_t vout = asr_s32(vacc, vshift);
+          int32_t vout = math_asr_s32(vacc, vshift);
           vout = math_max_s32(vout, voutput_min_less_zero_point);
           vout = math_min_s32(vout, voutput_max_less_zero_point);
           *output++ = (${XINT8_T}) (vout + voutput_zero_point);
diff --git a/src/qs8-vaddc/gen/minmax-scalar-x1.c b/src/qs8-vaddc/gen/minmax-scalar-x1.c
index f7dc69be4..34597dae2 100644
--- a/src/qs8-vaddc/gen/minmax-scalar-x1.c
+++ b/src/qs8-vaddc/gen/minmax-scalar-x1.c
@@ -31,7 +31,7 @@ void xnn_qs8_vaddc_minmax_ukernel__scalar_x1(
     const int32_t va = *input_a++;
     const int32_t vacc = vbias + va * va_multiplier;
 
-    int32_t vout = asr_s32(vacc, vshift);
+    int32_t vout = math_asr_s32(vacc, vshift);
     vout = math_max_s32(vout, voutput_min_less_zero_point);
     vout = math_min_s32(vout, voutput_max_less_zero_point);
     *output++ = (int8_t) (vout + voutput_zero_point);
diff --git a/src/qs8-vaddc/gen/minmax-scalar-x2.c b/src/qs8-vaddc/gen/minmax-scalar-x2.c
index 75b733a8e..cfc8f8ff6 100644
--- a/src/qs8-vaddc/gen/minmax-scalar-x2.c
+++ b/src/qs8-vaddc/gen/minmax-scalar-x2.c
@@ -36,8 +36,8 @@ void xnn_qs8_vaddc_minmax_ukernel__scalar_x2(
     const int32_t vacc1 = vbias + va1 * va_multiplier;
     input_b += 2;
 
-    int32_t vout0 = asr_s32(vacc0, vshift);
-    int32_t vout1 = asr_s32(vacc1, vshift);
+    int32_t vout0 = math_asr_s32(vacc0, vshift);
+    int32_t vout1 = math_asr_s32(vacc1, vshift);
 
     vout0 = math_max_s32(vout0, voutput_min_less_zero_point);
     vout1 = math_max_s32(vout1, voutput_min_less_zero_point);
@@ -56,7 +56,7 @@ void xnn_qs8_vaddc_minmax_ukernel__scalar_x2(
     const int32_t va = *input_a;
     const int32_t vacc = vbias + va * va_multiplier;
 
-    int32_t vout = asr_s32(vacc, vshift);
+    int32_t vout = math_asr_s32(vacc, vshift);
     vout = math_max_s32(vout, voutput_min_less_zero_point);
     vout = math_min_s32(vout, voutput_max_less_zero_point);
     *output++ = (int8_t) (vout + voutput_zero_point);
diff --git a/src/qs8-vaddc/gen/minmax-scalar-x4.c b/src/qs8-vaddc/gen/minmax-scalar-x4.c
index ee15c3d10..ed1124561 100644
--- a/src/qs8-vaddc/gen/minmax-scalar-x4.c
+++ b/src/qs8-vaddc/gen/minmax-scalar-x4.c
@@ -40,10 +40,10 @@ void xnn_qs8_vaddc_minmax_ukernel__scalar_x4(
     const int32_t vacc3 = vbias + va3 * va_multiplier;
     input_b += 4;
 
-    int32_t vout0 = asr_s32(vacc0, vshift);
-    int32_t vout1 = asr_s32(vacc1, vshift);
-    int32_t vout2 = asr_s32(vacc2, vshift);
-    int32_t vout3 = asr_s32(vacc3, vshift);
+    int32_t vout0 = math_asr_s32(vacc0, vshift);
+    int32_t vout1 = math_asr_s32(vacc1, vshift);
+    int32_t vout2 = math_asr_s32(vacc2, vshift);
+    int32_t vout3 = math_asr_s32(vacc3, vshift);
 
     vout0 = math_max_s32(vout0, voutput_min_less_zero_point);
     vout1 = math_max_s32(vout1, voutput_min_less_zero_point);
@@ -71,7 +71,7 @@ void xnn_qs8_vaddc_minmax_ukernel__scalar_x4(
       const int32_t va = *input_a++;
       const int32_t vacc = vbias + va * va_multiplier;
 
-      int32_t vout = asr_s32(vacc, vshift);
+      int32_t vout = math_asr_s32(vacc, vshift);
       vout = math_max_s32(vout, voutput_min_less_zero_point);
       vout = math_min_s32(vout, voutput_max_less_zero_point);
       *output++ = (int8_t) (vout + voutput_zero_point);
diff --git a/src/qs8-vaddc/scalar.c.in b/src/qs8-vaddc/scalar.c.in
index 61109b508..3616ad3c1 100644
--- a/src/qs8-vaddc/scalar.c.in
+++ b/src/qs8-vaddc/scalar.c.in
@@ -31,7 +31,7 @@ void xnn_${DATATYPE.lower()}_vaddc_minmax_ukernel__scalar_x${BATCH_TILE}(
       const int32_t va = *input_a++;
       const int32_t vacc = vbias + va * va_multiplier;
 
-      int32_t vout = asr_s32(vacc, vshift);
+      int32_t vout = math_asr_s32(vacc, vshift);
       vout = math_max_s32(vout, voutput_min_less_zero_point);
       vout = math_min_s32(vout, voutput_max_less_zero_point);
       *output++ = (${XINT8_T}) (vout + voutput_zero_point);
@@ -49,7 +49,7 @@ void xnn_${DATATYPE.lower()}_vaddc_minmax_ukernel__scalar_x${BATCH_TILE}(
       input_b += ${BATCH_TILE};
 
       $for N in range(BATCH_TILE):
-        int32_t vout${N} = asr_s32(vacc${N}, vshift);
+        int32_t vout${N} = math_asr_s32(vacc${N}, vshift);
 
       $for N in range(BATCH_TILE):
         vout${N} = math_max_s32(vout${N}, voutput_min_less_zero_point);
@@ -69,7 +69,7 @@ void xnn_${DATATYPE.lower()}_vaddc_minmax_ukernel__scalar_x${BATCH_TILE}(
         const int32_t va = *input_a;
         const int32_t vacc = vbias + va * va_multiplier;
 
-        int32_t vout = asr_s32(vacc, vshift);
+        int32_t vout = math_asr_s32(vacc, vshift);
         vout = math_max_s32(vout, voutput_min_less_zero_point);
         vout = math_min_s32(vout, voutput_max_less_zero_point);
         *output++ = (${XINT8_T}) (vout + voutput_zero_point);
@@ -78,7 +78,7 @@ void xnn_${DATATYPE.lower()}_vaddc_minmax_ukernel__scalar_x${BATCH_TILE}(
           const int32_t va = *input_a++;
           const int32_t vacc = vbias + va * va_multiplier;
 
-          int32_t vout = asr_s32(vacc, vshift);
+          int32_t vout = math_asr_s32(vacc, vshift);
           vout = math_max_s32(vout, voutput_min_less_zero_point);
           vout = math_min_s32(vout, voutput_max_less_zero_point);
           *output++ = (${XINT8_T}) (vout + voutput_zero_point);
diff --git a/src/qs8-vcvt/armv6simd.c.in b/src/qs8-vcvt/armv6simd.c.in
index 4dd21bee7..ebfffae98 100644
--- a/src/qs8-vcvt/armv6simd.c.in
+++ b/src/qs8-vcvt/armv6simd.c.in
@@ -48,7 +48,7 @@ void xnn_${DATATYPE.lower()}_vcvt_ukernel__armv6simd_x${BATCH_TILE}(
         int32_t vacc${ABC[N+3]} = __smlawt(vmultiplier, vx${ABC[N+1]}${ABC[N+3]}, vbias);
 
       $for N in range(BATCH_TILE):
-        vacc${ABC[N]} = ${__XSAT}(asr_s32(vacc${ABC[N]}, 1), 8);
+        vacc${ABC[N]} = ${__XSAT}(math_asr_s32(vacc${ABC[N]}, 1), 8);
 
       $for N in range(BATCH_TILE):
         y[${N}] = (${XINT8_T}) vacc${ABC[N]};
@@ -66,10 +66,10 @@ void xnn_${DATATYPE.lower()}_vcvt_ukernel__armv6simd_x${BATCH_TILE}(
     int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias);
     int32_t vacc3 = __smlawt(vmultiplier, vx13, vbias);
 
-    vacc0 = ${__XSAT}(asr_s32(vacc0, 1), 8);
-    vacc1 = ${__XSAT}(asr_s32(vacc1, 1), 8);
-    vacc2 = ${__XSAT}(asr_s32(vacc2, 1), 8);
-    vacc3 = ${__XSAT}(asr_s32(vacc3, 1), 8);
+    vacc0 = ${__XSAT}(math_asr_s32(vacc0, 1), 8);
+    vacc1 = ${__XSAT}(math_asr_s32(vacc1, 1), 8);
+    vacc2 = ${__XSAT}(math_asr_s32(vacc2, 1), 8);
+    vacc3 = ${__XSAT}(math_asr_s32(vacc3, 1), 8);
 
     y[0] = (${XINT8_T}) vacc0;
     y[1] = (${XINT8_T}) vacc1;
@@ -87,13 +87,13 @@ void xnn_${DATATYPE.lower()}_vcvt_ukernel__armv6simd_x${BATCH_TILE}(
     int32_t vacc1 = __smlawb(vmultiplier, vx13, vbias);
     const int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias);
 
-    vacc0 = ${__XSAT}(asr_s32(vacc0, 1), 8);
-    vacc1 = ${__XSAT}(asr_s32(vacc1, 1), 8);
+    vacc0 = ${__XSAT}(math_asr_s32(vacc0, 1), 8);
+    vacc1 = ${__XSAT}(math_asr_s32(vacc1, 1), 8);
 
     if (n & (2 * sizeof(${XINT8_T}))) {
       y[0] = (${XINT8_T}) vacc0;
       y[1] = (${XINT8_T}) vacc1;
-      vacc0 = ${__XSAT}(asr_s32(vacc2, 1), 8);
+      vacc0 = ${__XSAT}(math_asr_s32(vacc2, 1), 8);
       y += 2;
     }
     if (n & (1 * sizeof(${XINT8_T}))) {
diff --git a/src/qs8-vcvt/gen/vcvt-armv6simd-x4.c b/src/qs8-vcvt/gen/vcvt-armv6simd-x4.c
index a7eaee4e4..ea76f2893 100644
--- a/src/qs8-vcvt/gen/vcvt-armv6simd-x4.c
+++ b/src/qs8-vcvt/gen/vcvt-armv6simd-x4.c
@@ -38,10 +38,10 @@ void xnn_qs8_vcvt_ukernel__armv6simd_x4(
     int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias);
     int32_t vacc3 = __smlawt(vmultiplier, vx13, vbias);
 
-    vacc0 = __ssat(asr_s32(vacc0, 1), 8);
-    vacc1 = __ssat(asr_s32(vacc1, 1), 8);
-    vacc2 = __ssat(asr_s32(vacc2, 1), 8);
-    vacc3 = __ssat(asr_s32(vacc3, 1), 8);
+    vacc0 = __ssat(math_asr_s32(vacc0, 1), 8);
+    vacc1 = __ssat(math_asr_s32(vacc1, 1), 8);
+    vacc2 = __ssat(math_asr_s32(vacc2, 1), 8);
+    vacc3 = __ssat(math_asr_s32(vacc3, 1), 8);
 
     y[0] = (int8_t) vacc0;
     y[1] = (int8_t) vacc1;
@@ -59,13 +59,13 @@ void xnn_qs8_vcvt_ukernel__armv6simd_x4(
     int32_t vacc1 = __smlawb(vmultiplier, vx13, vbias);
     const int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias);
 
-    vacc0 = __ssat(asr_s32(vacc0, 1), 8);
-    vacc1 = __ssat(asr_s32(vacc1, 1), 8);
+    vacc0 = __ssat(math_asr_s32(vacc0, 1), 8);
+    vacc1 = __ssat(math_asr_s32(vacc1, 1), 8);
 
     if (n & (2 * sizeof(int8_t))) {
       y[0] = (int8_t) vacc0;
       y[1] = (int8_t) vacc1;
-      vacc0 = __ssat(asr_s32(vacc2, 1), 8);
+      vacc0 = __ssat(math_asr_s32(vacc2, 1), 8);
       y += 2;
     }
     if (n & (1 * sizeof(int8_t))) {
diff --git a/src/qs8-vcvt/gen/vcvt-armv6simd-x8.c b/src/qs8-vcvt/gen/vcvt-armv6simd-x8.c
index 9137dc452..9b3073492 100644
--- a/src/qs8-vcvt/gen/vcvt-armv6simd-x8.c
+++ b/src/qs8-vcvt/gen/vcvt-armv6simd-x8.c
@@ -45,14 +45,14 @@ void xnn_qs8_vcvt_ukernel__armv6simd_x8(
     int32_t vacc6 = __smlawt(vmultiplier, vx46, vbias);
     int32_t vacc7 = __smlawt(vmultiplier, vx57, vbias);
 
-    vacc0 = __ssat(asr_s32(vacc0, 1), 8);
-    vacc1 = __ssat(asr_s32(vacc1, 1), 8);
-    vacc2 = __ssat(asr_s32(vacc2, 1), 8);
-    vacc3 = __ssat(asr_s32(vacc3, 1), 8);
-    vacc4 = __ssat(asr_s32(vacc4, 1), 8);
-    vacc5 = __ssat(asr_s32(vacc5, 1), 8);
-    vacc6 = __ssat(asr_s32(vacc6, 1), 8);
-    vacc7 = __ssat(asr_s32(vacc7, 1), 8);
+    vacc0 = __ssat(math_asr_s32(vacc0, 1), 8);
+    vacc1 = __ssat(math_asr_s32(vacc1, 1), 8);
+    vacc2 = __ssat(math_asr_s32(vacc2, 1), 8);
+    vacc3 = __ssat(math_asr_s32(vacc3, 1), 8);
+    vacc4 = __ssat(math_asr_s32(vacc4, 1), 8);
+    vacc5 = __ssat(math_asr_s32(vacc5, 1), 8);
+    vacc6 = __ssat(math_asr_s32(vacc6, 1), 8);
+    vacc7 = __ssat(math_asr_s32(vacc7, 1), 8);
 
     y[0] = (int8_t) vacc0;
     y[1] = (int8_t) vacc1;
@@ -76,10 +76,10 @@ void xnn_qs8_vcvt_ukernel__armv6simd_x8(
     int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias);
     int32_t vacc3 = __smlawt(vmultiplier, vx13, vbias);
 
-    vacc0 = __ssat(asr_s32(vacc0, 1), 8);
-    vacc1 = __ssat(asr_s32(vacc1, 1), 8);
-    vacc2 = __ssat(asr_s32(vacc2, 1), 8);
-    vacc3 = __ssat(asr_s32(vacc3, 1), 8);
+    vacc0 = __ssat(math_asr_s32(vacc0, 1), 8);
+    vacc1 = __ssat(math_asr_s32(vacc1, 1), 8);
+    vacc2 = __ssat(math_asr_s32(vacc2, 1), 8);
+    vacc3 = __ssat(math_asr_s32(vacc3, 1), 8);
 
     y[0] = (int8_t) vacc0;
     y[1] = (int8_t) vacc1;
@@ -97,13 +97,13 @@ void xnn_qs8_vcvt_ukernel__armv6simd_x8(
     int32_t vacc1 = __smlawb(vmultiplier, vx13, vbias);
     const int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias);
 
-    vacc0 = __ssat(asr_s32(vacc0, 1), 8);
-    vacc1 = __ssat(asr_s32(vacc1, 1), 8);
+    vacc0 = __ssat(math_asr_s32(vacc0, 1), 8);
+    vacc1 = __ssat(math_asr_s32(vacc1, 1), 8);
 
     if (n & (2 * sizeof(int8_t))) {
       y[0] = (int8_t) vacc0;
       y[1] = (int8_t) vacc1;
-      vacc0 = __ssat(asr_s32(vacc2, 1), 8);
+      vacc0 = __ssat(math_asr_s32(vacc2, 1), 8);
       y += 2;
     }
     if (n & (1 * sizeof(int8_t))) {
diff --git a/src/qs8-vcvt/gen/vcvt-scalar-x1.c b/src/qs8-vcvt/gen/vcvt-scalar-x1.c
index 211188be4..9424bb751 100644
--- a/src/qs8-vcvt/gen/vcvt-scalar-x1.c
+++ b/src/qs8-vcvt/gen/vcvt-scalar-x1.c
@@ -25,7 +25,7 @@ void xnn_qs8_vcvt_ukernel__scalar_x1(
     int32_t vacc = *x++;
     vacc = vbias + vacc * vmultiplier;
 
-    int32_t vout = asr_s32(vacc, 8);
+    int32_t vout = math_asr_s32(vacc, 8);
     vout = math_max_s32(vout, -128);
     vout = math_min_s32(vout, 127);
     *y++ = (int8_t) vout;
diff --git a/src/qs8-vcvt/gen/vcvt-scalar-x2.c b/src/qs8-vcvt/gen/vcvt-scalar-x2.c
index 93b0327d5..5501229c0 100644
--- a/src/qs8-vcvt/gen/vcvt-scalar-x2.c
+++ b/src/qs8-vcvt/gen/vcvt-scalar-x2.c
@@ -29,8 +29,8 @@ void xnn_qs8_vcvt_ukernel__scalar_x2(
     vacc0 = vbias + vacc0 * vmultiplier;
     vacc1 = vbias + vacc1 * vmultiplier;
 
-    int32_t vout0 = asr_s32(vacc0, 8);
-    int32_t vout1 = asr_s32(vacc1, 8);
+    int32_t vout0 = math_asr_s32(vacc0, 8);
+    int32_t vout1 = math_asr_s32(vacc1, 8);
 
     vout0 = math_max_s32(vout0, -128);
     vout1 = math_max_s32(vout1, -128);
@@ -46,7 +46,7 @@ void xnn_qs8_vcvt_ukernel__scalar_x2(
     int32_t vacc = *x;
     vacc = vbias + vacc * vmultiplier;
 
-    int32_t vout = asr_s32(vacc, 8);
+    int32_t vout = math_asr_s32(vacc, 8);
     vout = math_max_s32(vout, -128);
     vout = math_min_s32(vout, 127);
     *y = (int8_t) vout;
diff --git a/src/qs8-vcvt/gen/vcvt-scalar-x4.c b/src/qs8-vcvt/gen/vcvt-scalar-x4.c
index db786c92b..44c2ff627 100644
--- a/src/qs8-vcvt/gen/vcvt-scalar-x4.c
+++ b/src/qs8-vcvt/gen/vcvt-scalar-x4.c
@@ -33,10 +33,10 @@ void xnn_qs8_vcvt_ukernel__scalar_x4(
     vacc2 = vbias + vacc2 * vmultiplier;
     vacc3 = vbias + vacc3 * vmultiplier;
 
-    int32_t vout0 = asr_s32(vacc0, 8);
-    int32_t vout1 = asr_s32(vacc1, 8);
-    int32_t vout2 = asr_s32(vacc2, 8);
-    int32_t vout3 = asr_s32(vacc3, 8);
+    int32_t vout0 = math_asr_s32(vacc0, 8);
+    int32_t vout1 = math_asr_s32(vacc1, 8);
+    int32_t vout2 = math_asr_s32(vacc2, 8);
+    int32_t vout3 = math_asr_s32(vacc3, 8);
 
     vout0 = math_max_s32(vout0, -128);
     vout1 = math_max_s32(vout1, -128);
@@ -59,7 +59,7 @@ void xnn_qs8_vcvt_ukernel__scalar_x4(
       int32_t vacc = *x++;
       vacc = vbias + vacc * vmultiplier;
 
-      int32_t vout = asr_s32(vacc, 8);
+      int32_t vout = math_asr_s32(vacc, 8);
       vout = math_max_s32(vout, -128);
       vout = math_min_s32(vout, 127);
       *y++ = (int8_t) vout;
diff --git a/src/qs8-vcvt/scalar.c.in b/src/qs8-vcvt/scalar.c.in
index 23270f573..284876e8c 100644
--- a/src/qs8-vcvt/scalar.c.in
+++ b/src/qs8-vcvt/scalar.c.in
@@ -28,7 +28,7 @@ void xnn_${DATATYPE.lower()}_vcvt_ukernel__scalar_x${BATCH_TILE}(
       int32_t vacc = *x++;
       vacc = vbias + vacc * vmultiplier;
 
-      int32_t vout = asr_s32(vacc, 8);
+      int32_t vout = math_asr_s32(vacc, 8);
       vout = math_max_s32(vout, ${OUTPUT_MIN});
       vout = math_min_s32(vout, ${OUTPUT_MAX});
       *y++ = (${XINT8_T}) vout;
@@ -45,7 +45,7 @@ void xnn_${DATATYPE.lower()}_vcvt_ukernel__scalar_x${BATCH_TILE}(
         vacc${ABC[N]} = vbias + vacc${ABC[N]} * vmultiplier;
 
       $for N in range(BATCH_TILE):
-        int32_t vout${ABC[N]} = asr_s32(vacc${ABC[N]}, 8);
+        int32_t vout${ABC[N]} = math_asr_s32(vacc${ABC[N]}, 8);
 
       $for N in range(BATCH_TILE):
         vout${ABC[N]} = math_max_s32(vout${ABC[N]}, ${OUTPUT_MIN});
@@ -62,7 +62,7 @@ void xnn_${DATATYPE.lower()}_vcvt_ukernel__scalar_x${BATCH_TILE}(
         int32_t vacc = *x;
         vacc = vbias + vacc * vmultiplier;
 
-        int32_t vout = asr_s32(vacc, 8);
+        int32_t vout = math_asr_s32(vacc, 8);
         vout = math_max_s32(vout, ${OUTPUT_MIN});
         vout = math_min_s32(vout, ${OUTPUT_MAX});
         *y = (${XINT8_T}) vout;
@@ -71,7 +71,7 @@ void xnn_${DATATYPE.lower()}_vcvt_ukernel__scalar_x${BATCH_TILE}(
           int32_t vacc = *x++;
           vacc = vbias + vacc * vmultiplier;
 
-          int32_t vout = asr_s32(vacc, 8);
+          int32_t vout = math_asr_s32(vacc, 8);
           vout = math_max_s32(vout, ${OUTPUT_MIN});
           vout = math_min_s32(vout, ${OUTPUT_MAX});
           *y++ = (${XINT8_T}) vout;
diff --git a/src/qs8-vlrelu/armv6simd.c.in b/src/qs8-vlrelu/armv6simd.c.in
index d9f3d07ac..302e0c3fc 100644
--- a/src/qs8-vlrelu/armv6simd.c.in
+++ b/src/qs8-vlrelu/armv6simd.c.in
@@ -56,7 +56,7 @@ void xnn_${DATATYPE.lower()}_vlrelu_ukernel__armv6simd_x${BATCH_TILE}(
         int32_t vacc${ABC[N+3]} = __smlatt(vmultiplier${ABC[N+1]}${ABC[N+3]}, vx${ABC[N+1]}${ABC[N+3]}, vbias);
 
       $for N in range(BATCH_TILE):
-        vacc${ABC[N]} = ${__XSAT}(asr_s32(vacc${ABC[N]}, 8), 8);
+        vacc${ABC[N]} = ${__XSAT}(math_asr_s32(vacc${ABC[N]}, 8), 8);
 
       $for N in range(BATCH_TILE):
         y[${N}] = (${XINT8_T}) vacc${ABC[N]};
@@ -79,10 +79,10 @@ void xnn_${DATATYPE.lower()}_vlrelu_ukernel__armv6simd_x${BATCH_TILE}(
     int32_t vacc2 = __smlatt(vmultiplier02, vx02, vbias);
     int32_t vacc3 = __smlatt(vmultiplier13, vx13, vbias);
 
-    vacc0 = ${__XSAT}(asr_s32(vacc0, 8), 8);
-    vacc1 = ${__XSAT}(asr_s32(vacc1, 8), 8);
-    vacc2 = ${__XSAT}(asr_s32(vacc2, 8), 8);
-    vacc3 = ${__XSAT}(asr_s32(vacc3, 8), 8);
+    vacc0 = ${__XSAT}(math_asr_s32(vacc0, 8), 8);
+    vacc1 = ${__XSAT}(math_asr_s32(vacc1, 8), 8);
+    vacc2 = ${__XSAT}(math_asr_s32(vacc2, 8), 8);
+    vacc3 = ${__XSAT}(math_asr_s32(vacc3, 8), 8);
 
     y[0] = (${XINT8_T}) vacc0;
     y[1] = (${XINT8_T}) vacc1;
@@ -105,13 +105,13 @@ void xnn_${DATATYPE.lower()}_vlrelu_ukernel__armv6simd_x${BATCH_TILE}(
     int32_t vacc1 = __smlabb(vmultiplier13, vx13, vbias);
     const int32_t vacc2 = __smlatt(vmultiplier02, vx02, vbias);
 
-    vacc0 = ${__XSAT}(asr_s32(vacc0, 8), 8);
-    vacc1 = ${__XSAT}(asr_s32(vacc1, 8), 8);
+    vacc0 = ${__XSAT}(math_asr_s32(vacc0, 8), 8);
+    vacc1 = ${__XSAT}(math_asr_s32(vacc1, 8), 8);
 
     if (n & (2 * sizeof(${XINT8_T}))) {
       y[0] = (${XINT8_T}) vacc0;
       y[1] = (${XINT8_T}) vacc1;
-      vacc0 = ${__XSAT}(asr_s32(vacc2, 8), 8);
+      vacc0 = ${__XSAT}(math_asr_s32(vacc2, 8), 8);
       y += 2;
     }
     if (n & (1 * sizeof(${XINT8_T}))) {
diff --git a/src/qs8-vlrelu/gen/vlrelu-armv6simd-x4.c b/src/qs8-vlrelu/gen/vlrelu-armv6simd-x4.c
index d99ad2e45..aa298b9b7 100644
--- a/src/qs8-vlrelu/gen/vlrelu-armv6simd-x4.c
+++ b/src/qs8-vlrelu/gen/vlrelu-armv6simd-x4.c
@@ -44,10 +44,10 @@ void xnn_qs8_vlrelu_ukernel__armv6simd_x4(
     int32_t vacc2 = __smlatt(vmultiplier02, vx02, vbias);
     int32_t vacc3 = __smlatt(vmultiplier13, vx13, vbias);
 
-    vacc0 = __ssat(asr_s32(vacc0, 8), 8);
-    vacc1 = __ssat(asr_s32(vacc1, 8), 8);
-    vacc2 = __ssat(asr_s32(vacc2, 8), 8);
-    vacc3 = __ssat(asr_s32(vacc3, 8), 8);
+    vacc0 = __ssat(math_asr_s32(vacc0, 8), 8);
+    vacc1 = __ssat(math_asr_s32(vacc1, 8), 8);
+    vacc2 = __ssat(math_asr_s32(vacc2, 8), 8);
+    vacc3 = __ssat(math_asr_s32(vacc3, 8), 8);
 
     y[0] = (int8_t) vacc0;
     y[1] = (int8_t) vacc1;
@@ -70,13 +70,13 @@ void xnn_qs8_vlrelu_ukernel__armv6simd_x4(
     int32_t vacc1 = __smlabb(vmultiplier13, vx13, vbias);
     const int32_t vacc2 = __smlatt(vmultiplier02, vx02, vbias);
 
-    vacc0 = __ssat(asr_s32(vacc0, 8), 8);
-    vacc1 = __ssat(asr_s32(vacc1, 8), 8);
+    vacc0 = __ssat(math_asr_s32(vacc0, 8), 8);
+    vacc1 = __ssat(math_asr_s32(vacc1, 8), 8);
 
     if (n & (2 * sizeof(int8_t))) {
       y[0] = (int8_t) vacc0;
       y[1] = (int8_t) vacc1;
-      vacc0 = __ssat(asr_s32(vacc2, 8), 8);
+      vacc0 = __ssat(math_asr_s32(vacc2, 8), 8);
       y += 2;
     }
     if (n & (1 * sizeof(int8_t))) {
diff --git a/src/qs8-vlrelu/gen/vlrelu-armv6simd-x8.c b/src/qs8-vlrelu/gen/vlrelu-armv6simd-x8.c
index 1d4e233ac..053e92ace 100644
--- a/src/qs8-vlrelu/gen/vlrelu-armv6simd-x8.c
+++ b/src/qs8-vlrelu/gen/vlrelu-armv6simd-x8.c
@@ -55,14 +55,14 @@ void xnn_qs8_vlrelu_ukernel__armv6simd_x8(
     int32_t vacc6 = __smlatt(vmultiplier46, vx46, vbias);
     int32_t vacc7 = __smlatt(vmultiplier57, vx57, vbias);
 
-    vacc0 = __ssat(asr_s32(vacc0, 8), 8);
-    vacc1 = __ssat(asr_s32(vacc1, 8), 8);
-    vacc2 = __ssat(asr_s32(vacc2, 8), 8);
-    vacc3 = __ssat(asr_s32(vacc3, 8), 8);
-    vacc4 = __ssat(asr_s32(vacc4, 8), 8);
-    vacc5 = __ssat(asr_s32(vacc5, 8), 8);
-    vacc6 = __ssat(asr_s32(vacc6, 8), 8);
-    vacc7 = __ssat(asr_s32(vacc7, 8), 8);
+    vacc0 = __ssat(math_asr_s32(vacc0, 8), 8);
+    vacc1 = __ssat(math_asr_s32(vacc1, 8), 8);
+    vacc2 = __ssat(math_asr_s32(vacc2, 8), 8);
+    vacc3 = __ssat(math_asr_s32(vacc3, 8), 8);
+    vacc4 = __ssat(math_asr_s32(vacc4, 8), 8);
+    vacc5 = __ssat(math_asr_s32(vacc5, 8), 8);
+    vacc6 = __ssat(math_asr_s32(vacc6, 8), 8);
+    vacc7 = __ssat(math_asr_s32(vacc7, 8), 8);
 
     y[0] = (int8_t) vacc0;
     y[1] = (int8_t) vacc1;
@@ -91,10 +91,10 @@ void xnn_qs8_vlrelu_ukernel__armv6simd_x8(
     int32_t vacc2 = __smlatt(vmultiplier02, vx02, vbias);
     int32_t vacc3 = __smlatt(vmultiplier13, vx13, vbias);
 
-    vacc0 = __ssat(asr_s32(vacc0, 8), 8);
-    vacc1 = __ssat(asr_s32(vacc1, 8), 8);
-    vacc2 = __ssat(asr_s32(vacc2, 8), 8);
-    vacc3 = __ssat(asr_s32(vacc3, 8), 8);
+    vacc0 = __ssat(math_asr_s32(vacc0, 8), 8);
+    vacc1 = __ssat(math_asr_s32(vacc1, 8), 8);
+    vacc2 = __ssat(math_asr_s32(vacc2, 8), 8);
+    vacc3 = __ssat(math_asr_s32(vacc3, 8), 8);
 
     y[0] = (int8_t) vacc0;
     y[1] = (int8_t) vacc1;
@@ -117,13 +117,13 @@ void xnn_qs8_vlrelu_ukernel__armv6simd_x8(
     int32_t vacc1 = __smlabb(vmultiplier13, vx13, vbias);
     const int32_t vacc2 = __smlatt(vmultiplier02, vx02, vbias);
 
-    vacc0 = __ssat(asr_s32(vacc0, 8), 8);
-    vacc1 = __ssat(asr_s32(vacc1, 8), 8);
+    vacc0 = __ssat(math_asr_s32(vacc0, 8), 8);
+    vacc1 = __ssat(math_asr_s32(vacc1, 8), 8);
 
     if (n & (2 * sizeof(int8_t))) {
       y[0] = (int8_t) vacc0;
       y[1] = (int8_t) vacc1;
-      vacc0 = __ssat(asr_s32(vacc2, 8), 8);
+      vacc0 = __ssat(math_asr_s32(vacc2, 8), 8);
       y += 2;
     }
     if (n & (1 * sizeof(int8_t))) {
diff --git a/src/qs8-vlrelu/gen/vlrelu-scalar-andxor-x1.c b/src/qs8-vlrelu/gen/vlrelu-scalar-andxor-x1.c
index e5d8fd059..bd790691a 100644
--- a/src/qs8-vlrelu/gen/vlrelu-scalar-andxor-x1.c
+++ b/src/qs8-vlrelu/gen/vlrelu-scalar-andxor-x1.c
@@ -25,10 +25,10 @@ void xnn_qs8_vlrelu_ukernel__scalar_andxor_x1(
   const int32_t vbias = params->scalar_andxor.bias;
   do {
     int32_t vacc = (int32_t) *x++ - vinput_zero_point;
-    const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & asr_s32(vacc, 31));
+    const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & math_asr_s32(vacc, 31));
     vacc = vbias + vacc * vmultiplier;
 
-    int32_t vout = asr_s32(vacc, 8);
+    int32_t vout = math_asr_s32(vacc, 8);
     vout = math_max_s32(vout, -128);
     vout = math_min_s32(vout, 127);
     *y++ = (int8_t) vout;
diff --git a/src/qs8-vlrelu/gen/vlrelu-scalar-andxor-x2.c b/src/qs8-vlrelu/gen/vlrelu-scalar-andxor-x2.c
index 5d52c2724..7e1f97862 100644
--- a/src/qs8-vlrelu/gen/vlrelu-scalar-andxor-x2.c
+++ b/src/qs8-vlrelu/gen/vlrelu-scalar-andxor-x2.c
@@ -31,8 +31,8 @@ void xnn_qs8_vlrelu_ukernel__scalar_andxor_x2(
     vacc0 -= vinput_zero_point;
     vacc1 -= vinput_zero_point;
 
-    int32_t vmultiplier0 = asr_s32(vacc0, 31);
-    int32_t vmultiplier1 = asr_s32(vacc1, 31);
+    int32_t vmultiplier0 = math_asr_s32(vacc0, 31);
+    int32_t vmultiplier1 = math_asr_s32(vacc1, 31);
 
     vmultiplier0 &= vmultiplier_diff;
     vmultiplier1 &= vmultiplier_diff;
@@ -43,8 +43,8 @@ void xnn_qs8_vlrelu_ukernel__scalar_andxor_x2(
     vacc0 = vbias + vacc0 * vmultiplier0;
     vacc1 = vbias + vacc1 * vmultiplier1;
 
-    int32_t vout0 = asr_s32(vacc0, 8);
-    int32_t vout1 = asr_s32(vacc1, 8);
+    int32_t vout0 = math_asr_s32(vacc0, 8);
+    int32_t vout1 = math_asr_s32(vacc1, 8);
 
     vout0 = math_max_s32(vout0, -128);
     vout1 = math_max_s32(vout1, -128);
@@ -58,10 +58,10 @@ void xnn_qs8_vlrelu_ukernel__scalar_andxor_x2(
   }
   if XNN_UNLIKELY(n != 0) {
     int32_t vacc = (int32_t) *x++ - vinput_zero_point;
-    const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & asr_s32(vacc, 31));
+    const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & math_asr_s32(vacc, 31));
     vacc = vbias + vacc * vmultiplier;
 
-    int32_t vout = asr_s32(vacc, 8);
+    int32_t vout = math_asr_s32(vacc, 8);
     vout = math_max_s32(vout, -128);
     vout = math_min_s32(vout, 127);
     *y = (int8_t) vout;
diff --git a/src/qs8-vlrelu/gen/vlrelu-scalar-andxor-x4.c b/src/qs8-vlrelu/gen/vlrelu-scalar-andxor-x4.c
index 00043810f..159482303 100644
--- a/src/qs8-vlrelu/gen/vlrelu-scalar-andxor-x4.c
+++ b/src/qs8-vlrelu/gen/vlrelu-scalar-andxor-x4.c
@@ -35,10 +35,10 @@ void xnn_qs8_vlrelu_ukernel__scalar_andxor_x4(
     vacc2 -= vinput_zero_point;
     vacc3 -= vinput_zero_point;
 
-    int32_t vmultiplier0 = asr_s32(vacc0, 31);
-    int32_t vmultiplier1 = asr_s32(vacc1, 31);
-    int32_t vmultiplier2 = asr_s32(vacc2, 31);
-    int32_t vmultiplier3 = asr_s32(vacc3, 31);
+    int32_t vmultiplier0 = math_asr_s32(vacc0, 31);
+    int32_t vmultiplier1 = math_asr_s32(vacc1, 31);
+    int32_t vmultiplier2 = math_asr_s32(vacc2, 31);
+    int32_t vmultiplier3 = math_asr_s32(vacc3, 31);
 
     vmultiplier0 &= vmultiplier_diff;
     vmultiplier1 &= vmultiplier_diff;
@@ -55,10 +55,10 @@ void xnn_qs8_vlrelu_ukernel__scalar_andxor_x4(
     vacc2 = vbias + vacc2 * vmultiplier2;
     vacc3 = vbias + vacc3 * vmultiplier3;
 
-    int32_t vout0 = asr_s32(vacc0, 8);
-    int32_t vout1 = asr_s32(vacc1, 8);
-    int32_t vout2 = asr_s32(vacc2, 8);
-    int32_t vout3 = asr_s32(vacc3, 8);
+    int32_t vout0 = math_asr_s32(vacc0, 8);
+    int32_t vout1 = math_asr_s32(vacc1, 8);
+    int32_t vout2 = math_asr_s32(vacc2, 8);
+    int32_t vout3 = math_asr_s32(vacc3, 8);
 
     vout0 = math_max_s32(vout0, -128);
     vout1 = math_max_s32(vout1, -128);
@@ -79,10 +79,10 @@ void xnn_qs8_vlrelu_ukernel__scalar_andxor_x4(
   if XNN_UNLIKELY(n != 0) {
     do {
       int32_t vacc = (int32_t) *x++ - vinput_zero_point;
-      const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & asr_s32(vacc, 31));
+      const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & math_asr_s32(vacc, 31));
       vacc = vbias + vacc * vmultiplier;
 
-      int32_t vout = asr_s32(vacc, 8);
+      int32_t vout = math_asr_s32(vacc, 8);
       vout = math_max_s32(vout, -128);
       vout = math_min_s32(vout, 127);
       *y++ = (int8_t) vout;
diff --git a/src/qs8-vlrelu/gen/vlrelu-scalar-select-x1.c b/src/qs8-vlrelu/gen/vlrelu-scalar-select-x1.c
index 59f41c68e..ed0ad7235 100644
--- a/src/qs8-vlrelu/gen/vlrelu-scalar-select-x1.c
+++ b/src/qs8-vlrelu/gen/vlrelu-scalar-select-x1.c
@@ -28,7 +28,7 @@ void xnn_qs8_vlrelu_ukernel__scalar_select_x1(
     const int32_t vmultiplier = XNN_UNPREDICTABLE(vacc >= 0) ? vpositive_multiplier : vnegative_multiplier;
     vacc = vbias + vacc * vmultiplier;
 
-    int32_t vout = asr_s32(vacc, 8);
+    int32_t vout = math_asr_s32(vacc, 8);
     vout = math_max_s32(vout, -128);
     vout = math_min_s32(vout, 127);
     *y++ = (int8_t) vout;
diff --git a/src/qs8-vlrelu/gen/vlrelu-scalar-select-x2.c b/src/qs8-vlrelu/gen/vlrelu-scalar-select-x2.c
index ffac12f0a..9c9925de0 100644
--- a/src/qs8-vlrelu/gen/vlrelu-scalar-select-x2.c
+++ b/src/qs8-vlrelu/gen/vlrelu-scalar-select-x2.c
@@ -37,8 +37,8 @@ void xnn_qs8_vlrelu_ukernel__scalar_select_x2(
     vacc0 = vbias + vacc0 * vmultiplier0;
     vacc1 = vbias + vacc1 * vmultiplier1;
 
-    int32_t vout0 = asr_s32(vacc0, 8);
-    int32_t vout1 = asr_s32(vacc1, 8);
+    int32_t vout0 = math_asr_s32(vacc0, 8);
+    int32_t vout1 = math_asr_s32(vacc1, 8);
 
     vout0 = math_max_s32(vout0, -128);
     vout1 = math_max_s32(vout1, -128);
@@ -55,7 +55,7 @@ void xnn_qs8_vlrelu_ukernel__scalar_select_x2(
     const int32_t vmultiplier = XNN_UNPREDICTABLE(vacc >= 0) ? vpositive_multiplier : vnegative_multiplier;
     vacc = vbias + vacc * vmultiplier;
 
-    int32_t vout = asr_s32(vacc, 8);
+    int32_t vout = math_asr_s32(vacc, 8);
     vout = math_max_s32(vout, -128);
     vout = math_min_s32(vout, 127);
     *y = (int8_t) vout;
diff --git a/src/qs8-vlrelu/gen/vlrelu-scalar-select-x4.c b/src/qs8-vlrelu/gen/vlrelu-scalar-select-x4.c
index 3e9d93412..480febd29 100644
--- a/src/qs8-vlrelu/gen/vlrelu-scalar-select-x4.c
+++ b/src/qs8-vlrelu/gen/vlrelu-scalar-select-x4.c
@@ -45,10 +45,10 @@ void xnn_qs8_vlrelu_ukernel__scalar_select_x4(
     vacc2 = vbias + vacc2 * vmultiplier2;
     vacc3 = vbias + vacc3 * vmultiplier3;
 
-    int32_t vout0 = asr_s32(vacc0, 8);
-    int32_t vout1 = asr_s32(vacc1, 8);
-    int32_t vout2 = asr_s32(vacc2, 8);
-    int32_t vout3 = asr_s32(vacc3, 8);
+    int32_t vout0 = math_asr_s32(vacc0, 8);
+    int32_t vout1 = math_asr_s32(vacc1, 8);
+    int32_t vout2 = math_asr_s32(vacc2, 8);
+    int32_t vout3 = math_asr_s32(vacc3, 8);
 
     vout0 = math_max_s32(vout0, -128);
     vout1 = math_max_s32(vout1, -128);
@@ -72,7 +72,7 @@ void xnn_qs8_vlrelu_ukernel__scalar_select_x4(
       const int32_t vmultiplier = XNN_UNPREDICTABLE(vacc >= 0) ? vpositive_multiplier : vnegative_multiplier;
       vacc = vbias + vacc * vmultiplier;
 
-      int32_t vout = asr_s32(vacc, 8);
+      int32_t vout = math_asr_s32(vacc, 8);
       vout = math_max_s32(vout, -128);
       vout = math_min_s32(vout, 127);
       *y++ = (int8_t) vout;
diff --git a/src/qs8-vlrelu/scalar-andxor.c.in b/src/qs8-vlrelu/scalar-andxor.c.in
index 36b396d86..54083d7b6 100644
--- a/src/qs8-vlrelu/scalar-andxor.c.in
+++ b/src/qs8-vlrelu/scalar-andxor.c.in
@@ -28,10 +28,10 @@ void xnn_${DATATYPE.lower()}_vlrelu_ukernel__scalar_andxor_x${BATCH_TILE}(
   $if BATCH_TILE == 1:
     do {
       int32_t vacc = (int32_t) *x++ - vinput_zero_point;
-      const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & asr_s32(vacc, 31));
+      const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & math_asr_s32(vacc, 31));
       vacc = vbias + vacc * vmultiplier;
 
-      int32_t vout = asr_s32(vacc, 8);
+      int32_t vout = math_asr_s32(vacc, 8);
       vout = math_max_s32(vout, ${OUTPUT_MIN});
       vout = math_min_s32(vout, ${OUTPUT_MAX});
       *y++ = (${XINT8_T}) vout;
@@ -48,7 +48,7 @@ void xnn_${DATATYPE.lower()}_vlrelu_ukernel__scalar_andxor_x${BATCH_TILE}(
         vacc${ABC[N]} -= vinput_zero_point;
 
       $for N in range(BATCH_TILE):
-        int32_t vmultiplier${ABC[N]} = asr_s32(vacc${ABC[N]}, 31);
+        int32_t vmultiplier${ABC[N]} = math_asr_s32(vacc${ABC[N]}, 31);
 
       $for N in range(BATCH_TILE):
         vmultiplier${ABC[N]} &= vmultiplier_diff;
@@ -60,7 +60,7 @@ void xnn_${DATATYPE.lower()}_vlrelu_ukernel__scalar_andxor_x${BATCH_TILE}(
         vacc${ABC[N]} = vbias + vacc${ABC[N]} * vmultiplier${ABC[N]};
 
       $for N in range(BATCH_TILE):
-        int32_t vout${ABC[N]} = asr_s32(vacc${ABC[N]}, 8);
+        int32_t vout${ABC[N]} = math_asr_s32(vacc${ABC[N]}, 8);
 
       $for N in range(BATCH_TILE):
         vout${ABC[N]} = math_max_s32(vout${ABC[N]}, ${OUTPUT_MIN});
@@ -75,20 +75,20 @@ void xnn_${DATATYPE.lower()}_vlrelu_ukernel__scalar_andxor_x${BATCH_TILE}(
     if XNN_UNLIKELY(n != 0) {
       $if BATCH_TILE == 2:
         int32_t vacc = (int32_t) *x++ - vinput_zero_point;
-        const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & asr_s32(vacc, 31));
+        const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & math_asr_s32(vacc, 31));
         vacc = vbias + vacc * vmultiplier;
 
-        int32_t vout = asr_s32(vacc, 8);
+        int32_t vout = math_asr_s32(vacc, 8);
         vout = math_max_s32(vout, ${OUTPUT_MIN});
         vout = math_min_s32(vout, ${OUTPUT_MAX});
         *y = (${XINT8_T}) vout;
       $else:
         do {
           int32_t vacc = (int32_t) *x++ - vinput_zero_point;
-          const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & asr_s32(vacc, 31));
+          const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & math_asr_s32(vacc, 31));
           vacc = vbias + vacc * vmultiplier;
 
-          int32_t vout = asr_s32(vacc, 8);
+          int32_t vout = math_asr_s32(vacc, 8);
           vout = math_max_s32(vout, ${OUTPUT_MIN});
           vout = math_min_s32(vout, ${OUTPUT_MAX});
           *y++ = (${XINT8_T}) vout;
diff --git a/src/qs8-vlrelu/scalar-select.c.in b/src/qs8-vlrelu/scalar-select.c.in
index 83723566f..44dc0ada6 100644
--- a/src/qs8-vlrelu/scalar-select.c.in
+++ b/src/qs8-vlrelu/scalar-select.c.in
@@ -31,7 +31,7 @@ void xnn_${DATATYPE.lower()}_vlrelu_ukernel__scalar_select_x${BATCH_TILE}(
       const int32_t vmultiplier = XNN_UNPREDICTABLE(vacc >= 0) ? vpositive_multiplier : vnegative_multiplier;
       vacc = vbias + vacc * vmultiplier;
 
-      int32_t vout = asr_s32(vacc, 8);
+      int32_t vout = math_asr_s32(vacc, 8);
       vout = math_max_s32(vout, ${OUTPUT_MIN});
       vout = math_min_s32(vout, ${OUTPUT_MAX});
       *y++ = (${XINT8_T}) vout;
@@ -54,7 +54,7 @@ void xnn_${DATATYPE.lower()}_vlrelu_ukernel__scalar_select_x${BATCH_TILE}(
         vacc${ABC[N]} = vbias + vacc${ABC[N]} * vmultiplier${ABC[N]};
 
       $for N in range(BATCH_TILE):
-        int32_t vout${ABC[N]} = asr_s32(vacc${ABC[N]}, 8);
+        int32_t vout${ABC[N]} = math_asr_s32(vacc${ABC[N]}, 8);
 
       $for N in range(BATCH_TILE):
         vout${ABC[N]} = math_max_s32(vout${ABC[N]}, ${OUTPUT_MIN});
@@ -72,7 +72,7 @@ void xnn_${DATATYPE.lower()}_vlrelu_ukernel__scalar_select_x${BATCH_TILE}(
         const int32_t vmultiplier = XNN_UNPREDICTABLE(vacc >= 0) ? vpositive_multiplier : vnegative_multiplier;
         vacc = vbias + vacc * vmultiplier;
 
-        int32_t vout = asr_s32(vacc, 8);
+        int32_t vout = math_asr_s32(vacc, 8);
         vout = math_max_s32(vout, ${OUTPUT_MIN});
         vout = math_min_s32(vout, ${OUTPUT_MAX});
         *y = (${XINT8_T}) vout;
@@ -82,7 +82,7 @@ void xnn_${DATATYPE.lower()}_vlrelu_ukernel__scalar_select_x${BATCH_TILE}(
           const int32_t vmultiplier = XNN_UNPREDICTABLE(vacc >= 0) ? vpositive_multiplier : vnegative_multiplier;
           vacc = vbias + vacc * vmultiplier;
 
-          int32_t vout = asr_s32(vacc, 8);
+          int32_t vout = math_asr_s32(vacc, 8);
           vout = math_max_s32(vout, ${OUTPUT_MIN});
           vout = math_min_s32(vout, ${OUTPUT_MAX});
           *y++ = (${XINT8_T}) vout;
diff --git a/src/qu8-avgpool/9p8x-minmax-scalar-c1.c b/src/qu8-avgpool/9p8x-minmax-scalar-c1.c
index feeb859fc..dab8b0111 100644
--- a/src/qu8-avgpool/9p8x-minmax-scalar-c1.c
+++ b/src/qu8-avgpool/9p8x-minmax-scalar-c1.c
@@ -272,7 +272,7 @@ void xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1(
 
         const int64_t vproduct = (int64_t) vacc * (int64_t) vmultiplier;
         const int64_t vadjusted_product = vproduct - (int64_t) (vacc < 0);
-        int32_t vout = (int32_t) asr_s64(vadjusted_product + vrounding, vshift);
+        int32_t vout = (int32_t) math_asr_s64(vadjusted_product + vrounding, vshift);
         vout = vout < voutput_min ? voutput_min : vout;
         vout = vout > voutput_max ? voutput_max : vout;
         vout += voutput_zero_point;
diff --git a/src/qu8-avgpool/9x-minmax-scalar-c1.c b/src/qu8-avgpool/9x-minmax-scalar-c1.c
index ca66f5cad..127e57226 100644
--- a/src/qu8-avgpool/9x-minmax-scalar-c1.c
+++ b/src/qu8-avgpool/9x-minmax-scalar-c1.c
@@ -133,7 +133,7 @@ void xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1(
 
       const int64_t vproduct = (int64_t) vacc * (int64_t) vmultiplier;
       const int64_t vadjusted_product = vproduct - (int64_t) (vacc < 0);
-      int32_t vout = (int32_t) asr_s64(vadjusted_product + vrounding, vshift);
+      int32_t vout = (int32_t) math_asr_s64(vadjusted_product + vrounding, vshift);
       vout = vout < voutput_min ? voutput_min : vout;
       vout = vout > voutput_max ? voutput_max : vout;
       vout += voutput_zero_point;
diff --git a/src/qu8-requantization/gemmlowp-scalar.c b/src/qu8-requantization/gemmlowp-scalar.c
index 8d6ea0967..ecbbe2647 100644
--- a/src/qu8-requantization/gemmlowp-scalar.c
+++ b/src/qu8-requantization/gemmlowp-scalar.c
@@ -100,10 +100,10 @@ void xnn_qu8_requantize_gemmlowp__scalar(
     const int32_t z_remainder = (z_q31product & remainder_mask) - (int32_t) (z_q31product < 0);
     const int32_t w_remainder = (w_q31product & remainder_mask) - (int32_t) (w_q31product < 0);
 
-    const int32_t x_scaled = asr_s32(x_q31product, shift) + (int32_t) (x_remainder > threshold);
-    const int32_t y_scaled = asr_s32(y_q31product, shift) + (int32_t) (y_remainder > threshold);
-    const int32_t z_scaled = asr_s32(z_q31product, shift) + (int32_t) (z_remainder > threshold);
-    const int32_t w_scaled = asr_s32(w_q31product, shift) + (int32_t) (w_remainder > threshold);
+    const int32_t x_scaled = math_asr_s32(x_q31product, shift) + (int32_t) (x_remainder > threshold);
+    const int32_t y_scaled = math_asr_s32(y_q31product, shift) + (int32_t) (y_remainder > threshold);
+    const int32_t z_scaled = math_asr_s32(z_q31product, shift) + (int32_t) (z_remainder > threshold);
+    const int32_t w_scaled = math_asr_s32(w_q31product, shift) + (int32_t) (w_remainder > threshold);
 
     // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point).
     const int32_t x_clamped = math_min_s32(math_max_s32(x_scaled, smin), smax);
diff --git a/src/qu8-requantization/rndna-scalar-signed64.c b/src/qu8-requantization/rndna-scalar-signed64.c
index 904cf90df..e70c1f2eb 100644
--- a/src/qu8-requantization/rndna-scalar-signed64.c
+++ b/src/qu8-requantization/rndna-scalar-signed64.c
@@ -65,10 +65,10 @@ void xnn_qu8_requantize_rndna__scalar_signed64(
     // Note that although rounding is precomputed, it is dependent on shift value, and on processors with 64-bit
     // "right shift with rounding" instruction each line below can be represented by just one such instruction
     // (e.g. VRSHL.S64 on ARM NEON, SRSHL in ARM64 Advanced SIMD).
-    const int32_t x_scaled = (int32_t) asr_s64(x_adjusted_product + rounding, shift);
-    const int32_t y_scaled = (int32_t) asr_s64(y_adjusted_product + rounding, shift);
-    const int32_t z_scaled = (int32_t) asr_s64(z_adjusted_product + rounding, shift);
-    const int32_t w_scaled = (int32_t) asr_s64(w_adjusted_product + rounding, shift);
+    const int32_t x_scaled = (int32_t) math_asr_s64(x_adjusted_product + rounding, shift);
+    const int32_t y_scaled = (int32_t) math_asr_s64(y_adjusted_product + rounding, shift);
+    const int32_t z_scaled = (int32_t) math_asr_s64(z_adjusted_product + rounding, shift);
+    const int32_t w_scaled = (int32_t) math_asr_s64(w_adjusted_product + rounding, shift);
 
     // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point).
     const int32_t x_clamped = math_min_s32(math_max_s32(x_scaled, smin), smax);
diff --git a/src/qu8-vadd/gen/minmax-scalar-x1.c b/src/qu8-vadd/gen/minmax-scalar-x1.c
index 79e3ee526..3fb515adf 100644
--- a/src/qu8-vadd/gen/minmax-scalar-x1.c
+++ b/src/qu8-vadd/gen/minmax-scalar-x1.c
@@ -33,7 +33,7 @@ void xnn_qu8_vadd_minmax_ukernel__scalar_x1(
     const int32_t vb = *input_b++;
     const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier;
 
-    int32_t vout = asr_s32(vacc, vshift);
+    int32_t vout = math_asr_s32(vacc, vshift);
     vout = math_max_s32(vout, voutput_min_less_zero_point);
     vout = math_min_s32(vout, voutput_max_less_zero_point);
     *output++ = (uint8_t) (vout + voutput_zero_point);
diff --git a/src/qu8-vadd/gen/minmax-scalar-x2.c b/src/qu8-vadd/gen/minmax-scalar-x2.c
index 05203dcb7..f65227db2 100644
--- a/src/qu8-vadd/gen/minmax-scalar-x2.c
+++ b/src/qu8-vadd/gen/minmax-scalar-x2.c
@@ -42,8 +42,8 @@ void xnn_qu8_vadd_minmax_ukernel__scalar_x2(
     vacc0 += vb0 * vb_multiplier;
     vacc1 += vb1 * vb_multiplier;
 
-    int32_t vout0 = asr_s32(vacc0, vshift);
-    int32_t vout1 = asr_s32(vacc1, vshift);
+    int32_t vout0 = math_asr_s32(vacc0, vshift);
+    int32_t vout1 = math_asr_s32(vacc1, vshift);
 
     vout0 = math_max_s32(vout0, voutput_min_less_zero_point);
     vout1 = math_max_s32(vout1, voutput_min_less_zero_point);
@@ -63,7 +63,7 @@ void xnn_qu8_vadd_minmax_ukernel__scalar_x2(
     const int32_t vb = *input_b;
     const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier;
 
-    int32_t vout = asr_s32(vacc, vshift);
+    int32_t vout = math_asr_s32(vacc, vshift);
     vout = math_max_s32(vout, voutput_min_less_zero_point);
     vout = math_min_s32(vout, voutput_max_less_zero_point);
     *output++ = (uint8_t) (vout + voutput_zero_point);
diff --git a/src/qu8-vadd/gen/minmax-scalar-x4.c b/src/qu8-vadd/gen/minmax-scalar-x4.c
index f433225f0..95e4148c2 100644
--- a/src/qu8-vadd/gen/minmax-scalar-x4.c
+++ b/src/qu8-vadd/gen/minmax-scalar-x4.c
@@ -50,10 +50,10 @@ void xnn_qu8_vadd_minmax_ukernel__scalar_x4(
     vacc2 += vb2 * vb_multiplier;
     vacc3 += vb3 * vb_multiplier;
 
-    int32_t vout0 = asr_s32(vacc0, vshift);
-    int32_t vout1 = asr_s32(vacc1, vshift);
-    int32_t vout2 = asr_s32(vacc2, vshift);
-    int32_t vout3 = asr_s32(vacc3, vshift);
+    int32_t vout0 = math_asr_s32(vacc0, vshift);
+    int32_t vout1 = math_asr_s32(vacc1, vshift);
+    int32_t vout2 = math_asr_s32(vacc2, vshift);
+    int32_t vout3 = math_asr_s32(vacc3, vshift);
 
     vout0 = math_max_s32(vout0, voutput_min_less_zero_point);
     vout1 = math_max_s32(vout1, voutput_min_less_zero_point);
@@ -82,7 +82,7 @@ void xnn_qu8_vadd_minmax_ukernel__scalar_x4(
       const int32_t vb = *input_b++;
       const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier;
 
-      int32_t vout = asr_s32(vacc, vshift);
+      int32_t vout = math_asr_s32(vacc, vshift);
       vout = math_max_s32(vout, voutput_min_less_zero_point);
       vout = math_min_s32(vout, voutput_max_less_zero_point);
       *output++ = (uint8_t) (vout + voutput_zero_point);
diff --git a/src/qu8-vaddc/gen/minmax-scalar-x1.c b/src/qu8-vaddc/gen/minmax-scalar-x1.c
index 096b2fc7f..22502c59c 100644
--- a/src/qu8-vaddc/gen/minmax-scalar-x1.c
+++ b/src/qu8-vaddc/gen/minmax-scalar-x1.c
@@ -31,7 +31,7 @@ void xnn_qu8_vaddc_minmax_ukernel__scalar_x1(
     const int32_t va = *input_a++;
     const int32_t vacc = vbias + va * va_multiplier;
 
-    int32_t vout = asr_s32(vacc, vshift);
+    int32_t vout = math_asr_s32(vacc, vshift);
     vout = math_max_s32(vout, voutput_min_less_zero_point);
     vout = math_min_s32(vout, voutput_max_less_zero_point);
     *output++ = (uint8_t) (vout + voutput_zero_point);
diff --git a/src/qu8-vaddc/gen/minmax-scalar-x2.c b/src/qu8-vaddc/gen/minmax-scalar-x2.c
index ff775bf14..015532884 100644
--- a/src/qu8-vaddc/gen/minmax-scalar-x2.c
+++ b/src/qu8-vaddc/gen/minmax-scalar-x2.c
@@ -36,8 +36,8 @@ void xnn_qu8_vaddc_minmax_ukernel__scalar_x2(
     const int32_t vacc1 = vbias + va1 * va_multiplier;
     input_b += 2;
 
-    int32_t vout0 = asr_s32(vacc0, vshift);
-    int32_t vout1 = asr_s32(vacc1, vshift);
+    int32_t vout0 = math_asr_s32(vacc0, vshift);
+    int32_t vout1 = math_asr_s32(vacc1, vshift);
 
     vout0 = math_max_s32(vout0, voutput_min_less_zero_point);
     vout1 = math_max_s32(vout1, voutput_min_less_zero_point);
@@ -56,7 +56,7 @@ void xnn_qu8_vaddc_minmax_ukernel__scalar_x2(
     const int32_t va = *input_a;
     const int32_t vacc = vbias + va * va_multiplier;
 
-    int32_t vout = asr_s32(vacc, vshift);
+    int32_t vout = math_asr_s32(vacc, vshift);
     vout = math_max_s32(vout, voutput_min_less_zero_point);
     vout = math_min_s32(vout, voutput_max_less_zero_point);
     *output++ = (uint8_t) (vout + voutput_zero_point);
diff --git a/src/qu8-vaddc/gen/minmax-scalar-x4.c b/src/qu8-vaddc/gen/minmax-scalar-x4.c
index d77b74f0c..fc6fce367 100644
--- a/src/qu8-vaddc/gen/minmax-scalar-x4.c
+++ b/src/qu8-vaddc/gen/minmax-scalar-x4.c
@@ -40,10 +40,10 @@ void xnn_qu8_vaddc_minmax_ukernel__scalar_x4(
     const int32_t vacc3 = vbias + va3 * va_multiplier;
     input_b += 4;
 
-    int32_t vout0 = asr_s32(vacc0, vshift);
-    int32_t vout1 = asr_s32(vacc1, vshift);
-    int32_t vout2 = asr_s32(vacc2, vshift);
-    int32_t vout3 = asr_s32(vacc3, vshift);
+    int32_t vout0 = math_asr_s32(vacc0, vshift);
+    int32_t vout1 = math_asr_s32(vacc1, vshift);
+    int32_t vout2 = math_asr_s32(vacc2, vshift);
+    int32_t vout3 = math_asr_s32(vacc3, vshift);
 
     vout0 = math_max_s32(vout0, voutput_min_less_zero_point);
     vout1 = math_max_s32(vout1, voutput_min_less_zero_point);
@@ -71,7 +71,7 @@ void xnn_qu8_vaddc_minmax_ukernel__scalar_x4(
       const int32_t va = *input_a++;
       const int32_t vacc = vbias + va * va_multiplier;
 
-      int32_t vout = asr_s32(vacc, vshift);
+      int32_t vout = math_asr_s32(vacc, vshift);
       vout = math_max_s32(vout, voutput_min_less_zero_point);
       vout = math_min_s32(vout, voutput_max_less_zero_point);
       *output++ = (uint8_t) (vout + voutput_zero_point);
diff --git a/src/qu8-vcvt/gen/vcvt-armv6simd-x4.c b/src/qu8-vcvt/gen/vcvt-armv6simd-x4.c
index cafbfd2e6..d78c12a62 100644
--- a/src/qu8-vcvt/gen/vcvt-armv6simd-x4.c
+++ b/src/qu8-vcvt/gen/vcvt-armv6simd-x4.c
@@ -38,10 +38,10 @@ void xnn_qu8_vcvt_ukernel__armv6simd_x4(
     int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias);
     int32_t vacc3 = __smlawt(vmultiplier, vx13, vbias);
 
-    vacc0 = __usat(asr_s32(vacc0, 1), 8);
-    vacc1 = __usat(asr_s32(vacc1, 1), 8);
-    vacc2 = __usat(asr_s32(vacc2, 1), 8);
-    vacc3 = __usat(asr_s32(vacc3, 1), 8);
+    vacc0 = __usat(math_asr_s32(vacc0, 1), 8);
+    vacc1 = __usat(math_asr_s32(vacc1, 1), 8);
+    vacc2 = __usat(math_asr_s32(vacc2, 1), 8);
+    vacc3 = __usat(math_asr_s32(vacc3, 1), 8);
 
     y[0] = (uint8_t) vacc0;
     y[1] = (uint8_t) vacc1;
@@ -59,13 +59,13 @@ void xnn_qu8_vcvt_ukernel__armv6simd_x4(
     int32_t vacc1 = __smlawb(vmultiplier, vx13, vbias);
     const int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias);
 
-    vacc0 = __usat(asr_s32(vacc0, 1), 8);
-    vacc1 = __usat(asr_s32(vacc1, 1), 8);
+    vacc0 = __usat(math_asr_s32(vacc0, 1), 8);
+    vacc1 = __usat(math_asr_s32(vacc1, 1), 8);
 
     if (n & (2 * sizeof(uint8_t))) {
       y[0] = (uint8_t) vacc0;
       y[1] = (uint8_t) vacc1;
-      vacc0 = __usat(asr_s32(vacc2, 1), 8);
+      vacc0 = __usat(math_asr_s32(vacc2, 1), 8);
       y += 2;
     }
     if (n & (1 * sizeof(uint8_t))) {
diff --git a/src/qu8-vcvt/gen/vcvt-armv6simd-x8.c b/src/qu8-vcvt/gen/vcvt-armv6simd-x8.c
index cb24c6167..bd57725f4 100644
--- a/src/qu8-vcvt/gen/vcvt-armv6simd-x8.c
+++ b/src/qu8-vcvt/gen/vcvt-armv6simd-x8.c
@@ -45,14 +45,14 @@ void xnn_qu8_vcvt_ukernel__armv6simd_x8(
     int32_t vacc6 = __smlawt(vmultiplier, vx46, vbias);
     int32_t vacc7 = __smlawt(vmultiplier, vx57, vbias);
 
-    vacc0 = __usat(asr_s32(vacc0, 1), 8);
-    vacc1 = __usat(asr_s32(vacc1, 1), 8);
-    vacc2 = __usat(asr_s32(vacc2, 1), 8);
-    vacc3 = __usat(asr_s32(vacc3, 1), 8);
-    vacc4 = __usat(asr_s32(vacc4, 1), 8);
-    vacc5 = __usat(asr_s32(vacc5, 1), 8);
-    vacc6 = __usat(asr_s32(vacc6, 1), 8);
-    vacc7 = __usat(asr_s32(vacc7, 1), 8);
+    vacc0 = __usat(math_asr_s32(vacc0, 1), 8);
+    vacc1 = __usat(math_asr_s32(vacc1, 1), 8);
+    vacc2 = __usat(math_asr_s32(vacc2, 1), 8);
+    vacc3 = __usat(math_asr_s32(vacc3, 1), 8);
+    vacc4 = __usat(math_asr_s32(vacc4, 1), 8);
+    vacc5 = __usat(math_asr_s32(vacc5, 1), 8);
+    vacc6 = __usat(math_asr_s32(vacc6, 1), 8);
+    vacc7 = __usat(math_asr_s32(vacc7, 1), 8);
 
     y[0] = (uint8_t) vacc0;
     y[1] = (uint8_t) vacc1;
@@ -76,10 +76,10 @@ void xnn_qu8_vcvt_ukernel__armv6simd_x8(
     int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias);
     int32_t vacc3 = __smlawt(vmultiplier, vx13, vbias);
 
-    vacc0 = __usat(asr_s32(vacc0, 1), 8);
-    vacc1 = __usat(asr_s32(vacc1, 1), 8);
-    vacc2 = __usat(asr_s32(vacc2, 1), 8);
-    vacc3 = __usat(asr_s32(vacc3, 1), 8);
+    vacc0 = __usat(math_asr_s32(vacc0, 1), 8);
+    vacc1 = __usat(math_asr_s32(vacc1, 1), 8);
+    vacc2 = __usat(math_asr_s32(vacc2, 1), 8);
+    vacc3 = __usat(math_asr_s32(vacc3, 1), 8);
 
     y[0] = (uint8_t) vacc0;
     y[1] = (uint8_t) vacc1;
@@ -97,13 +97,13 @@ void xnn_qu8_vcvt_ukernel__armv6simd_x8(
     int32_t vacc1 = __smlawb(vmultiplier, vx13, vbias);
     const int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias);
 
-    vacc0 = __usat(asr_s32(vacc0, 1), 8);
-    vacc1 = __usat(asr_s32(vacc1, 1), 8);
+    vacc0 = __usat(math_asr_s32(vacc0, 1), 8);
+    vacc1 = __usat(math_asr_s32(vacc1, 1), 8);
 
     if (n & (2 * sizeof(uint8_t))) {
       y[0] = (uint8_t) vacc0;
       y[1] = (uint8_t) vacc1;
-      vacc0 = __usat(asr_s32(vacc2, 1), 8);
+      vacc0 = __usat(math_asr_s32(vacc2, 1), 8);
       y += 2;
     }
     if (n & (1 * sizeof(uint8_t))) {
diff --git a/src/qu8-vcvt/gen/vcvt-scalar-x1.c b/src/qu8-vcvt/gen/vcvt-scalar-x1.c
index b4fc8fa63..9c99ce346 100644
--- a/src/qu8-vcvt/gen/vcvt-scalar-x1.c
+++ b/src/qu8-vcvt/gen/vcvt-scalar-x1.c
@@ -25,7 +25,7 @@ void xnn_qu8_vcvt_ukernel__scalar_x1(
     int32_t vacc = *x++;
     vacc = vbias + vacc * vmultiplier;
 
-    int32_t vout = asr_s32(vacc, 8);
+    int32_t vout = math_asr_s32(vacc, 8);
     vout = math_max_s32(vout, 0);
     vout = math_min_s32(vout, 255);
     *y++ = (uint8_t) vout;
diff --git a/src/qu8-vcvt/gen/vcvt-scalar-x2.c b/src/qu8-vcvt/gen/vcvt-scalar-x2.c
index 1ceacd9b2..f5399fbf1 100644
--- a/src/qu8-vcvt/gen/vcvt-scalar-x2.c
+++ b/src/qu8-vcvt/gen/vcvt-scalar-x2.c
@@ -29,8 +29,8 @@ void xnn_qu8_vcvt_ukernel__scalar_x2(
     vacc0 = vbias + vacc0 * vmultiplier;
     vacc1 = vbias + vacc1 * vmultiplier;
 
-    int32_t vout0 = asr_s32(vacc0, 8);
-    int32_t vout1 = asr_s32(vacc1, 8);
+    int32_t vout0 = math_asr_s32(vacc0, 8);
+    int32_t vout1 = math_asr_s32(vacc1, 8);
 
     vout0 = math_max_s32(vout0, 0);
     vout1 = math_max_s32(vout1, 0);
@@ -46,7 +46,7 @@ void xnn_qu8_vcvt_ukernel__scalar_x2(
     int32_t vacc = *x;
     vacc = vbias + vacc * vmultiplier;
 
-    int32_t vout = asr_s32(vacc, 8);
+    int32_t vout = math_asr_s32(vacc, 8);
     vout = math_max_s32(vout, 0);
     vout = math_min_s32(vout, 255);
     *y = (uint8_t) vout;
diff --git a/src/qu8-vcvt/gen/vcvt-scalar-x4.c b/src/qu8-vcvt/gen/vcvt-scalar-x4.c
index f1568973b..d24df8932 100644
--- a/src/qu8-vcvt/gen/vcvt-scalar-x4.c
+++ b/src/qu8-vcvt/gen/vcvt-scalar-x4.c
@@ -33,10 +33,10 @@ void xnn_qu8_vcvt_ukernel__scalar_x4(
     vacc2 = vbias + vacc2 * vmultiplier;
     vacc3 = vbias + vacc3 * vmultiplier;
 
-    int32_t vout0 = asr_s32(vacc0, 8);
-    int32_t vout1 = asr_s32(vacc1, 8);
-    int32_t vout2 = asr_s32(vacc2, 8);
-    int32_t vout3 = asr_s32(vacc3, 8);
+    int32_t vout0 = math_asr_s32(vacc0, 8);
+    int32_t vout1 = math_asr_s32(vacc1, 8);
+    int32_t vout2 = math_asr_s32(vacc2, 8);
+    int32_t vout3 = math_asr_s32(vacc3, 8);
 
     vout0 = math_max_s32(vout0, 0);
     vout1 = math_max_s32(vout1, 0);
@@ -59,7 +59,7 @@ void xnn_qu8_vcvt_ukernel__scalar_x4(
       int32_t vacc = *x++;
       vacc = vbias + vacc * vmultiplier;
 
-      int32_t vout = asr_s32(vacc, 8);
+      int32_t vout = math_asr_s32(vacc, 8);
       vout = math_max_s32(vout, 0);
       vout = math_min_s32(vout, 255);
       *y++ = (uint8_t) vout;
diff --git a/src/qu8-vlrelu/gen/vlrelu-armv6simd-x4.c b/src/qu8-vlrelu/gen/vlrelu-armv6simd-x4.c
index 7cc3c3067..6f0487e7f 100644
--- a/src/qu8-vlrelu/gen/vlrelu-armv6simd-x4.c
+++ b/src/qu8-vlrelu/gen/vlrelu-armv6simd-x4.c
@@ -44,10 +44,10 @@ void xnn_qu8_vlrelu_ukernel__armv6simd_x4(
     int32_t vacc2 = __smlatt(vmultiplier02, vx02, vbias);
     int32_t vacc3 = __smlatt(vmultiplier13, vx13, vbias);
 
-    vacc0 = __usat(asr_s32(vacc0, 8), 8);
-    vacc1 = __usat(asr_s32(vacc1, 8), 8);
-    vacc2 = __usat(asr_s32(vacc2, 8), 8);
-    vacc3 = __usat(asr_s32(vacc3, 8), 8);
+    vacc0 = __usat(math_asr_s32(vacc0, 8), 8);
+    vacc1 = __usat(math_asr_s32(vacc1, 8), 8);
+    vacc2 = __usat(math_asr_s32(vacc2, 8), 8);
+    vacc3 = __usat(math_asr_s32(vacc3, 8), 8);
 
     y[0] = (uint8_t) vacc0;
     y[1] = (uint8_t) vacc1;
@@ -70,13 +70,13 @@ void xnn_qu8_vlrelu_ukernel__armv6simd_x4(
     int32_t vacc1 = __smlabb(vmultiplier13, vx13, vbias);
     const int32_t vacc2 = __smlatt(vmultiplier02, vx02, vbias);
 
-    vacc0 = __usat(asr_s32(vacc0, 8), 8);
-    vacc1 = __usat(asr_s32(vacc1, 8), 8);
+    vacc0 = __usat(math_asr_s32(vacc0, 8), 8);
+    vacc1 = __usat(math_asr_s32(vacc1, 8), 8);
 
     if (n & (2 * sizeof(uint8_t))) {
       y[0] = (uint8_t) vacc0;
       y[1] = (uint8_t) vacc1;
-      vacc0 = __usat(asr_s32(vacc2, 8), 8);
+      vacc0 = __usat(math_asr_s32(vacc2, 8), 8);
       y += 2;
     }
     if (n & (1 * sizeof(uint8_t))) {
diff --git a/src/qu8-vlrelu/gen/vlrelu-armv6simd-x8.c b/src/qu8-vlrelu/gen/vlrelu-armv6simd-x8.c
index 565bb1e5f..8f3142080 100644
--- a/src/qu8-vlrelu/gen/vlrelu-armv6simd-x8.c
+++ b/src/qu8-vlrelu/gen/vlrelu-armv6simd-x8.c
@@ -55,14 +55,14 @@ void xnn_qu8_vlrelu_ukernel__armv6simd_x8(
     int32_t vacc6 = __smlatt(vmultiplier46, vx46, vbias);
     int32_t vacc7 = __smlatt(vmultiplier57, vx57, vbias);
 
-    vacc0 = __usat(asr_s32(vacc0, 8), 8);
-    vacc1 = __usat(asr_s32(vacc1, 8), 8);
-    vacc2 = __usat(asr_s32(vacc2, 8), 8);
-    vacc3 = __usat(asr_s32(vacc3, 8), 8);
-    vacc4 = __usat(asr_s32(vacc4, 8), 8);
-    vacc5 = __usat(asr_s32(vacc5, 8), 8);
-    vacc6 = __usat(asr_s32(vacc6, 8), 8);
-    vacc7 = __usat(asr_s32(vacc7, 8), 8);
+    vacc0 = __usat(math_asr_s32(vacc0, 8), 8);
+    vacc1 = __usat(math_asr_s32(vacc1, 8), 8);
+    vacc2 = __usat(math_asr_s32(vacc2, 8), 8);
+    vacc3 = __usat(math_asr_s32(vacc3, 8), 8);
+    vacc4 = __usat(math_asr_s32(vacc4, 8), 8);
+    vacc5 = __usat(math_asr_s32(vacc5, 8), 8);
+    vacc6 = __usat(math_asr_s32(vacc6, 8), 8);
+    vacc7 = __usat(math_asr_s32(vacc7, 8), 8);
 
     y[0] = (uint8_t) vacc0;
     y[1] = (uint8_t) vacc1;
@@ -91,10 +91,10 @@ void xnn_qu8_vlrelu_ukernel__armv6simd_x8(
     int32_t vacc2 = __smlatt(vmultiplier02, vx02, vbias);
     int32_t vacc3 = __smlatt(vmultiplier13, vx13, vbias);
 
-    vacc0 = __usat(asr_s32(vacc0, 8), 8);
-    vacc1 = __usat(asr_s32(vacc1, 8), 8);
-    vacc2 = __usat(asr_s32(vacc2, 8), 8);
-    vacc3 = __usat(asr_s32(vacc3, 8), 8);
+    vacc0 = __usat(math_asr_s32(vacc0, 8), 8);
+    vacc1 = __usat(math_asr_s32(vacc1, 8), 8);
+    vacc2 = __usat(math_asr_s32(vacc2, 8), 8);
+    vacc3 = __usat(math_asr_s32(vacc3, 8), 8);
 
     y[0] = (uint8_t) vacc0;
     y[1] = (uint8_t) vacc1;
@@ -117,13 +117,13 @@ void xnn_qu8_vlrelu_ukernel__armv6simd_x8(
     int32_t vacc1 = __smlabb(vmultiplier13, vx13, vbias);
     const int32_t vacc2 = __smlatt(vmultiplier02, vx02, vbias);
 
-    vacc0 = __usat(asr_s32(vacc0, 8), 8);
-    vacc1 = __usat(asr_s32(vacc1, 8), 8);
+    vacc0 = __usat(math_asr_s32(vacc0, 8), 8);
+    vacc1 = __usat(math_asr_s32(vacc1, 8), 8);
 
     if (n & (2 * sizeof(uint8_t))) {
       y[0] = (uint8_t) vacc0;
       y[1] = (uint8_t) vacc1;
-      vacc0 = __usat(asr_s32(vacc2, 8), 8);
+      vacc0 = __usat(math_asr_s32(vacc2, 8), 8);
       y += 2;
     }
     if (n & (1 * sizeof(uint8_t))) {
diff --git a/src/qu8-vlrelu/gen/vlrelu-scalar-andxor-x1.c b/src/qu8-vlrelu/gen/vlrelu-scalar-andxor-x1.c
index 33e9176d7..c0ce8a70b 100644
--- a/src/qu8-vlrelu/gen/vlrelu-scalar-andxor-x1.c
+++ b/src/qu8-vlrelu/gen/vlrelu-scalar-andxor-x1.c
@@ -25,10 +25,10 @@ void xnn_qu8_vlrelu_ukernel__scalar_andxor_x1(
   const int32_t vbias = params->scalar_andxor.bias;
   do {
     int32_t vacc = (int32_t) *x++ - vinput_zero_point;
-    const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & asr_s32(vacc, 31));
+    const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & math_asr_s32(vacc, 31));
     vacc = vbias + vacc * vmultiplier;
 
-    int32_t vout = asr_s32(vacc, 8);
+    int32_t vout = math_asr_s32(vacc, 8);
     vout = math_max_s32(vout, 0);
     vout = math_min_s32(vout, 255);
     *y++ = (uint8_t) vout;
diff --git a/src/qu8-vlrelu/gen/vlrelu-scalar-andxor-x2.c b/src/qu8-vlrelu/gen/vlrelu-scalar-andxor-x2.c
index 8e4f64fab..81eb91fdf 100644
--- a/src/qu8-vlrelu/gen/vlrelu-scalar-andxor-x2.c
+++ b/src/qu8-vlrelu/gen/vlrelu-scalar-andxor-x2.c
@@ -31,8 +31,8 @@ void xnn_qu8_vlrelu_ukernel__scalar_andxor_x2(
     vacc0 -= vinput_zero_point;
     vacc1 -= vinput_zero_point;
 
-    int32_t vmultiplier0 = asr_s32(vacc0, 31);
-    int32_t vmultiplier1 = asr_s32(vacc1, 31);
+    int32_t vmultiplier0 = math_asr_s32(vacc0, 31);
+    int32_t vmultiplier1 = math_asr_s32(vacc1, 31);
 
     vmultiplier0 &= vmultiplier_diff;
     vmultiplier1 &= vmultiplier_diff;
@@ -43,8 +43,8 @@ void xnn_qu8_vlrelu_ukernel__scalar_andxor_x2(
     vacc0 = vbias + vacc0 * vmultiplier0;
     vacc1 = vbias + vacc1 * vmultiplier1;
 
-    int32_t vout0 = asr_s32(vacc0, 8);
-    int32_t vout1 = asr_s32(vacc1, 8);
+    int32_t vout0 = math_asr_s32(vacc0, 8);
+    int32_t vout1 = math_asr_s32(vacc1, 8);
 
     vout0 = math_max_s32(vout0, 0);
     vout1 = math_max_s32(vout1, 0);
@@ -58,10 +58,10 @@ void xnn_qu8_vlrelu_ukernel__scalar_andxor_x2(
   }
   if XNN_UNLIKELY(n != 0) {
     int32_t vacc = (int32_t) *x++ - vinput_zero_point;
-    const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & asr_s32(vacc, 31));
+    const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & math_asr_s32(vacc, 31));
     vacc = vbias + vacc * vmultiplier;
 
-    int32_t vout = asr_s32(vacc, 8);
+    int32_t vout = math_asr_s32(vacc, 8);
     vout = math_max_s32(vout, 0);
     vout = math_min_s32(vout, 255);
     *y = (uint8_t) vout;
diff --git a/src/qu8-vlrelu/gen/vlrelu-scalar-andxor-x4.c b/src/qu8-vlrelu/gen/vlrelu-scalar-andxor-x4.c
index 7b52bb64e..da80ee8bb 100644
--- a/src/qu8-vlrelu/gen/vlrelu-scalar-andxor-x4.c
+++ b/src/qu8-vlrelu/gen/vlrelu-scalar-andxor-x4.c
@@ -35,10 +35,10 @@ void xnn_qu8_vlrelu_ukernel__scalar_andxor_x4(
     vacc2 -= vinput_zero_point;
     vacc3 -= vinput_zero_point;
 
-    int32_t vmultiplier0 = asr_s32(vacc0, 31);
-    int32_t vmultiplier1 = asr_s32(vacc1, 31);
-    int32_t vmultiplier2 = asr_s32(vacc2, 31);
-    int32_t vmultiplier3 = asr_s32(vacc3, 31);
+    int32_t vmultiplier0 = math_asr_s32(vacc0, 31);
+    int32_t vmultiplier1 = math_asr_s32(vacc1, 31);
+    int32_t vmultiplier2 = math_asr_s32(vacc2, 31);
+    int32_t vmultiplier3 = math_asr_s32(vacc3, 31);
 
     vmultiplier0 &= vmultiplier_diff;
     vmultiplier1 &= vmultiplier_diff;
@@ -55,10 +55,10 @@ void xnn_qu8_vlrelu_ukernel__scalar_andxor_x4(
     vacc2 = vbias + vacc2 * vmultiplier2;
     vacc3 = vbias + vacc3 * vmultiplier3;
 
-    int32_t vout0 = asr_s32(vacc0, 8);
-    int32_t vout1 = asr_s32(vacc1, 8);
-    int32_t vout2 = asr_s32(vacc2, 8);
-    int32_t vout3 = asr_s32(vacc3, 8);
+    int32_t vout0 = math_asr_s32(vacc0, 8);
+    int32_t vout1 = math_asr_s32(vacc1, 8);
+    int32_t vout2 = math_asr_s32(vacc2, 8);
+    int32_t vout3 = math_asr_s32(vacc3, 8);
 
     vout0 = math_max_s32(vout0, 0);
     vout1 = math_max_s32(vout1, 0);
@@ -79,10 +79,10 @@ void xnn_qu8_vlrelu_ukernel__scalar_andxor_x4(
   if XNN_UNLIKELY(n != 0) {
     do {
       int32_t vacc = (int32_t) *x++ - vinput_zero_point;
-      const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & asr_s32(vacc, 31));
+      const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & math_asr_s32(vacc, 31));
       vacc = vbias + vacc * vmultiplier;
 
-      int32_t vout = asr_s32(vacc, 8);
+      int32_t vout = math_asr_s32(vacc, 8);
       vout = math_max_s32(vout, 0);
       vout = math_min_s32(vout, 255);
       *y++ = (uint8_t) vout;
diff --git a/src/qu8-vlrelu/gen/vlrelu-scalar-select-x1.c b/src/qu8-vlrelu/gen/vlrelu-scalar-select-x1.c
index 8db62d4ba..479336be4 100644
--- a/src/qu8-vlrelu/gen/vlrelu-scalar-select-x1.c
+++ b/src/qu8-vlrelu/gen/vlrelu-scalar-select-x1.c
@@ -28,7 +28,7 @@ void xnn_qu8_vlrelu_ukernel__scalar_select_x1(
     const int32_t vmultiplier = XNN_UNPREDICTABLE(vacc >= 0) ? vpositive_multiplier : vnegative_multiplier;
     vacc = vbias + vacc * vmultiplier;
 
-    int32_t vout = asr_s32(vacc, 8);
+    int32_t vout = math_asr_s32(vacc, 8);
     vout = math_max_s32(vout, 0);
     vout = math_min_s32(vout, 255);
     *y++ = (uint8_t) vout;
diff --git a/src/qu8-vlrelu/gen/vlrelu-scalar-select-x2.c b/src/qu8-vlrelu/gen/vlrelu-scalar-select-x2.c
index eb6164b26..2ba144e7f 100644
--- a/src/qu8-vlrelu/gen/vlrelu-scalar-select-x2.c
+++ b/src/qu8-vlrelu/gen/vlrelu-scalar-select-x2.c
@@ -37,8 +37,8 @@ void xnn_qu8_vlrelu_ukernel__scalar_select_x2(
     vacc0 = vbias + vacc0 * vmultiplier0;
     vacc1 = vbias + vacc1 * vmultiplier1;
 
-    int32_t vout0 = asr_s32(vacc0, 8);
-    int32_t vout1 = asr_s32(vacc1, 8);
+    int32_t vout0 = math_asr_s32(vacc0, 8);
+    int32_t vout1 = math_asr_s32(vacc1, 8);
 
     vout0 = math_max_s32(vout0, 0);
     vout1 = math_max_s32(vout1, 0);
@@ -55,7 +55,7 @@ void xnn_qu8_vlrelu_ukernel__scalar_select_x2(
     const int32_t vmultiplier = XNN_UNPREDICTABLE(vacc >= 0) ? vpositive_multiplier : vnegative_multiplier;
     vacc = vbias + vacc * vmultiplier;
 
-    int32_t vout = asr_s32(vacc, 8);
+    int32_t vout = math_asr_s32(vacc, 8);
     vout = math_max_s32(vout, 0);
     vout = math_min_s32(vout, 255);
     *y = (uint8_t) vout;
diff --git a/src/qu8-vlrelu/gen/vlrelu-scalar-select-x4.c b/src/qu8-vlrelu/gen/vlrelu-scalar-select-x4.c
index 6d5f36165..4300bd2c0 100644
--- a/src/qu8-vlrelu/gen/vlrelu-scalar-select-x4.c
+++ b/src/qu8-vlrelu/gen/vlrelu-scalar-select-x4.c
@@ -45,10 +45,10 @@ void xnn_qu8_vlrelu_ukernel__scalar_select_x4(
     vacc2 = vbias + vacc2 * vmultiplier2;
     vacc3 = vbias + vacc3 * vmultiplier3;
 
-    int32_t vout0 = asr_s32(vacc0, 8);
-    int32_t vout1 = asr_s32(vacc1, 8);
-    int32_t vout2 = asr_s32(vacc2, 8);
-    int32_t vout3 = asr_s32(vacc3, 8);
+    int32_t vout0 = math_asr_s32(vacc0, 8);
+    int32_t vout1 = math_asr_s32(vacc1, 8);
+    int32_t vout2 = math_asr_s32(vacc2, 8);
+    int32_t vout3 = math_asr_s32(vacc3, 8);
 
     vout0 = math_max_s32(vout0, 0);
     vout1 = math_max_s32(vout1, 0);
@@ -72,7 +72,7 @@ void xnn_qu8_vlrelu_ukernel__scalar_select_x4(
       const int32_t vmultiplier = XNN_UNPREDICTABLE(vacc >= 0) ? vpositive_multiplier : vnegative_multiplier;
       vacc = vbias + vacc * vmultiplier;
 
-      int32_t vout = asr_s32(vacc, 8);
+      int32_t vout = math_asr_s32(vacc, 8);
       vout = math_max_s32(vout, 0);
       vout = math_min_s32(vout, 255);
       *y++ = (uint8_t) vout;
diff --git a/src/s16-window/gen/scalar-x1.c b/src/s16-window/gen/scalar-x1.c
index fcf4323b7..623f08390 100644
--- a/src/s16-window/gen/scalar-x1.c
+++ b/src/s16-window/gen/scalar-x1.c
@@ -39,7 +39,7 @@ void xnn_s16_window_ukernel__scalar_x1(
         int32_t vout = ((int32_t) input[0] * (int32_t) w[0]);
         ++input;
         ++w;
-        vout = asr_s32(vout, shift);
+        vout = math_asr_s32(vout, shift);
         vout = math_max_s32(vout, INT16_MIN);
         vout = math_min_s32(vout, INT16_MAX);
         output[0] = (int16_t)(vout);
diff --git a/src/s16-window/gen/scalar-x2.c b/src/s16-window/gen/scalar-x2.c
index d0dfd5928..39a5b48ad 100644
--- a/src/s16-window/gen/scalar-x2.c
+++ b/src/s16-window/gen/scalar-x2.c
@@ -45,8 +45,8 @@ void xnn_s16_window_ukernel__scalar_x2(
       int32_t vout0 = (int32_t) vi0 * (int32_t) w0;
       int32_t vout1 = (int32_t) vi1 * (int32_t) w1;
 
-      vout0 = asr_s32(vout0, shift);
-      vout1 = asr_s32(vout1, shift);
+      vout0 = math_asr_s32(vout0, shift);
+      vout1 = math_asr_s32(vout1, shift);
 
       vout0 = math_max_s32(vout0, INT16_MIN);
       vout1 = math_max_s32(vout1, INT16_MIN);
@@ -65,7 +65,7 @@ void xnn_s16_window_ukernel__scalar_x2(
         int32_t vout = ((int32_t) input[0] * (int32_t) w[0]);
         ++input;
         ++w;
-        vout = asr_s32(vout, shift);
+        vout = math_asr_s32(vout, shift);
         vout = math_max_s32(vout, INT16_MIN);
         vout = math_min_s32(vout, INT16_MAX);
         output[0] = (int16_t)(vout);
diff --git a/src/s16-window/gen/scalar-x3.c b/src/s16-window/gen/scalar-x3.c
index ab5901e8c..6bd194224 100644
--- a/src/s16-window/gen/scalar-x3.c
+++ b/src/s16-window/gen/scalar-x3.c
@@ -48,9 +48,9 @@ void xnn_s16_window_ukernel__scalar_x3(
       int32_t vout1 = (int32_t) vi1 * (int32_t) w1;
       int32_t vout2 = (int32_t) vi2 * (int32_t) w2;
 
-      vout0 = asr_s32(vout0, shift);
-      vout1 = asr_s32(vout1, shift);
-      vout2 = asr_s32(vout2, shift);
+      vout0 = math_asr_s32(vout0, shift);
+      vout1 = math_asr_s32(vout1, shift);
+      vout2 = math_asr_s32(vout2, shift);
 
       vout0 = math_max_s32(vout0, INT16_MIN);
       vout1 = math_max_s32(vout1, INT16_MIN);
@@ -72,7 +72,7 @@ void xnn_s16_window_ukernel__scalar_x3(
         int32_t vout = ((int32_t) input[0] * (int32_t) w[0]);
         ++input;
         ++w;
-        vout = asr_s32(vout, shift);
+        vout = math_asr_s32(vout, shift);
         vout = math_max_s32(vout, INT16_MIN);
         vout = math_min_s32(vout, INT16_MAX);
         output[0] = (int16_t)(vout);
diff --git a/src/s16-window/gen/scalar-x4.c b/src/s16-window/gen/scalar-x4.c
index 100a30af6..081712590 100644
--- a/src/s16-window/gen/scalar-x4.c
+++ b/src/s16-window/gen/scalar-x4.c
@@ -51,10 +51,10 @@ void xnn_s16_window_ukernel__scalar_x4(
       int32_t vout2 = (int32_t) vi2 * (int32_t) w2;
       int32_t vout3 = (int32_t) vi3 * (int32_t) w3;
 
-      vout0 = asr_s32(vout0, shift);
-      vout1 = asr_s32(vout1, shift);
-      vout2 = asr_s32(vout2, shift);
-      vout3 = asr_s32(vout3, shift);
+      vout0 = math_asr_s32(vout0, shift);
+      vout1 = math_asr_s32(vout1, shift);
+      vout2 = math_asr_s32(vout2, shift);
+      vout3 = math_asr_s32(vout3, shift);
 
       vout0 = math_max_s32(vout0, INT16_MIN);
       vout1 = math_max_s32(vout1, INT16_MIN);
@@ -79,7 +79,7 @@ void xnn_s16_window_ukernel__scalar_x4(
         int32_t vout = ((int32_t) input[0] * (int32_t) w[0]);
         ++input;
         ++w;
-        vout = asr_s32(vout, shift);
+        vout = math_asr_s32(vout, shift);
         vout = math_max_s32(vout, INT16_MIN);
         vout = math_min_s32(vout, INT16_MAX);
         output[0] = (int16_t)(vout);
diff --git a/src/s16-window/scalar.c.in b/src/s16-window/scalar.c.in
index d88835f5d..229cf9a8c 100644
--- a/src/s16-window/scalar.c.in
+++ b/src/s16-window/scalar.c.in
@@ -44,7 +44,7 @@ void xnn_s16_window_ukernel__scalar_x${CHANNEL_TILE}(
           int32_t vout${C} = (int32_t) vi${C} * (int32_t) w${C};
 
         $for C in range(CHANNEL_TILE):
-          vout${C} = asr_s32(vout${C}, shift);
+          vout${C} = math_asr_s32(vout${C}, shift);
 
         $for C in range(CHANNEL_TILE):
           vout${C} = math_max_s32(vout${C}, INT16_MIN);
@@ -63,7 +63,7 @@ void xnn_s16_window_ukernel__scalar_x${CHANNEL_TILE}(
         int32_t vout = ((int32_t) input[0] * (int32_t) w[0]);
         ++input;
         ++w;
-        vout = asr_s32(vout, shift);
+        vout = math_asr_s32(vout, shift);
         vout = math_max_s32(vout, INT16_MIN);
         vout = math_min_s32(vout, INT16_MAX);
         output[0] = (int16_t)(vout);
diff --git a/src/s8-ibilinear/gen/scalar-c1.c b/src/s8-ibilinear/gen/scalar-c1.c
index 4d3c19421..f2246e766 100644
--- a/src/s8-ibilinear/gen/scalar-c1.c
+++ b/src/s8-ibilinear/gen/scalar-c1.c
@@ -55,7 +55,7 @@ void xnn_s8_ibilinear_ukernel__scalar_c1(
 
       const int32_t vacc = (int32_t) ((uint32_t) vt << 11) + vd * valphav;
 
-      const int32_t vo = asr_s32(vacc + vrounding, 22);
+      const int32_t vo = math_asr_s32(vacc + vrounding, 22);
 
       *output++ = vo;
 
diff --git a/src/s8-ibilinear/gen/scalar-c2.c b/src/s8-ibilinear/gen/scalar-c2.c
index d435fa0ac..b1918ddb0 100644
--- a/src/s8-ibilinear/gen/scalar-c2.c
+++ b/src/s8-ibilinear/gen/scalar-c2.c
@@ -69,8 +69,8 @@ void xnn_s8_ibilinear_ukernel__scalar_c2(
       const int32_t vacc0 = (int32_t) ((uint32_t) vt0 << 11) + vd0 * valphav;
       const int32_t vacc1 = (int32_t) ((uint32_t) vt1 << 11) + vd1 * valphav;
 
-      const int32_t vo0 = asr_s32(vacc0 + vrounding, 22);
-      const int32_t vo1 = asr_s32(vacc1 + vrounding, 22);
+      const int32_t vo0 = math_asr_s32(vacc0 + vrounding, 22);
+      const int32_t vo1 = math_asr_s32(vacc1 + vrounding, 22);
 
       output[0] = (int8_t) vo0;
       output[1] = (int8_t) vo1;
@@ -92,7 +92,7 @@ void xnn_s8_ibilinear_ukernel__scalar_c2(
 
       const int32_t vacc = (int32_t) ((uint32_t) vt << 11) + vd * valphav;
 
-      const int32_t vo = asr_s32(vacc + vrounding, 22);
+      const int32_t vo = math_asr_s32(vacc + vrounding, 22);
 
       *output++ = vo;
     }
diff --git a/src/s8-ibilinear/gen/scalar-c4.c b/src/s8-ibilinear/gen/scalar-c4.c
index 8ff553f03..b92c079df 100644
--- a/src/s8-ibilinear/gen/scalar-c4.c
+++ b/src/s8-ibilinear/gen/scalar-c4.c
@@ -89,10 +89,10 @@ void xnn_s8_ibilinear_ukernel__scalar_c4(
       const int32_t vacc2 = (int32_t) ((uint32_t) vt2 << 11) + vd2 * valphav;
       const int32_t vacc3 = (int32_t) ((uint32_t) vt3 << 11) + vd3 * valphav;
 
-      const int32_t vo0 = asr_s32(vacc0 + vrounding, 22);
-      const int32_t vo1 = asr_s32(vacc1 + vrounding, 22);
-      const int32_t vo2 = asr_s32(vacc2 + vrounding, 22);
-      const int32_t vo3 = asr_s32(vacc3 + vrounding, 22);
+      const int32_t vo0 = math_asr_s32(vacc0 + vrounding, 22);
+      const int32_t vo1 = math_asr_s32(vacc1 + vrounding, 22);
+      const int32_t vo2 = math_asr_s32(vacc2 + vrounding, 22);
+      const int32_t vo3 = math_asr_s32(vacc3 + vrounding, 22);
 
       output[0] = (int8_t) vo0;
       output[1] = (int8_t) vo1;
@@ -116,7 +116,7 @@ void xnn_s8_ibilinear_ukernel__scalar_c4(
 
       const int32_t vacc = (int32_t) ((uint32_t) vt << 11) + vd * valphav;
 
-      const int32_t vo = asr_s32(vacc + vrounding, 22);
+      const int32_t vo = math_asr_s32(vacc + vrounding, 22);
 
       *output++ = vo;
     }
diff --git a/src/s8-ibilinear/scalar.c.in b/src/s8-ibilinear/scalar.c.in
index 266d7fefb..b29d08a7d 100644
--- a/src/s8-ibilinear/scalar.c.in
+++ b/src/s8-ibilinear/scalar.c.in
@@ -66,7 +66,7 @@ void xnn_${DATATYPE.lower()}_ibilinear_ukernel__scalar_c${CHANNEL_TILE}${"" if P
           const int32_t vacc${ABC[C]} = (int32_t) ((uint32_t) vt${ABC[C]} << 11) + vd${ABC[C]} * valphav;
 
         $for C in range(CHANNEL_TILE):
-          const int32_t vo${ABC[C]} = asr_s32(vacc${ABC[C]} + vrounding, 22);
+          const int32_t vo${ABC[C]} = math_asr_s32(vacc${ABC[C]} + vrounding, 22);
 
         $for C in range(CHANNEL_TILE):
           output[${C}] = (${XINT8_T}) vo${ABC[C]};
@@ -88,7 +88,7 @@ void xnn_${DATATYPE.lower()}_ibilinear_ukernel__scalar_c${CHANNEL_TILE}${"" if P
 
         const int32_t vacc = (int32_t) ((uint32_t) vt << 11) + vd * valphav;
 
-        const int32_t vo = asr_s32(vacc + vrounding, 22);
+        const int32_t vo = math_asr_s32(vacc + vrounding, 22);
 
         *output++ = vo;
       }
@@ -109,7 +109,7 @@ void xnn_${DATATYPE.lower()}_ibilinear_ukernel__scalar_c${CHANNEL_TILE}${"" if P
 
         const int32_t vacc = (int32_t) ((uint32_t) vt << 11) + vd * valphav;
 
-        const int32_t vo = asr_s32(vacc + vrounding, 22);
+        const int32_t vo = math_asr_s32(vacc + vrounding, 22);
 
         *output++ = vo;
 
diff --git a/src/u8-ibilinear/gen/scalar-c1.c b/src/u8-ibilinear/gen/scalar-c1.c
index a17287fc6..de03b9a01 100644
--- a/src/u8-ibilinear/gen/scalar-c1.c
+++ b/src/u8-ibilinear/gen/scalar-c1.c
@@ -55,7 +55,7 @@ void xnn_u8_ibilinear_ukernel__scalar_c1(
 
       const int32_t vacc = (int32_t) ((uint32_t) vt << 11) + vd * valphav;
 
-      const int32_t vo = asr_s32(vacc + vrounding, 22);
+      const int32_t vo = math_asr_s32(vacc + vrounding, 22);
 
       *output++ = vo;
 
diff --git a/src/u8-ibilinear/gen/scalar-c2.c b/src/u8-ibilinear/gen/scalar-c2.c
index 33b18d821..5f398122c 100644
--- a/src/u8-ibilinear/gen/scalar-c2.c
+++ b/src/u8-ibilinear/gen/scalar-c2.c
@@ -69,8 +69,8 @@ void xnn_u8_ibilinear_ukernel__scalar_c2(
       const int32_t vacc0 = (int32_t) ((uint32_t) vt0 << 11) + vd0 * valphav;
       const int32_t vacc1 = (int32_t) ((uint32_t) vt1 << 11) + vd1 * valphav;
 
-      const int32_t vo0 = asr_s32(vacc0 + vrounding, 22);
-      const int32_t vo1 = asr_s32(vacc1 + vrounding, 22);
+      const int32_t vo0 = math_asr_s32(vacc0 + vrounding, 22);
+      const int32_t vo1 = math_asr_s32(vacc1 + vrounding, 22);
 
       output[0] = (uint8_t) vo0;
       output[1] = (uint8_t) vo1;
@@ -92,7 +92,7 @@ void xnn_u8_ibilinear_ukernel__scalar_c2(
 
       const int32_t vacc = (int32_t) ((uint32_t) vt << 11) + vd * valphav;
 
-      const int32_t vo = asr_s32(vacc + vrounding, 22);
+      const int32_t vo = math_asr_s32(vacc + vrounding, 22);
 
       *output++ = vo;
     }
diff --git a/src/u8-ibilinear/gen/scalar-c4.c b/src/u8-ibilinear/gen/scalar-c4.c
index 318995666..438cfe3ce 100644
--- a/src/u8-ibilinear/gen/scalar-c4.c
+++ b/src/u8-ibilinear/gen/scalar-c4.c
@@ -89,10 +89,10 @@ void xnn_u8_ibilinear_ukernel__scalar_c4(
       const int32_t vacc2 = (int32_t) ((uint32_t) vt2 << 11) + vd2 * valphav;
       const int32_t vacc3 = (int32_t) ((uint32_t) vt3 << 11) + vd3 * valphav;
 
-      const int32_t vo0 = asr_s32(vacc0 + vrounding, 22);
-      const int32_t vo1 = asr_s32(vacc1 + vrounding, 22);
-      const int32_t vo2 = asr_s32(vacc2 + vrounding, 22);
-      const int32_t vo3 = asr_s32(vacc3 + vrounding, 22);
+      const int32_t vo0 = math_asr_s32(vacc0 + vrounding, 22);
+      const int32_t vo1 = math_asr_s32(vacc1 + vrounding, 22);
+      const int32_t vo2 = math_asr_s32(vacc2 + vrounding, 22);
+      const int32_t vo3 = math_asr_s32(vacc3 + vrounding, 22);
 
       output[0] = (uint8_t) vo0;
       output[1] = (uint8_t) vo1;
@@ -116,7 +116,7 @@ void xnn_u8_ibilinear_ukernel__scalar_c4(
 
       const int32_t vacc = (int32_t) ((uint32_t) vt << 11) + vd * valphav;
 
-      const int32_t vo = asr_s32(vacc + vrounding, 22);
+      const int32_t vo = math_asr_s32(vacc + vrounding, 22);
 
       *output++ = vo;
     }
diff --git a/src/xnnpack/math.h b/src/xnnpack/math.h
index 982af7638..deefacf6f 100644
--- a/src/xnnpack/math.h
+++ b/src/xnnpack/math.h
@@ -155,7 +155,7 @@ XNN_INLINE static float math_nonsign_mask_f32() {
 #endif
 
 XNN_IGNORE_SHIFT_BASE_UB
-XNN_INLINE static int32_t asr_s32(int32_t x, uint32_t n) {
+XNN_INLINE static int32_t math_asr_s32(int32_t x, uint32_t n) {
   #ifdef XNN_USE_SHIFT_BASE_UB_WORKAROUND
     #if XNN_ARCH_X86_64 || XNN_ARCH_ARM64
       return (int32_t) ((uint64_t) (int64_t) x >> n);
@@ -168,7 +168,7 @@ XNN_INLINE static int32_t asr_s32(int32_t x, uint32_t n) {
 }
 
 XNN_IGNORE_SHIFT_BASE_UB
-XNN_INLINE static int64_t asr_s64(int64_t x, uint32_t n) {
+XNN_INLINE static int64_t math_asr_s64(int64_t x, uint32_t n) {
   #ifdef XNN_USE_SHIFT_BASE_UB_WORKAROUND
     return x >= 0 ? x >> n : ~(~x >> n);
   #else
diff --git a/src/xnnpack/requantization.h b/src/xnnpack/requantization.h
index 18ddb9c44..a556acd33 100644
--- a/src/xnnpack/requantization.h
+++ b/src/xnnpack/requantization.h
@@ -173,7 +173,7 @@ static inline int8_t xnn_qs8_requantize_rndnu(
   const int32_t max_less_zero_point = (int32_t) max - (int32_t) zero_point;
 
   const int64_t abs_prescaled_input = (int64_t) input * (int64_t) multiplier;
-  int32_t output = (int32_t) asr_s64(abs_prescaled_input + rounding, shift);
+  int32_t output = (int32_t) math_asr_s64(abs_prescaled_input + rounding, shift);
   output = math_max_s32(output, min_less_zero_point);
   output = math_min_s32(output, max_less_zero_point);
   return (int8_t) (output + (int32_t) zero_point);
@@ -200,7 +200,7 @@ static inline uint8_t xnn_qu8_requantize_rndnu(
   const int32_t max_less_zero_point = (int32_t) max - (int32_t) zero_point;
 
   const int64_t abs_prescaled_input = (int64_t) input * (int64_t) multiplier;
-  int32_t output = (int32_t) asr_s64(abs_prescaled_input + rounding, shift);
+  int32_t output = (int32_t) math_asr_s64(abs_prescaled_input + rounding, shift);
   output = math_max_s32(output, min_less_zero_point);
   output = math_min_s32(output, max_less_zero_point);
   return (uint8_t) (output + (int32_t) zero_point);
@@ -214,7 +214,7 @@ static inline uint8_t xnn_qu8_quantize_add(
   int32_t acc = params.scalar.bias + (int32_t) (uint32_t) a * params.scalar.a_multiplier + (int32_t) (uint32_t) b * params.scalar.b_multiplier;
 
   // Shift right with rounding away from zero.
-  acc = asr_s32(acc, params.scalar.shift);
+  acc = math_asr_s32(acc, params.scalar.shift);
 
   // Clamp and add output zero point.
   acc = math_max_s32(acc, params.scalar.output_min_less_zero_point);
@@ -230,7 +230,7 @@ static inline int8_t xnn_qs8_quantize_add(
   int32_t acc = params.scalar.bias + (int32_t) a * params.scalar.a_multiplier + (int32_t) b * params.scalar.b_multiplier;
 
   // Shift right with rounding away from zero.
-  acc = asr_s32(acc, params.scalar.shift);
+  acc = math_asr_s32(acc, params.scalar.shift);
 
   // Clamp and add output zero point.
   acc = math_max_s32(acc, params.scalar.output_min_less_zero_point);
diff --git a/test/ibilinear-microkernel-tester.h b/test/ibilinear-microkernel-tester.h
index a11a7d267..34cc4acbd 100644
--- a/test/ibilinear-microkernel-tester.h
+++ b/test/ibilinear-microkernel-tester.h
@@ -229,7 +229,7 @@ class IBilinearMicrokernelTester {
         for (size_t c = 0; c < channels(); c++) {
           const int32_t alpha_h = packed_weights[i * 2 + 0];
           const int32_t alpha_v = packed_weights[i * 2 + 1];
-          const int32_t acc = asr_s32(
+          const int32_t acc = math_asr_s32(
             int32_t(indirection[i * 4 + 0][c + input_offset()]) * (2048 - alpha_h) * (2048 - alpha_v) +
             int32_t(indirection[i * 4 + 1][c + input_offset()]) * alpha_h * (2048 - alpha_v) +
             int32_t(indirection[i * 4 + 2][c + input_offset()]) * (2048 - alpha_h) * alpha_v +
diff --git a/test/vcvt-microkernel-tester.h b/test/vcvt-microkernel-tester.h
index 1609cbb21..0961fbda4 100644
--- a/test/vcvt-microkernel-tester.h
+++ b/test/vcvt-microkernel-tester.h
@@ -266,7 +266,7 @@ class VCvtMicrokernelTester {
       const int32_t multiplier = (int32_t) lrintf(-256.0f * scale());
       for (size_t i = 0; i < batch_size(); i++) {
         const int32_t input_value = (input_zero_point() - input[i]) << 7;
-        int32_t output_value = asr_s32(input_value * multiplier + INT32_C(0x4000), 15) + output_zero_point();
+        int32_t output_value = math_asr_s32(input_value * multiplier + INT32_C(0x4000), 15) + output_zero_point();
         output_value = std::min<int32_t>(output_value, std::numeric_limits<int8_t>::max());
         output_value = std::max<int32_t>(output_value, std::numeric_limits<int8_t>::min());
         output_ref[i] = static_cast<int8_t>(output_value);
@@ -345,7 +345,7 @@ class VCvtMicrokernelTester {
       const int32_t multiplier = (int32_t) lrintf(-256.0f * scale());
       for (size_t i = 0; i < batch_size(); i++) {
         const int32_t input_value = (input_zero_point() - input[i]) << 7;
-        int32_t output_value = asr_s32(input_value * multiplier + INT32_C(0x4000), 15) + output_zero_point();
+        int32_t output_value = math_asr_s32(input_value * multiplier + INT32_C(0x4000), 15) + output_zero_point();
         output_value = std::min<int32_t>(output_value, std::numeric_limits<uint8_t>::max());
         output_value = std::max<int32_t>(output_value, std::numeric_limits<uint8_t>::min());
         output_ref[i] = static_cast<uint8_t>(output_value);
diff --git a/test/vlrelu-microkernel-tester.h b/test/vlrelu-microkernel-tester.h
index d73fdbf9b..ec9ed66ba 100644
--- a/test/vlrelu-microkernel-tester.h
+++ b/test/vlrelu-microkernel-tester.h
@@ -113,7 +113,7 @@ class VLReLUMicrokernelTester {
       for (size_t i = 0; i < batch_size(); i++) {
         const int32_t input_value = (input_zero_point() - input[i]) << 7;
         const int32_t multiplier = input_value <= 0 ? positive_multiplier : negative_multiplier;
-        int32_t output_value = asr_s32(input_value * multiplier + INT32_C(0x4000), 15) + output_zero_point();
+        int32_t output_value = math_asr_s32(input_value * multiplier + INT32_C(0x4000), 15) + output_zero_point();
         output_value = std::min<int32_t>(output_value, std::numeric_limits<int8_t>::max());
         output_value = std::max<int32_t>(output_value, std::numeric_limits<int8_t>::min());
         output_ref[i] = static_cast<int8_t>(output_value);
@@ -158,7 +158,7 @@ class VLReLUMicrokernelTester {
       for (size_t i = 0; i < batch_size(); i++) {
         const int32_t input_value = (input_zero_point() - input[i]) << 7;
         const int32_t multiplier = input_value <= 0 ? positive_multiplier : negative_multiplier;
-        int32_t output_value = asr_s32(input_value * multiplier + INT32_C(0x4000), 15) + output_zero_point();
+        int32_t output_value = math_asr_s32(input_value * multiplier + INT32_C(0x4000), 15) + output_zero_point();
         output_value = std::min<int32_t>(output_value, std::numeric_limits<uint8_t>::max());
         output_value = std::max<int32_t>(output_value, std::numeric_limits<uint8_t>::min());
         output_ref[i] = static_cast<uint8_t>(output_value);
author	Marat Dukhan <maratek@google.com>	2022-07-25 11:12:42 -0700
committer	XNNPACK Team <xnnpack-github-robot@google.com>	2022-07-25 11:14:01 -0700
commit	2247560904f5366d6d370bb080cfc2dbe9f57598 (patch)
tree	447a97d6d3f25016ab6b6cd5c3f7a30a8a188795
parent	c7cb3c177fbcd277c29d0ead9eaf1390255591f0 (diff)
download	XNNPACK-2247560904f5366d6d370bb080cfc2dbe9f57598.tar.gz