diff options
author | Marat Dukhan <maratek@google.com> | 2022-07-25 11:12:42 -0700 |
---|---|---|
committer | XNNPACK Team <xnnpack-github-robot@google.com> | 2022-07-25 11:14:01 -0700 |
commit | 2247560904f5366d6d370bb080cfc2dbe9f57598 (patch) | |
tree | 447a97d6d3f25016ab6b6cd5c3f7a30a8a188795 | |
parent | c7cb3c177fbcd277c29d0ead9eaf1390255591f0 (diff) | |
download | XNNPACK-2247560904f5366d6d370bb080cfc2dbe9f57598.tar.gz |
Rename asr_s32/asr_s64 to math_asr_s32/math_asr_s64
PiperOrigin-RevId: 463131507
69 files changed, 305 insertions, 305 deletions
diff --git a/src/qs8-requantization/gemmlowp-scalar.c b/src/qs8-requantization/gemmlowp-scalar.c index 119e81edb..3bd4f42f1 100644 --- a/src/qs8-requantization/gemmlowp-scalar.c +++ b/src/qs8-requantization/gemmlowp-scalar.c @@ -100,10 +100,10 @@ void xnn_qs8_requantize_gemmlowp__scalar( const int32_t z_remainder = (z_q31product & remainder_mask) - (int32_t) (z_q31product < 0); const int32_t w_remainder = (w_q31product & remainder_mask) - (int32_t) (w_q31product < 0); - const int32_t x_scaled = asr_s32(x_q31product, shift) + (int32_t) (x_remainder > threshold); - const int32_t y_scaled = asr_s32(y_q31product, shift) + (int32_t) (y_remainder > threshold); - const int32_t z_scaled = asr_s32(z_q31product, shift) + (int32_t) (z_remainder > threshold); - const int32_t w_scaled = asr_s32(w_q31product, shift) + (int32_t) (w_remainder > threshold); + const int32_t x_scaled = math_asr_s32(x_q31product, shift) + (int32_t) (x_remainder > threshold); + const int32_t y_scaled = math_asr_s32(y_q31product, shift) + (int32_t) (y_remainder > threshold); + const int32_t z_scaled = math_asr_s32(z_q31product, shift) + (int32_t) (z_remainder > threshold); + const int32_t w_scaled = math_asr_s32(w_q31product, shift) + (int32_t) (w_remainder > threshold); // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point). const int32_t x_clamped = math_min_s32(math_max_s32(x_scaled, smin), smax); diff --git a/src/qs8-requantization/rndna-scalar-signed64.c b/src/qs8-requantization/rndna-scalar-signed64.c index d04dc795a..e3d3d29db 100644 --- a/src/qs8-requantization/rndna-scalar-signed64.c +++ b/src/qs8-requantization/rndna-scalar-signed64.c @@ -65,10 +65,10 @@ void xnn_qs8_requantize_rndna__scalar_signed64( // Note that although rounding is precomputed, it is dependent on shift value, and on processors with 64-bit // "right shift with rounding" instruction each line below can be represented by just one such instruction // (e.g. VRSHL.S64 on ARM NEON, SRSHL in ARM64 Advanced SIMD). - const int32_t x_scaled = (int32_t) asr_s64(x_adjusted_product + rounding, shift); - const int32_t y_scaled = (int32_t) asr_s64(y_adjusted_product + rounding, shift); - const int32_t z_scaled = (int32_t) asr_s64(z_adjusted_product + rounding, shift); - const int32_t w_scaled = (int32_t) asr_s64(w_adjusted_product + rounding, shift); + const int32_t x_scaled = (int32_t) math_asr_s64(x_adjusted_product + rounding, shift); + const int32_t y_scaled = (int32_t) math_asr_s64(y_adjusted_product + rounding, shift); + const int32_t z_scaled = (int32_t) math_asr_s64(z_adjusted_product + rounding, shift); + const int32_t w_scaled = (int32_t) math_asr_s64(w_adjusted_product + rounding, shift); // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point). const int32_t x_clamped = math_min_s32(math_max_s32(x_scaled, smin), smax); diff --git a/src/qs8-requantization/rndnu-scalar.c b/src/qs8-requantization/rndnu-scalar.c index eafc7e768..84df119be 100644 --- a/src/qs8-requantization/rndnu-scalar.c +++ b/src/qs8-requantization/rndnu-scalar.c @@ -59,10 +59,10 @@ void xnn_qs8_requantize_rndnu__scalar( // Note that although rounding is precomputed, it is dependent on shift value, and on processors with 64-bit // "right shift with rounding" instruction each line below can be represented by just one such instruction // (e.g. VRSHL.S64 on ARM NEON, SRSHL in ARM64 Advanced SIMD). - const int32_t x_scaled = (int32_t) asr_s64(x_product + rounding, shift); - const int32_t y_scaled = (int32_t) asr_s64(y_product + rounding, shift); - const int32_t z_scaled = (int32_t) asr_s64(z_product + rounding, shift); - const int32_t w_scaled = (int32_t) asr_s64(w_product + rounding, shift); + const int32_t x_scaled = (int32_t) math_asr_s64(x_product + rounding, shift); + const int32_t y_scaled = (int32_t) math_asr_s64(y_product + rounding, shift); + const int32_t z_scaled = (int32_t) math_asr_s64(z_product + rounding, shift); + const int32_t w_scaled = (int32_t) math_asr_s64(w_product + rounding, shift); // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point). const int32_t x_clamped = math_min_s32(math_max_s32(x_scaled, smin), smax); diff --git a/src/qs8-vadd/gen/minmax-scalar-x1.c b/src/qs8-vadd/gen/minmax-scalar-x1.c index e390b7ae4..7cd7b2c7a 100644 --- a/src/qs8-vadd/gen/minmax-scalar-x1.c +++ b/src/qs8-vadd/gen/minmax-scalar-x1.c @@ -33,7 +33,7 @@ void xnn_qs8_vadd_minmax_ukernel__scalar_x1( const int32_t vb = *input_b++; const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier; - int32_t vout = asr_s32(vacc, vshift); + int32_t vout = math_asr_s32(vacc, vshift); vout = math_max_s32(vout, voutput_min_less_zero_point); vout = math_min_s32(vout, voutput_max_less_zero_point); *output++ = (int8_t) (vout + voutput_zero_point); diff --git a/src/qs8-vadd/gen/minmax-scalar-x2.c b/src/qs8-vadd/gen/minmax-scalar-x2.c index bb55d42b3..fac2cee4c 100644 --- a/src/qs8-vadd/gen/minmax-scalar-x2.c +++ b/src/qs8-vadd/gen/minmax-scalar-x2.c @@ -42,8 +42,8 @@ void xnn_qs8_vadd_minmax_ukernel__scalar_x2( vacc0 += vb0 * vb_multiplier; vacc1 += vb1 * vb_multiplier; - int32_t vout0 = asr_s32(vacc0, vshift); - int32_t vout1 = asr_s32(vacc1, vshift); + int32_t vout0 = math_asr_s32(vacc0, vshift); + int32_t vout1 = math_asr_s32(vacc1, vshift); vout0 = math_max_s32(vout0, voutput_min_less_zero_point); vout1 = math_max_s32(vout1, voutput_min_less_zero_point); @@ -63,7 +63,7 @@ void xnn_qs8_vadd_minmax_ukernel__scalar_x2( const int32_t vb = *input_b; const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier; - int32_t vout = asr_s32(vacc, vshift); + int32_t vout = math_asr_s32(vacc, vshift); vout = math_max_s32(vout, voutput_min_less_zero_point); vout = math_min_s32(vout, voutput_max_less_zero_point); *output++ = (int8_t) (vout + voutput_zero_point); diff --git a/src/qs8-vadd/gen/minmax-scalar-x4.c b/src/qs8-vadd/gen/minmax-scalar-x4.c index cb3e69590..dc613a7bb 100644 --- a/src/qs8-vadd/gen/minmax-scalar-x4.c +++ b/src/qs8-vadd/gen/minmax-scalar-x4.c @@ -50,10 +50,10 @@ void xnn_qs8_vadd_minmax_ukernel__scalar_x4( vacc2 += vb2 * vb_multiplier; vacc3 += vb3 * vb_multiplier; - int32_t vout0 = asr_s32(vacc0, vshift); - int32_t vout1 = asr_s32(vacc1, vshift); - int32_t vout2 = asr_s32(vacc2, vshift); - int32_t vout3 = asr_s32(vacc3, vshift); + int32_t vout0 = math_asr_s32(vacc0, vshift); + int32_t vout1 = math_asr_s32(vacc1, vshift); + int32_t vout2 = math_asr_s32(vacc2, vshift); + int32_t vout3 = math_asr_s32(vacc3, vshift); vout0 = math_max_s32(vout0, voutput_min_less_zero_point); vout1 = math_max_s32(vout1, voutput_min_less_zero_point); @@ -82,7 +82,7 @@ void xnn_qs8_vadd_minmax_ukernel__scalar_x4( const int32_t vb = *input_b++; const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier; - int32_t vout = asr_s32(vacc, vshift); + int32_t vout = math_asr_s32(vacc, vshift); vout = math_max_s32(vout, voutput_min_less_zero_point); vout = math_min_s32(vout, voutput_max_less_zero_point); *output++ = (int8_t) (vout + voutput_zero_point); diff --git a/src/qs8-vadd/scalar.c.in b/src/qs8-vadd/scalar.c.in index 65ff4aa77..3cd4b8dcd 100644 --- a/src/qs8-vadd/scalar.c.in +++ b/src/qs8-vadd/scalar.c.in @@ -33,7 +33,7 @@ void xnn_${DATATYPE.lower()}_vadd_minmax_ukernel__scalar_x${BATCH_TILE}( const int32_t vb = *input_b++; const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier; - int32_t vout = asr_s32(vacc, vshift); + int32_t vout = math_asr_s32(vacc, vshift); vout = math_max_s32(vout, voutput_min_less_zero_point); vout = math_min_s32(vout, voutput_max_less_zero_point); *output++ = (${XINT8_T}) (vout + voutput_zero_point); @@ -55,7 +55,7 @@ void xnn_${DATATYPE.lower()}_vadd_minmax_ukernel__scalar_x${BATCH_TILE}( vacc${N} += vb${N} * vb_multiplier; $for N in range(BATCH_TILE): - int32_t vout${N} = asr_s32(vacc${N}, vshift); + int32_t vout${N} = math_asr_s32(vacc${N}, vshift); $for N in range(BATCH_TILE): vout${N} = math_max_s32(vout${N}, voutput_min_less_zero_point); @@ -76,7 +76,7 @@ void xnn_${DATATYPE.lower()}_vadd_minmax_ukernel__scalar_x${BATCH_TILE}( const int32_t vb = *input_b; const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier; - int32_t vout = asr_s32(vacc, vshift); + int32_t vout = math_asr_s32(vacc, vshift); vout = math_max_s32(vout, voutput_min_less_zero_point); vout = math_min_s32(vout, voutput_max_less_zero_point); *output++ = (${XINT8_T}) (vout + voutput_zero_point); @@ -86,7 +86,7 @@ void xnn_${DATATYPE.lower()}_vadd_minmax_ukernel__scalar_x${BATCH_TILE}( const int32_t vb = *input_b++; const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier; - int32_t vout = asr_s32(vacc, vshift); + int32_t vout = math_asr_s32(vacc, vshift); vout = math_max_s32(vout, voutput_min_less_zero_point); vout = math_min_s32(vout, voutput_max_less_zero_point); *output++ = (${XINT8_T}) (vout + voutput_zero_point); diff --git a/src/qs8-vaddc/gen/minmax-scalar-x1.c b/src/qs8-vaddc/gen/minmax-scalar-x1.c index f7dc69be4..34597dae2 100644 --- a/src/qs8-vaddc/gen/minmax-scalar-x1.c +++ b/src/qs8-vaddc/gen/minmax-scalar-x1.c @@ -31,7 +31,7 @@ void xnn_qs8_vaddc_minmax_ukernel__scalar_x1( const int32_t va = *input_a++; const int32_t vacc = vbias + va * va_multiplier; - int32_t vout = asr_s32(vacc, vshift); + int32_t vout = math_asr_s32(vacc, vshift); vout = math_max_s32(vout, voutput_min_less_zero_point); vout = math_min_s32(vout, voutput_max_less_zero_point); *output++ = (int8_t) (vout + voutput_zero_point); diff --git a/src/qs8-vaddc/gen/minmax-scalar-x2.c b/src/qs8-vaddc/gen/minmax-scalar-x2.c index 75b733a8e..cfc8f8ff6 100644 --- a/src/qs8-vaddc/gen/minmax-scalar-x2.c +++ b/src/qs8-vaddc/gen/minmax-scalar-x2.c @@ -36,8 +36,8 @@ void xnn_qs8_vaddc_minmax_ukernel__scalar_x2( const int32_t vacc1 = vbias + va1 * va_multiplier; input_b += 2; - int32_t vout0 = asr_s32(vacc0, vshift); - int32_t vout1 = asr_s32(vacc1, vshift); + int32_t vout0 = math_asr_s32(vacc0, vshift); + int32_t vout1 = math_asr_s32(vacc1, vshift); vout0 = math_max_s32(vout0, voutput_min_less_zero_point); vout1 = math_max_s32(vout1, voutput_min_less_zero_point); @@ -56,7 +56,7 @@ void xnn_qs8_vaddc_minmax_ukernel__scalar_x2( const int32_t va = *input_a; const int32_t vacc = vbias + va * va_multiplier; - int32_t vout = asr_s32(vacc, vshift); + int32_t vout = math_asr_s32(vacc, vshift); vout = math_max_s32(vout, voutput_min_less_zero_point); vout = math_min_s32(vout, voutput_max_less_zero_point); *output++ = (int8_t) (vout + voutput_zero_point); diff --git a/src/qs8-vaddc/gen/minmax-scalar-x4.c b/src/qs8-vaddc/gen/minmax-scalar-x4.c index ee15c3d10..ed1124561 100644 --- a/src/qs8-vaddc/gen/minmax-scalar-x4.c +++ b/src/qs8-vaddc/gen/minmax-scalar-x4.c @@ -40,10 +40,10 @@ void xnn_qs8_vaddc_minmax_ukernel__scalar_x4( const int32_t vacc3 = vbias + va3 * va_multiplier; input_b += 4; - int32_t vout0 = asr_s32(vacc0, vshift); - int32_t vout1 = asr_s32(vacc1, vshift); - int32_t vout2 = asr_s32(vacc2, vshift); - int32_t vout3 = asr_s32(vacc3, vshift); + int32_t vout0 = math_asr_s32(vacc0, vshift); + int32_t vout1 = math_asr_s32(vacc1, vshift); + int32_t vout2 = math_asr_s32(vacc2, vshift); + int32_t vout3 = math_asr_s32(vacc3, vshift); vout0 = math_max_s32(vout0, voutput_min_less_zero_point); vout1 = math_max_s32(vout1, voutput_min_less_zero_point); @@ -71,7 +71,7 @@ void xnn_qs8_vaddc_minmax_ukernel__scalar_x4( const int32_t va = *input_a++; const int32_t vacc = vbias + va * va_multiplier; - int32_t vout = asr_s32(vacc, vshift); + int32_t vout = math_asr_s32(vacc, vshift); vout = math_max_s32(vout, voutput_min_less_zero_point); vout = math_min_s32(vout, voutput_max_less_zero_point); *output++ = (int8_t) (vout + voutput_zero_point); diff --git a/src/qs8-vaddc/scalar.c.in b/src/qs8-vaddc/scalar.c.in index 61109b508..3616ad3c1 100644 --- a/src/qs8-vaddc/scalar.c.in +++ b/src/qs8-vaddc/scalar.c.in @@ -31,7 +31,7 @@ void xnn_${DATATYPE.lower()}_vaddc_minmax_ukernel__scalar_x${BATCH_TILE}( const int32_t va = *input_a++; const int32_t vacc = vbias + va * va_multiplier; - int32_t vout = asr_s32(vacc, vshift); + int32_t vout = math_asr_s32(vacc, vshift); vout = math_max_s32(vout, voutput_min_less_zero_point); vout = math_min_s32(vout, voutput_max_less_zero_point); *output++ = (${XINT8_T}) (vout + voutput_zero_point); @@ -49,7 +49,7 @@ void xnn_${DATATYPE.lower()}_vaddc_minmax_ukernel__scalar_x${BATCH_TILE}( input_b += ${BATCH_TILE}; $for N in range(BATCH_TILE): - int32_t vout${N} = asr_s32(vacc${N}, vshift); + int32_t vout${N} = math_asr_s32(vacc${N}, vshift); $for N in range(BATCH_TILE): vout${N} = math_max_s32(vout${N}, voutput_min_less_zero_point); @@ -69,7 +69,7 @@ void xnn_${DATATYPE.lower()}_vaddc_minmax_ukernel__scalar_x${BATCH_TILE}( const int32_t va = *input_a; const int32_t vacc = vbias + va * va_multiplier; - int32_t vout = asr_s32(vacc, vshift); + int32_t vout = math_asr_s32(vacc, vshift); vout = math_max_s32(vout, voutput_min_less_zero_point); vout = math_min_s32(vout, voutput_max_less_zero_point); *output++ = (${XINT8_T}) (vout + voutput_zero_point); @@ -78,7 +78,7 @@ void xnn_${DATATYPE.lower()}_vaddc_minmax_ukernel__scalar_x${BATCH_TILE}( const int32_t va = *input_a++; const int32_t vacc = vbias + va * va_multiplier; - int32_t vout = asr_s32(vacc, vshift); + int32_t vout = math_asr_s32(vacc, vshift); vout = math_max_s32(vout, voutput_min_less_zero_point); vout = math_min_s32(vout, voutput_max_less_zero_point); *output++ = (${XINT8_T}) (vout + voutput_zero_point); diff --git a/src/qs8-vcvt/armv6simd.c.in b/src/qs8-vcvt/armv6simd.c.in index 4dd21bee7..ebfffae98 100644 --- a/src/qs8-vcvt/armv6simd.c.in +++ b/src/qs8-vcvt/armv6simd.c.in @@ -48,7 +48,7 @@ void xnn_${DATATYPE.lower()}_vcvt_ukernel__armv6simd_x${BATCH_TILE}( int32_t vacc${ABC[N+3]} = __smlawt(vmultiplier, vx${ABC[N+1]}${ABC[N+3]}, vbias); $for N in range(BATCH_TILE): - vacc${ABC[N]} = ${__XSAT}(asr_s32(vacc${ABC[N]}, 1), 8); + vacc${ABC[N]} = ${__XSAT}(math_asr_s32(vacc${ABC[N]}, 1), 8); $for N in range(BATCH_TILE): y[${N}] = (${XINT8_T}) vacc${ABC[N]}; @@ -66,10 +66,10 @@ void xnn_${DATATYPE.lower()}_vcvt_ukernel__armv6simd_x${BATCH_TILE}( int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias); int32_t vacc3 = __smlawt(vmultiplier, vx13, vbias); - vacc0 = ${__XSAT}(asr_s32(vacc0, 1), 8); - vacc1 = ${__XSAT}(asr_s32(vacc1, 1), 8); - vacc2 = ${__XSAT}(asr_s32(vacc2, 1), 8); - vacc3 = ${__XSAT}(asr_s32(vacc3, 1), 8); + vacc0 = ${__XSAT}(math_asr_s32(vacc0, 1), 8); + vacc1 = ${__XSAT}(math_asr_s32(vacc1, 1), 8); + vacc2 = ${__XSAT}(math_asr_s32(vacc2, 1), 8); + vacc3 = ${__XSAT}(math_asr_s32(vacc3, 1), 8); y[0] = (${XINT8_T}) vacc0; y[1] = (${XINT8_T}) vacc1; @@ -87,13 +87,13 @@ void xnn_${DATATYPE.lower()}_vcvt_ukernel__armv6simd_x${BATCH_TILE}( int32_t vacc1 = __smlawb(vmultiplier, vx13, vbias); const int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias); - vacc0 = ${__XSAT}(asr_s32(vacc0, 1), 8); - vacc1 = ${__XSAT}(asr_s32(vacc1, 1), 8); + vacc0 = ${__XSAT}(math_asr_s32(vacc0, 1), 8); + vacc1 = ${__XSAT}(math_asr_s32(vacc1, 1), 8); if (n & (2 * sizeof(${XINT8_T}))) { y[0] = (${XINT8_T}) vacc0; y[1] = (${XINT8_T}) vacc1; - vacc0 = ${__XSAT}(asr_s32(vacc2, 1), 8); + vacc0 = ${__XSAT}(math_asr_s32(vacc2, 1), 8); y += 2; } if (n & (1 * sizeof(${XINT8_T}))) { diff --git a/src/qs8-vcvt/gen/vcvt-armv6simd-x4.c b/src/qs8-vcvt/gen/vcvt-armv6simd-x4.c index a7eaee4e4..ea76f2893 100644 --- a/src/qs8-vcvt/gen/vcvt-armv6simd-x4.c +++ b/src/qs8-vcvt/gen/vcvt-armv6simd-x4.c @@ -38,10 +38,10 @@ void xnn_qs8_vcvt_ukernel__armv6simd_x4( int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias); int32_t vacc3 = __smlawt(vmultiplier, vx13, vbias); - vacc0 = __ssat(asr_s32(vacc0, 1), 8); - vacc1 = __ssat(asr_s32(vacc1, 1), 8); - vacc2 = __ssat(asr_s32(vacc2, 1), 8); - vacc3 = __ssat(asr_s32(vacc3, 1), 8); + vacc0 = __ssat(math_asr_s32(vacc0, 1), 8); + vacc1 = __ssat(math_asr_s32(vacc1, 1), 8); + vacc2 = __ssat(math_asr_s32(vacc2, 1), 8); + vacc3 = __ssat(math_asr_s32(vacc3, 1), 8); y[0] = (int8_t) vacc0; y[1] = (int8_t) vacc1; @@ -59,13 +59,13 @@ void xnn_qs8_vcvt_ukernel__armv6simd_x4( int32_t vacc1 = __smlawb(vmultiplier, vx13, vbias); const int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias); - vacc0 = __ssat(asr_s32(vacc0, 1), 8); - vacc1 = __ssat(asr_s32(vacc1, 1), 8); + vacc0 = __ssat(math_asr_s32(vacc0, 1), 8); + vacc1 = __ssat(math_asr_s32(vacc1, 1), 8); if (n & (2 * sizeof(int8_t))) { y[0] = (int8_t) vacc0; y[1] = (int8_t) vacc1; - vacc0 = __ssat(asr_s32(vacc2, 1), 8); + vacc0 = __ssat(math_asr_s32(vacc2, 1), 8); y += 2; } if (n & (1 * sizeof(int8_t))) { diff --git a/src/qs8-vcvt/gen/vcvt-armv6simd-x8.c b/src/qs8-vcvt/gen/vcvt-armv6simd-x8.c index 9137dc452..9b3073492 100644 --- a/src/qs8-vcvt/gen/vcvt-armv6simd-x8.c +++ b/src/qs8-vcvt/gen/vcvt-armv6simd-x8.c @@ -45,14 +45,14 @@ void xnn_qs8_vcvt_ukernel__armv6simd_x8( int32_t vacc6 = __smlawt(vmultiplier, vx46, vbias); int32_t vacc7 = __smlawt(vmultiplier, vx57, vbias); - vacc0 = __ssat(asr_s32(vacc0, 1), 8); - vacc1 = __ssat(asr_s32(vacc1, 1), 8); - vacc2 = __ssat(asr_s32(vacc2, 1), 8); - vacc3 = __ssat(asr_s32(vacc3, 1), 8); - vacc4 = __ssat(asr_s32(vacc4, 1), 8); - vacc5 = __ssat(asr_s32(vacc5, 1), 8); - vacc6 = __ssat(asr_s32(vacc6, 1), 8); - vacc7 = __ssat(asr_s32(vacc7, 1), 8); + vacc0 = __ssat(math_asr_s32(vacc0, 1), 8); + vacc1 = __ssat(math_asr_s32(vacc1, 1), 8); + vacc2 = __ssat(math_asr_s32(vacc2, 1), 8); + vacc3 = __ssat(math_asr_s32(vacc3, 1), 8); + vacc4 = __ssat(math_asr_s32(vacc4, 1), 8); + vacc5 = __ssat(math_asr_s32(vacc5, 1), 8); + vacc6 = __ssat(math_asr_s32(vacc6, 1), 8); + vacc7 = __ssat(math_asr_s32(vacc7, 1), 8); y[0] = (int8_t) vacc0; y[1] = (int8_t) vacc1; @@ -76,10 +76,10 @@ void xnn_qs8_vcvt_ukernel__armv6simd_x8( int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias); int32_t vacc3 = __smlawt(vmultiplier, vx13, vbias); - vacc0 = __ssat(asr_s32(vacc0, 1), 8); - vacc1 = __ssat(asr_s32(vacc1, 1), 8); - vacc2 = __ssat(asr_s32(vacc2, 1), 8); - vacc3 = __ssat(asr_s32(vacc3, 1), 8); + vacc0 = __ssat(math_asr_s32(vacc0, 1), 8); + vacc1 = __ssat(math_asr_s32(vacc1, 1), 8); + vacc2 = __ssat(math_asr_s32(vacc2, 1), 8); + vacc3 = __ssat(math_asr_s32(vacc3, 1), 8); y[0] = (int8_t) vacc0; y[1] = (int8_t) vacc1; @@ -97,13 +97,13 @@ void xnn_qs8_vcvt_ukernel__armv6simd_x8( int32_t vacc1 = __smlawb(vmultiplier, vx13, vbias); const int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias); - vacc0 = __ssat(asr_s32(vacc0, 1), 8); - vacc1 = __ssat(asr_s32(vacc1, 1), 8); + vacc0 = __ssat(math_asr_s32(vacc0, 1), 8); + vacc1 = __ssat(math_asr_s32(vacc1, 1), 8); if (n & (2 * sizeof(int8_t))) { y[0] = (int8_t) vacc0; y[1] = (int8_t) vacc1; - vacc0 = __ssat(asr_s32(vacc2, 1), 8); + vacc0 = __ssat(math_asr_s32(vacc2, 1), 8); y += 2; } if (n & (1 * sizeof(int8_t))) { diff --git a/src/qs8-vcvt/gen/vcvt-scalar-x1.c b/src/qs8-vcvt/gen/vcvt-scalar-x1.c index 211188be4..9424bb751 100644 --- a/src/qs8-vcvt/gen/vcvt-scalar-x1.c +++ b/src/qs8-vcvt/gen/vcvt-scalar-x1.c @@ -25,7 +25,7 @@ void xnn_qs8_vcvt_ukernel__scalar_x1( int32_t vacc = *x++; vacc = vbias + vacc * vmultiplier; - int32_t vout = asr_s32(vacc, 8); + int32_t vout = math_asr_s32(vacc, 8); vout = math_max_s32(vout, -128); vout = math_min_s32(vout, 127); *y++ = (int8_t) vout; diff --git a/src/qs8-vcvt/gen/vcvt-scalar-x2.c b/src/qs8-vcvt/gen/vcvt-scalar-x2.c index 93b0327d5..5501229c0 100644 --- a/src/qs8-vcvt/gen/vcvt-scalar-x2.c +++ b/src/qs8-vcvt/gen/vcvt-scalar-x2.c @@ -29,8 +29,8 @@ void xnn_qs8_vcvt_ukernel__scalar_x2( vacc0 = vbias + vacc0 * vmultiplier; vacc1 = vbias + vacc1 * vmultiplier; - int32_t vout0 = asr_s32(vacc0, 8); - int32_t vout1 = asr_s32(vacc1, 8); + int32_t vout0 = math_asr_s32(vacc0, 8); + int32_t vout1 = math_asr_s32(vacc1, 8); vout0 = math_max_s32(vout0, -128); vout1 = math_max_s32(vout1, -128); @@ -46,7 +46,7 @@ void xnn_qs8_vcvt_ukernel__scalar_x2( int32_t vacc = *x; vacc = vbias + vacc * vmultiplier; - int32_t vout = asr_s32(vacc, 8); + int32_t vout = math_asr_s32(vacc, 8); vout = math_max_s32(vout, -128); vout = math_min_s32(vout, 127); *y = (int8_t) vout; diff --git a/src/qs8-vcvt/gen/vcvt-scalar-x4.c b/src/qs8-vcvt/gen/vcvt-scalar-x4.c index db786c92b..44c2ff627 100644 --- a/src/qs8-vcvt/gen/vcvt-scalar-x4.c +++ b/src/qs8-vcvt/gen/vcvt-scalar-x4.c @@ -33,10 +33,10 @@ void xnn_qs8_vcvt_ukernel__scalar_x4( vacc2 = vbias + vacc2 * vmultiplier; vacc3 = vbias + vacc3 * vmultiplier; - int32_t vout0 = asr_s32(vacc0, 8); - int32_t vout1 = asr_s32(vacc1, 8); - int32_t vout2 = asr_s32(vacc2, 8); - int32_t vout3 = asr_s32(vacc3, 8); + int32_t vout0 = math_asr_s32(vacc0, 8); + int32_t vout1 = math_asr_s32(vacc1, 8); + int32_t vout2 = math_asr_s32(vacc2, 8); + int32_t vout3 = math_asr_s32(vacc3, 8); vout0 = math_max_s32(vout0, -128); vout1 = math_max_s32(vout1, -128); @@ -59,7 +59,7 @@ void xnn_qs8_vcvt_ukernel__scalar_x4( int32_t vacc = *x++; vacc = vbias + vacc * vmultiplier; - int32_t vout = asr_s32(vacc, 8); + int32_t vout = math_asr_s32(vacc, 8); vout = math_max_s32(vout, -128); vout = math_min_s32(vout, 127); *y++ = (int8_t) vout; diff --git a/src/qs8-vcvt/scalar.c.in b/src/qs8-vcvt/scalar.c.in index 23270f573..284876e8c 100644 --- a/src/qs8-vcvt/scalar.c.in +++ b/src/qs8-vcvt/scalar.c.in @@ -28,7 +28,7 @@ void xnn_${DATATYPE.lower()}_vcvt_ukernel__scalar_x${BATCH_TILE}( int32_t vacc = *x++; vacc = vbias + vacc * vmultiplier; - int32_t vout = asr_s32(vacc, 8); + int32_t vout = math_asr_s32(vacc, 8); vout = math_max_s32(vout, ${OUTPUT_MIN}); vout = math_min_s32(vout, ${OUTPUT_MAX}); *y++ = (${XINT8_T}) vout; @@ -45,7 +45,7 @@ void xnn_${DATATYPE.lower()}_vcvt_ukernel__scalar_x${BATCH_TILE}( vacc${ABC[N]} = vbias + vacc${ABC[N]} * vmultiplier; $for N in range(BATCH_TILE): - int32_t vout${ABC[N]} = asr_s32(vacc${ABC[N]}, 8); + int32_t vout${ABC[N]} = math_asr_s32(vacc${ABC[N]}, 8); $for N in range(BATCH_TILE): vout${ABC[N]} = math_max_s32(vout${ABC[N]}, ${OUTPUT_MIN}); @@ -62,7 +62,7 @@ void xnn_${DATATYPE.lower()}_vcvt_ukernel__scalar_x${BATCH_TILE}( int32_t vacc = *x; vacc = vbias + vacc * vmultiplier; - int32_t vout = asr_s32(vacc, 8); + int32_t vout = math_asr_s32(vacc, 8); vout = math_max_s32(vout, ${OUTPUT_MIN}); vout = math_min_s32(vout, ${OUTPUT_MAX}); *y = (${XINT8_T}) vout; @@ -71,7 +71,7 @@ void xnn_${DATATYPE.lower()}_vcvt_ukernel__scalar_x${BATCH_TILE}( int32_t vacc = *x++; vacc = vbias + vacc * vmultiplier; - int32_t vout = asr_s32(vacc, 8); + int32_t vout = math_asr_s32(vacc, 8); vout = math_max_s32(vout, ${OUTPUT_MIN}); vout = math_min_s32(vout, ${OUTPUT_MAX}); *y++ = (${XINT8_T}) vout; diff --git a/src/qs8-vlrelu/armv6simd.c.in b/src/qs8-vlrelu/armv6simd.c.in index d9f3d07ac..302e0c3fc 100644 --- a/src/qs8-vlrelu/armv6simd.c.in +++ b/src/qs8-vlrelu/armv6simd.c.in @@ -56,7 +56,7 @@ void xnn_${DATATYPE.lower()}_vlrelu_ukernel__armv6simd_x${BATCH_TILE}( int32_t vacc${ABC[N+3]} = __smlatt(vmultiplier${ABC[N+1]}${ABC[N+3]}, vx${ABC[N+1]}${ABC[N+3]}, vbias); $for N in range(BATCH_TILE): - vacc${ABC[N]} = ${__XSAT}(asr_s32(vacc${ABC[N]}, 8), 8); + vacc${ABC[N]} = ${__XSAT}(math_asr_s32(vacc${ABC[N]}, 8), 8); $for N in range(BATCH_TILE): y[${N}] = (${XINT8_T}) vacc${ABC[N]}; @@ -79,10 +79,10 @@ void xnn_${DATATYPE.lower()}_vlrelu_ukernel__armv6simd_x${BATCH_TILE}( int32_t vacc2 = __smlatt(vmultiplier02, vx02, vbias); int32_t vacc3 = __smlatt(vmultiplier13, vx13, vbias); - vacc0 = ${__XSAT}(asr_s32(vacc0, 8), 8); - vacc1 = ${__XSAT}(asr_s32(vacc1, 8), 8); - vacc2 = ${__XSAT}(asr_s32(vacc2, 8), 8); - vacc3 = ${__XSAT}(asr_s32(vacc3, 8), 8); + vacc0 = ${__XSAT}(math_asr_s32(vacc0, 8), 8); + vacc1 = ${__XSAT}(math_asr_s32(vacc1, 8), 8); + vacc2 = ${__XSAT}(math_asr_s32(vacc2, 8), 8); + vacc3 = ${__XSAT}(math_asr_s32(vacc3, 8), 8); y[0] = (${XINT8_T}) vacc0; y[1] = (${XINT8_T}) vacc1; @@ -105,13 +105,13 @@ void xnn_${DATATYPE.lower()}_vlrelu_ukernel__armv6simd_x${BATCH_TILE}( int32_t vacc1 = __smlabb(vmultiplier13, vx13, vbias); const int32_t vacc2 = __smlatt(vmultiplier02, vx02, vbias); - vacc0 = ${__XSAT}(asr_s32(vacc0, 8), 8); - vacc1 = ${__XSAT}(asr_s32(vacc1, 8), 8); + vacc0 = ${__XSAT}(math_asr_s32(vacc0, 8), 8); + vacc1 = ${__XSAT}(math_asr_s32(vacc1, 8), 8); if (n & (2 * sizeof(${XINT8_T}))) { y[0] = (${XINT8_T}) vacc0; y[1] = (${XINT8_T}) vacc1; - vacc0 = ${__XSAT}(asr_s32(vacc2, 8), 8); + vacc0 = ${__XSAT}(math_asr_s32(vacc2, 8), 8); y += 2; } if (n & (1 * sizeof(${XINT8_T}))) { diff --git a/src/qs8-vlrelu/gen/vlrelu-armv6simd-x4.c b/src/qs8-vlrelu/gen/vlrelu-armv6simd-x4.c index d99ad2e45..aa298b9b7 100644 --- a/src/qs8-vlrelu/gen/vlrelu-armv6simd-x4.c +++ b/src/qs8-vlrelu/gen/vlrelu-armv6simd-x4.c @@ -44,10 +44,10 @@ void xnn_qs8_vlrelu_ukernel__armv6simd_x4( int32_t vacc2 = __smlatt(vmultiplier02, vx02, vbias); int32_t vacc3 = __smlatt(vmultiplier13, vx13, vbias); - vacc0 = __ssat(asr_s32(vacc0, 8), 8); - vacc1 = __ssat(asr_s32(vacc1, 8), 8); - vacc2 = __ssat(asr_s32(vacc2, 8), 8); - vacc3 = __ssat(asr_s32(vacc3, 8), 8); + vacc0 = __ssat(math_asr_s32(vacc0, 8), 8); + vacc1 = __ssat(math_asr_s32(vacc1, 8), 8); + vacc2 = __ssat(math_asr_s32(vacc2, 8), 8); + vacc3 = __ssat(math_asr_s32(vacc3, 8), 8); y[0] = (int8_t) vacc0; y[1] = (int8_t) vacc1; @@ -70,13 +70,13 @@ void xnn_qs8_vlrelu_ukernel__armv6simd_x4( int32_t vacc1 = __smlabb(vmultiplier13, vx13, vbias); const int32_t vacc2 = __smlatt(vmultiplier02, vx02, vbias); - vacc0 = __ssat(asr_s32(vacc0, 8), 8); - vacc1 = __ssat(asr_s32(vacc1, 8), 8); + vacc0 = __ssat(math_asr_s32(vacc0, 8), 8); + vacc1 = __ssat(math_asr_s32(vacc1, 8), 8); if (n & (2 * sizeof(int8_t))) { y[0] = (int8_t) vacc0; y[1] = (int8_t) vacc1; - vacc0 = __ssat(asr_s32(vacc2, 8), 8); + vacc0 = __ssat(math_asr_s32(vacc2, 8), 8); y += 2; } if (n & (1 * sizeof(int8_t))) { diff --git a/src/qs8-vlrelu/gen/vlrelu-armv6simd-x8.c b/src/qs8-vlrelu/gen/vlrelu-armv6simd-x8.c index 1d4e233ac..053e92ace 100644 --- a/src/qs8-vlrelu/gen/vlrelu-armv6simd-x8.c +++ b/src/qs8-vlrelu/gen/vlrelu-armv6simd-x8.c @@ -55,14 +55,14 @@ void xnn_qs8_vlrelu_ukernel__armv6simd_x8( int32_t vacc6 = __smlatt(vmultiplier46, vx46, vbias); int32_t vacc7 = __smlatt(vmultiplier57, vx57, vbias); - vacc0 = __ssat(asr_s32(vacc0, 8), 8); - vacc1 = __ssat(asr_s32(vacc1, 8), 8); - vacc2 = __ssat(asr_s32(vacc2, 8), 8); - vacc3 = __ssat(asr_s32(vacc3, 8), 8); - vacc4 = __ssat(asr_s32(vacc4, 8), 8); - vacc5 = __ssat(asr_s32(vacc5, 8), 8); - vacc6 = __ssat(asr_s32(vacc6, 8), 8); - vacc7 = __ssat(asr_s32(vacc7, 8), 8); + vacc0 = __ssat(math_asr_s32(vacc0, 8), 8); + vacc1 = __ssat(math_asr_s32(vacc1, 8), 8); + vacc2 = __ssat(math_asr_s32(vacc2, 8), 8); + vacc3 = __ssat(math_asr_s32(vacc3, 8), 8); + vacc4 = __ssat(math_asr_s32(vacc4, 8), 8); + vacc5 = __ssat(math_asr_s32(vacc5, 8), 8); + vacc6 = __ssat(math_asr_s32(vacc6, 8), 8); + vacc7 = __ssat(math_asr_s32(vacc7, 8), 8); y[0] = (int8_t) vacc0; y[1] = (int8_t) vacc1; @@ -91,10 +91,10 @@ void xnn_qs8_vlrelu_ukernel__armv6simd_x8( int32_t vacc2 = __smlatt(vmultiplier02, vx02, vbias); int32_t vacc3 = __smlatt(vmultiplier13, vx13, vbias); - vacc0 = __ssat(asr_s32(vacc0, 8), 8); - vacc1 = __ssat(asr_s32(vacc1, 8), 8); - vacc2 = __ssat(asr_s32(vacc2, 8), 8); - vacc3 = __ssat(asr_s32(vacc3, 8), 8); + vacc0 = __ssat(math_asr_s32(vacc0, 8), 8); + vacc1 = __ssat(math_asr_s32(vacc1, 8), 8); + vacc2 = __ssat(math_asr_s32(vacc2, 8), 8); + vacc3 = __ssat(math_asr_s32(vacc3, 8), 8); y[0] = (int8_t) vacc0; y[1] = (int8_t) vacc1; @@ -117,13 +117,13 @@ void xnn_qs8_vlrelu_ukernel__armv6simd_x8( int32_t vacc1 = __smlabb(vmultiplier13, vx13, vbias); const int32_t vacc2 = __smlatt(vmultiplier02, vx02, vbias); - vacc0 = __ssat(asr_s32(vacc0, 8), 8); - vacc1 = __ssat(asr_s32(vacc1, 8), 8); + vacc0 = __ssat(math_asr_s32(vacc0, 8), 8); + vacc1 = __ssat(math_asr_s32(vacc1, 8), 8); if (n & (2 * sizeof(int8_t))) { y[0] = (int8_t) vacc0; y[1] = (int8_t) vacc1; - vacc0 = __ssat(asr_s32(vacc2, 8), 8); + vacc0 = __ssat(math_asr_s32(vacc2, 8), 8); y += 2; } if (n & (1 * sizeof(int8_t))) { diff --git a/src/qs8-vlrelu/gen/vlrelu-scalar-andxor-x1.c b/src/qs8-vlrelu/gen/vlrelu-scalar-andxor-x1.c index e5d8fd059..bd790691a 100644 --- a/src/qs8-vlrelu/gen/vlrelu-scalar-andxor-x1.c +++ b/src/qs8-vlrelu/gen/vlrelu-scalar-andxor-x1.c @@ -25,10 +25,10 @@ void xnn_qs8_vlrelu_ukernel__scalar_andxor_x1( const int32_t vbias = params->scalar_andxor.bias; do { int32_t vacc = (int32_t) *x++ - vinput_zero_point; - const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & asr_s32(vacc, 31)); + const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & math_asr_s32(vacc, 31)); vacc = vbias + vacc * vmultiplier; - int32_t vout = asr_s32(vacc, 8); + int32_t vout = math_asr_s32(vacc, 8); vout = math_max_s32(vout, -128); vout = math_min_s32(vout, 127); *y++ = (int8_t) vout; diff --git a/src/qs8-vlrelu/gen/vlrelu-scalar-andxor-x2.c b/src/qs8-vlrelu/gen/vlrelu-scalar-andxor-x2.c index 5d52c2724..7e1f97862 100644 --- a/src/qs8-vlrelu/gen/vlrelu-scalar-andxor-x2.c +++ b/src/qs8-vlrelu/gen/vlrelu-scalar-andxor-x2.c @@ -31,8 +31,8 @@ void xnn_qs8_vlrelu_ukernel__scalar_andxor_x2( vacc0 -= vinput_zero_point; vacc1 -= vinput_zero_point; - int32_t vmultiplier0 = asr_s32(vacc0, 31); - int32_t vmultiplier1 = asr_s32(vacc1, 31); + int32_t vmultiplier0 = math_asr_s32(vacc0, 31); + int32_t vmultiplier1 = math_asr_s32(vacc1, 31); vmultiplier0 &= vmultiplier_diff; vmultiplier1 &= vmultiplier_diff; @@ -43,8 +43,8 @@ void xnn_qs8_vlrelu_ukernel__scalar_andxor_x2( vacc0 = vbias + vacc0 * vmultiplier0; vacc1 = vbias + vacc1 * vmultiplier1; - int32_t vout0 = asr_s32(vacc0, 8); - int32_t vout1 = asr_s32(vacc1, 8); + int32_t vout0 = math_asr_s32(vacc0, 8); + int32_t vout1 = math_asr_s32(vacc1, 8); vout0 = math_max_s32(vout0, -128); vout1 = math_max_s32(vout1, -128); @@ -58,10 +58,10 @@ void xnn_qs8_vlrelu_ukernel__scalar_andxor_x2( } if XNN_UNLIKELY(n != 0) { int32_t vacc = (int32_t) *x++ - vinput_zero_point; - const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & asr_s32(vacc, 31)); + const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & math_asr_s32(vacc, 31)); vacc = vbias + vacc * vmultiplier; - int32_t vout = asr_s32(vacc, 8); + int32_t vout = math_asr_s32(vacc, 8); vout = math_max_s32(vout, -128); vout = math_min_s32(vout, 127); *y = (int8_t) vout; diff --git a/src/qs8-vlrelu/gen/vlrelu-scalar-andxor-x4.c b/src/qs8-vlrelu/gen/vlrelu-scalar-andxor-x4.c index 00043810f..159482303 100644 --- a/src/qs8-vlrelu/gen/vlrelu-scalar-andxor-x4.c +++ b/src/qs8-vlrelu/gen/vlrelu-scalar-andxor-x4.c @@ -35,10 +35,10 @@ void xnn_qs8_vlrelu_ukernel__scalar_andxor_x4( vacc2 -= vinput_zero_point; vacc3 -= vinput_zero_point; - int32_t vmultiplier0 = asr_s32(vacc0, 31); - int32_t vmultiplier1 = asr_s32(vacc1, 31); - int32_t vmultiplier2 = asr_s32(vacc2, 31); - int32_t vmultiplier3 = asr_s32(vacc3, 31); + int32_t vmultiplier0 = math_asr_s32(vacc0, 31); + int32_t vmultiplier1 = math_asr_s32(vacc1, 31); + int32_t vmultiplier2 = math_asr_s32(vacc2, 31); + int32_t vmultiplier3 = math_asr_s32(vacc3, 31); vmultiplier0 &= vmultiplier_diff; vmultiplier1 &= vmultiplier_diff; @@ -55,10 +55,10 @@ void xnn_qs8_vlrelu_ukernel__scalar_andxor_x4( vacc2 = vbias + vacc2 * vmultiplier2; vacc3 = vbias + vacc3 * vmultiplier3; - int32_t vout0 = asr_s32(vacc0, 8); - int32_t vout1 = asr_s32(vacc1, 8); - int32_t vout2 = asr_s32(vacc2, 8); - int32_t vout3 = asr_s32(vacc3, 8); + int32_t vout0 = math_asr_s32(vacc0, 8); + int32_t vout1 = math_asr_s32(vacc1, 8); + int32_t vout2 = math_asr_s32(vacc2, 8); + int32_t vout3 = math_asr_s32(vacc3, 8); vout0 = math_max_s32(vout0, -128); vout1 = math_max_s32(vout1, -128); @@ -79,10 +79,10 @@ void xnn_qs8_vlrelu_ukernel__scalar_andxor_x4( if XNN_UNLIKELY(n != 0) { do { int32_t vacc = (int32_t) *x++ - vinput_zero_point; - const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & asr_s32(vacc, 31)); + const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & math_asr_s32(vacc, 31)); vacc = vbias + vacc * vmultiplier; - int32_t vout = asr_s32(vacc, 8); + int32_t vout = math_asr_s32(vacc, 8); vout = math_max_s32(vout, -128); vout = math_min_s32(vout, 127); *y++ = (int8_t) vout; diff --git a/src/qs8-vlrelu/gen/vlrelu-scalar-select-x1.c b/src/qs8-vlrelu/gen/vlrelu-scalar-select-x1.c index 59f41c68e..ed0ad7235 100644 --- a/src/qs8-vlrelu/gen/vlrelu-scalar-select-x1.c +++ b/src/qs8-vlrelu/gen/vlrelu-scalar-select-x1.c @@ -28,7 +28,7 @@ void xnn_qs8_vlrelu_ukernel__scalar_select_x1( const int32_t vmultiplier = XNN_UNPREDICTABLE(vacc >= 0) ? vpositive_multiplier : vnegative_multiplier; vacc = vbias + vacc * vmultiplier; - int32_t vout = asr_s32(vacc, 8); + int32_t vout = math_asr_s32(vacc, 8); vout = math_max_s32(vout, -128); vout = math_min_s32(vout, 127); *y++ = (int8_t) vout; diff --git a/src/qs8-vlrelu/gen/vlrelu-scalar-select-x2.c b/src/qs8-vlrelu/gen/vlrelu-scalar-select-x2.c index ffac12f0a..9c9925de0 100644 --- a/src/qs8-vlrelu/gen/vlrelu-scalar-select-x2.c +++ b/src/qs8-vlrelu/gen/vlrelu-scalar-select-x2.c @@ -37,8 +37,8 @@ void xnn_qs8_vlrelu_ukernel__scalar_select_x2( vacc0 = vbias + vacc0 * vmultiplier0; vacc1 = vbias + vacc1 * vmultiplier1; - int32_t vout0 = asr_s32(vacc0, 8); - int32_t vout1 = asr_s32(vacc1, 8); + int32_t vout0 = math_asr_s32(vacc0, 8); + int32_t vout1 = math_asr_s32(vacc1, 8); vout0 = math_max_s32(vout0, -128); vout1 = math_max_s32(vout1, -128); @@ -55,7 +55,7 @@ void xnn_qs8_vlrelu_ukernel__scalar_select_x2( const int32_t vmultiplier = XNN_UNPREDICTABLE(vacc >= 0) ? vpositive_multiplier : vnegative_multiplier; vacc = vbias + vacc * vmultiplier; - int32_t vout = asr_s32(vacc, 8); + int32_t vout = math_asr_s32(vacc, 8); vout = math_max_s32(vout, -128); vout = math_min_s32(vout, 127); *y = (int8_t) vout; diff --git a/src/qs8-vlrelu/gen/vlrelu-scalar-select-x4.c b/src/qs8-vlrelu/gen/vlrelu-scalar-select-x4.c index 3e9d93412..480febd29 100644 --- a/src/qs8-vlrelu/gen/vlrelu-scalar-select-x4.c +++ b/src/qs8-vlrelu/gen/vlrelu-scalar-select-x4.c @@ -45,10 +45,10 @@ void xnn_qs8_vlrelu_ukernel__scalar_select_x4( vacc2 = vbias + vacc2 * vmultiplier2; vacc3 = vbias + vacc3 * vmultiplier3; - int32_t vout0 = asr_s32(vacc0, 8); - int32_t vout1 = asr_s32(vacc1, 8); - int32_t vout2 = asr_s32(vacc2, 8); - int32_t vout3 = asr_s32(vacc3, 8); + int32_t vout0 = math_asr_s32(vacc0, 8); + int32_t vout1 = math_asr_s32(vacc1, 8); + int32_t vout2 = math_asr_s32(vacc2, 8); + int32_t vout3 = math_asr_s32(vacc3, 8); vout0 = math_max_s32(vout0, -128); vout1 = math_max_s32(vout1, -128); @@ -72,7 +72,7 @@ void xnn_qs8_vlrelu_ukernel__scalar_select_x4( const int32_t vmultiplier = XNN_UNPREDICTABLE(vacc >= 0) ? vpositive_multiplier : vnegative_multiplier; vacc = vbias + vacc * vmultiplier; - int32_t vout = asr_s32(vacc, 8); + int32_t vout = math_asr_s32(vacc, 8); vout = math_max_s32(vout, -128); vout = math_min_s32(vout, 127); *y++ = (int8_t) vout; diff --git a/src/qs8-vlrelu/scalar-andxor.c.in b/src/qs8-vlrelu/scalar-andxor.c.in index 36b396d86..54083d7b6 100644 --- a/src/qs8-vlrelu/scalar-andxor.c.in +++ b/src/qs8-vlrelu/scalar-andxor.c.in @@ -28,10 +28,10 @@ void xnn_${DATATYPE.lower()}_vlrelu_ukernel__scalar_andxor_x${BATCH_TILE}( $if BATCH_TILE == 1: do { int32_t vacc = (int32_t) *x++ - vinput_zero_point; - const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & asr_s32(vacc, 31)); + const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & math_asr_s32(vacc, 31)); vacc = vbias + vacc * vmultiplier; - int32_t vout = asr_s32(vacc, 8); + int32_t vout = math_asr_s32(vacc, 8); vout = math_max_s32(vout, ${OUTPUT_MIN}); vout = math_min_s32(vout, ${OUTPUT_MAX}); *y++ = (${XINT8_T}) vout; @@ -48,7 +48,7 @@ void xnn_${DATATYPE.lower()}_vlrelu_ukernel__scalar_andxor_x${BATCH_TILE}( vacc${ABC[N]} -= vinput_zero_point; $for N in range(BATCH_TILE): - int32_t vmultiplier${ABC[N]} = asr_s32(vacc${ABC[N]}, 31); + int32_t vmultiplier${ABC[N]} = math_asr_s32(vacc${ABC[N]}, 31); $for N in range(BATCH_TILE): vmultiplier${ABC[N]} &= vmultiplier_diff; @@ -60,7 +60,7 @@ void xnn_${DATATYPE.lower()}_vlrelu_ukernel__scalar_andxor_x${BATCH_TILE}( vacc${ABC[N]} = vbias + vacc${ABC[N]} * vmultiplier${ABC[N]}; $for N in range(BATCH_TILE): - int32_t vout${ABC[N]} = asr_s32(vacc${ABC[N]}, 8); + int32_t vout${ABC[N]} = math_asr_s32(vacc${ABC[N]}, 8); $for N in range(BATCH_TILE): vout${ABC[N]} = math_max_s32(vout${ABC[N]}, ${OUTPUT_MIN}); @@ -75,20 +75,20 @@ void xnn_${DATATYPE.lower()}_vlrelu_ukernel__scalar_andxor_x${BATCH_TILE}( if XNN_UNLIKELY(n != 0) { $if BATCH_TILE == 2: int32_t vacc = (int32_t) *x++ - vinput_zero_point; - const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & asr_s32(vacc, 31)); + const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & math_asr_s32(vacc, 31)); vacc = vbias + vacc * vmultiplier; - int32_t vout = asr_s32(vacc, 8); + int32_t vout = math_asr_s32(vacc, 8); vout = math_max_s32(vout, ${OUTPUT_MIN}); vout = math_min_s32(vout, ${OUTPUT_MAX}); *y = (${XINT8_T}) vout; $else: do { int32_t vacc = (int32_t) *x++ - vinput_zero_point; - const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & asr_s32(vacc, 31)); + const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & math_asr_s32(vacc, 31)); vacc = vbias + vacc * vmultiplier; - int32_t vout = asr_s32(vacc, 8); + int32_t vout = math_asr_s32(vacc, 8); vout = math_max_s32(vout, ${OUTPUT_MIN}); vout = math_min_s32(vout, ${OUTPUT_MAX}); *y++ = (${XINT8_T}) vout; diff --git a/src/qs8-vlrelu/scalar-select.c.in b/src/qs8-vlrelu/scalar-select.c.in index 83723566f..44dc0ada6 100644 --- a/src/qs8-vlrelu/scalar-select.c.in +++ b/src/qs8-vlrelu/scalar-select.c.in @@ -31,7 +31,7 @@ void xnn_${DATATYPE.lower()}_vlrelu_ukernel__scalar_select_x${BATCH_TILE}( const int32_t vmultiplier = XNN_UNPREDICTABLE(vacc >= 0) ? vpositive_multiplier : vnegative_multiplier; vacc = vbias + vacc * vmultiplier; - int32_t vout = asr_s32(vacc, 8); + int32_t vout = math_asr_s32(vacc, 8); vout = math_max_s32(vout, ${OUTPUT_MIN}); vout = math_min_s32(vout, ${OUTPUT_MAX}); *y++ = (${XINT8_T}) vout; @@ -54,7 +54,7 @@ void xnn_${DATATYPE.lower()}_vlrelu_ukernel__scalar_select_x${BATCH_TILE}( vacc${ABC[N]} = vbias + vacc${ABC[N]} * vmultiplier${ABC[N]}; $for N in range(BATCH_TILE): - int32_t vout${ABC[N]} = asr_s32(vacc${ABC[N]}, 8); + int32_t vout${ABC[N]} = math_asr_s32(vacc${ABC[N]}, 8); $for N in range(BATCH_TILE): vout${ABC[N]} = math_max_s32(vout${ABC[N]}, ${OUTPUT_MIN}); @@ -72,7 +72,7 @@ void xnn_${DATATYPE.lower()}_vlrelu_ukernel__scalar_select_x${BATCH_TILE}( const int32_t vmultiplier = XNN_UNPREDICTABLE(vacc >= 0) ? vpositive_multiplier : vnegative_multiplier; vacc = vbias + vacc * vmultiplier; - int32_t vout = asr_s32(vacc, 8); + int32_t vout = math_asr_s32(vacc, 8); vout = math_max_s32(vout, ${OUTPUT_MIN}); vout = math_min_s32(vout, ${OUTPUT_MAX}); *y = (${XINT8_T}) vout; @@ -82,7 +82,7 @@ void xnn_${DATATYPE.lower()}_vlrelu_ukernel__scalar_select_x${BATCH_TILE}( const int32_t vmultiplier = XNN_UNPREDICTABLE(vacc >= 0) ? vpositive_multiplier : vnegative_multiplier; vacc = vbias + vacc * vmultiplier; - int32_t vout = asr_s32(vacc, 8); + int32_t vout = math_asr_s32(vacc, 8); vout = math_max_s32(vout, ${OUTPUT_MIN}); vout = math_min_s32(vout, ${OUTPUT_MAX}); *y++ = (${XINT8_T}) vout; diff --git a/src/qu8-avgpool/9p8x-minmax-scalar-c1.c b/src/qu8-avgpool/9p8x-minmax-scalar-c1.c index feeb859fc..dab8b0111 100644 --- a/src/qu8-avgpool/9p8x-minmax-scalar-c1.c +++ b/src/qu8-avgpool/9p8x-minmax-scalar-c1.c @@ -272,7 +272,7 @@ void xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1( const int64_t vproduct = (int64_t) vacc * (int64_t) vmultiplier; const int64_t vadjusted_product = vproduct - (int64_t) (vacc < 0); - int32_t vout = (int32_t) asr_s64(vadjusted_product + vrounding, vshift); + int32_t vout = (int32_t) math_asr_s64(vadjusted_product + vrounding, vshift); vout = vout < voutput_min ? voutput_min : vout; vout = vout > voutput_max ? voutput_max : vout; vout += voutput_zero_point; diff --git a/src/qu8-avgpool/9x-minmax-scalar-c1.c b/src/qu8-avgpool/9x-minmax-scalar-c1.c index ca66f5cad..127e57226 100644 --- a/src/qu8-avgpool/9x-minmax-scalar-c1.c +++ b/src/qu8-avgpool/9x-minmax-scalar-c1.c @@ -133,7 +133,7 @@ void xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1( const int64_t vproduct = (int64_t) vacc * (int64_t) vmultiplier; const int64_t vadjusted_product = vproduct - (int64_t) (vacc < 0); - int32_t vout = (int32_t) asr_s64(vadjusted_product + vrounding, vshift); + int32_t vout = (int32_t) math_asr_s64(vadjusted_product + vrounding, vshift); vout = vout < voutput_min ? voutput_min : vout; vout = vout > voutput_max ? voutput_max : vout; vout += voutput_zero_point; diff --git a/src/qu8-requantization/gemmlowp-scalar.c b/src/qu8-requantization/gemmlowp-scalar.c index 8d6ea0967..ecbbe2647 100644 --- a/src/qu8-requantization/gemmlowp-scalar.c +++ b/src/qu8-requantization/gemmlowp-scalar.c @@ -100,10 +100,10 @@ void xnn_qu8_requantize_gemmlowp__scalar( const int32_t z_remainder = (z_q31product & remainder_mask) - (int32_t) (z_q31product < 0); const int32_t w_remainder = (w_q31product & remainder_mask) - (int32_t) (w_q31product < 0); - const int32_t x_scaled = asr_s32(x_q31product, shift) + (int32_t) (x_remainder > threshold); - const int32_t y_scaled = asr_s32(y_q31product, shift) + (int32_t) (y_remainder > threshold); - const int32_t z_scaled = asr_s32(z_q31product, shift) + (int32_t) (z_remainder > threshold); - const int32_t w_scaled = asr_s32(w_q31product, shift) + (int32_t) (w_remainder > threshold); + const int32_t x_scaled = math_asr_s32(x_q31product, shift) + (int32_t) (x_remainder > threshold); + const int32_t y_scaled = math_asr_s32(y_q31product, shift) + (int32_t) (y_remainder > threshold); + const int32_t z_scaled = math_asr_s32(z_q31product, shift) + (int32_t) (z_remainder > threshold); + const int32_t w_scaled = math_asr_s32(w_q31product, shift) + (int32_t) (w_remainder > threshold); // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point). const int32_t x_clamped = math_min_s32(math_max_s32(x_scaled, smin), smax); diff --git a/src/qu8-requantization/rndna-scalar-signed64.c b/src/qu8-requantization/rndna-scalar-signed64.c index 904cf90df..e70c1f2eb 100644 --- a/src/qu8-requantization/rndna-scalar-signed64.c +++ b/src/qu8-requantization/rndna-scalar-signed64.c @@ -65,10 +65,10 @@ void xnn_qu8_requantize_rndna__scalar_signed64( // Note that although rounding is precomputed, it is dependent on shift value, and on processors with 64-bit // "right shift with rounding" instruction each line below can be represented by just one such instruction // (e.g. VRSHL.S64 on ARM NEON, SRSHL in ARM64 Advanced SIMD). - const int32_t x_scaled = (int32_t) asr_s64(x_adjusted_product + rounding, shift); - const int32_t y_scaled = (int32_t) asr_s64(y_adjusted_product + rounding, shift); - const int32_t z_scaled = (int32_t) asr_s64(z_adjusted_product + rounding, shift); - const int32_t w_scaled = (int32_t) asr_s64(w_adjusted_product + rounding, shift); + const int32_t x_scaled = (int32_t) math_asr_s64(x_adjusted_product + rounding, shift); + const int32_t y_scaled = (int32_t) math_asr_s64(y_adjusted_product + rounding, shift); + const int32_t z_scaled = (int32_t) math_asr_s64(z_adjusted_product + rounding, shift); + const int32_t w_scaled = (int32_t) math_asr_s64(w_adjusted_product + rounding, shift); // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point). const int32_t x_clamped = math_min_s32(math_max_s32(x_scaled, smin), smax); diff --git a/src/qu8-vadd/gen/minmax-scalar-x1.c b/src/qu8-vadd/gen/minmax-scalar-x1.c index 79e3ee526..3fb515adf 100644 --- a/src/qu8-vadd/gen/minmax-scalar-x1.c +++ b/src/qu8-vadd/gen/minmax-scalar-x1.c @@ -33,7 +33,7 @@ void xnn_qu8_vadd_minmax_ukernel__scalar_x1( const int32_t vb = *input_b++; const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier; - int32_t vout = asr_s32(vacc, vshift); + int32_t vout = math_asr_s32(vacc, vshift); vout = math_max_s32(vout, voutput_min_less_zero_point); vout = math_min_s32(vout, voutput_max_less_zero_point); *output++ = (uint8_t) (vout + voutput_zero_point); diff --git a/src/qu8-vadd/gen/minmax-scalar-x2.c b/src/qu8-vadd/gen/minmax-scalar-x2.c index 05203dcb7..f65227db2 100644 --- a/src/qu8-vadd/gen/minmax-scalar-x2.c +++ b/src/qu8-vadd/gen/minmax-scalar-x2.c @@ -42,8 +42,8 @@ void xnn_qu8_vadd_minmax_ukernel__scalar_x2( vacc0 += vb0 * vb_multiplier; vacc1 += vb1 * vb_multiplier; - int32_t vout0 = asr_s32(vacc0, vshift); - int32_t vout1 = asr_s32(vacc1, vshift); + int32_t vout0 = math_asr_s32(vacc0, vshift); + int32_t vout1 = math_asr_s32(vacc1, vshift); vout0 = math_max_s32(vout0, voutput_min_less_zero_point); vout1 = math_max_s32(vout1, voutput_min_less_zero_point); @@ -63,7 +63,7 @@ void xnn_qu8_vadd_minmax_ukernel__scalar_x2( const int32_t vb = *input_b; const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier; - int32_t vout = asr_s32(vacc, vshift); + int32_t vout = math_asr_s32(vacc, vshift); vout = math_max_s32(vout, voutput_min_less_zero_point); vout = math_min_s32(vout, voutput_max_less_zero_point); *output++ = (uint8_t) (vout + voutput_zero_point); diff --git a/src/qu8-vadd/gen/minmax-scalar-x4.c b/src/qu8-vadd/gen/minmax-scalar-x4.c index f433225f0..95e4148c2 100644 --- a/src/qu8-vadd/gen/minmax-scalar-x4.c +++ b/src/qu8-vadd/gen/minmax-scalar-x4.c @@ -50,10 +50,10 @@ void xnn_qu8_vadd_minmax_ukernel__scalar_x4( vacc2 += vb2 * vb_multiplier; vacc3 += vb3 * vb_multiplier; - int32_t vout0 = asr_s32(vacc0, vshift); - int32_t vout1 = asr_s32(vacc1, vshift); - int32_t vout2 = asr_s32(vacc2, vshift); - int32_t vout3 = asr_s32(vacc3, vshift); + int32_t vout0 = math_asr_s32(vacc0, vshift); + int32_t vout1 = math_asr_s32(vacc1, vshift); + int32_t vout2 = math_asr_s32(vacc2, vshift); + int32_t vout3 = math_asr_s32(vacc3, vshift); vout0 = math_max_s32(vout0, voutput_min_less_zero_point); vout1 = math_max_s32(vout1, voutput_min_less_zero_point); @@ -82,7 +82,7 @@ void xnn_qu8_vadd_minmax_ukernel__scalar_x4( const int32_t vb = *input_b++; const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier; - int32_t vout = asr_s32(vacc, vshift); + int32_t vout = math_asr_s32(vacc, vshift); vout = math_max_s32(vout, voutput_min_less_zero_point); vout = math_min_s32(vout, voutput_max_less_zero_point); *output++ = (uint8_t) (vout + voutput_zero_point); diff --git a/src/qu8-vaddc/gen/minmax-scalar-x1.c b/src/qu8-vaddc/gen/minmax-scalar-x1.c index 096b2fc7f..22502c59c 100644 --- a/src/qu8-vaddc/gen/minmax-scalar-x1.c +++ b/src/qu8-vaddc/gen/minmax-scalar-x1.c @@ -31,7 +31,7 @@ void xnn_qu8_vaddc_minmax_ukernel__scalar_x1( const int32_t va = *input_a++; const int32_t vacc = vbias + va * va_multiplier; - int32_t vout = asr_s32(vacc, vshift); + int32_t vout = math_asr_s32(vacc, vshift); vout = math_max_s32(vout, voutput_min_less_zero_point); vout = math_min_s32(vout, voutput_max_less_zero_point); *output++ = (uint8_t) (vout + voutput_zero_point); diff --git a/src/qu8-vaddc/gen/minmax-scalar-x2.c b/src/qu8-vaddc/gen/minmax-scalar-x2.c index ff775bf14..015532884 100644 --- a/src/qu8-vaddc/gen/minmax-scalar-x2.c +++ b/src/qu8-vaddc/gen/minmax-scalar-x2.c @@ -36,8 +36,8 @@ void xnn_qu8_vaddc_minmax_ukernel__scalar_x2( const int32_t vacc1 = vbias + va1 * va_multiplier; input_b += 2; - int32_t vout0 = asr_s32(vacc0, vshift); - int32_t vout1 = asr_s32(vacc1, vshift); + int32_t vout0 = math_asr_s32(vacc0, vshift); + int32_t vout1 = math_asr_s32(vacc1, vshift); vout0 = math_max_s32(vout0, voutput_min_less_zero_point); vout1 = math_max_s32(vout1, voutput_min_less_zero_point); @@ -56,7 +56,7 @@ void xnn_qu8_vaddc_minmax_ukernel__scalar_x2( const int32_t va = *input_a; const int32_t vacc = vbias + va * va_multiplier; - int32_t vout = asr_s32(vacc, vshift); + int32_t vout = math_asr_s32(vacc, vshift); vout = math_max_s32(vout, voutput_min_less_zero_point); vout = math_min_s32(vout, voutput_max_less_zero_point); *output++ = (uint8_t) (vout + voutput_zero_point); diff --git a/src/qu8-vaddc/gen/minmax-scalar-x4.c b/src/qu8-vaddc/gen/minmax-scalar-x4.c index d77b74f0c..fc6fce367 100644 --- a/src/qu8-vaddc/gen/minmax-scalar-x4.c +++ b/src/qu8-vaddc/gen/minmax-scalar-x4.c @@ -40,10 +40,10 @@ void xnn_qu8_vaddc_minmax_ukernel__scalar_x4( const int32_t vacc3 = vbias + va3 * va_multiplier; input_b += 4; - int32_t vout0 = asr_s32(vacc0, vshift); - int32_t vout1 = asr_s32(vacc1, vshift); - int32_t vout2 = asr_s32(vacc2, vshift); - int32_t vout3 = asr_s32(vacc3, vshift); + int32_t vout0 = math_asr_s32(vacc0, vshift); + int32_t vout1 = math_asr_s32(vacc1, vshift); + int32_t vout2 = math_asr_s32(vacc2, vshift); + int32_t vout3 = math_asr_s32(vacc3, vshift); vout0 = math_max_s32(vout0, voutput_min_less_zero_point); vout1 = math_max_s32(vout1, voutput_min_less_zero_point); @@ -71,7 +71,7 @@ void xnn_qu8_vaddc_minmax_ukernel__scalar_x4( const int32_t va = *input_a++; const int32_t vacc = vbias + va * va_multiplier; - int32_t vout = asr_s32(vacc, vshift); + int32_t vout = math_asr_s32(vacc, vshift); vout = math_max_s32(vout, voutput_min_less_zero_point); vout = math_min_s32(vout, voutput_max_less_zero_point); *output++ = (uint8_t) (vout + voutput_zero_point); diff --git a/src/qu8-vcvt/gen/vcvt-armv6simd-x4.c b/src/qu8-vcvt/gen/vcvt-armv6simd-x4.c index cafbfd2e6..d78c12a62 100644 --- a/src/qu8-vcvt/gen/vcvt-armv6simd-x4.c +++ b/src/qu8-vcvt/gen/vcvt-armv6simd-x4.c @@ -38,10 +38,10 @@ void xnn_qu8_vcvt_ukernel__armv6simd_x4( int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias); int32_t vacc3 = __smlawt(vmultiplier, vx13, vbias); - vacc0 = __usat(asr_s32(vacc0, 1), 8); - vacc1 = __usat(asr_s32(vacc1, 1), 8); - vacc2 = __usat(asr_s32(vacc2, 1), 8); - vacc3 = __usat(asr_s32(vacc3, 1), 8); + vacc0 = __usat(math_asr_s32(vacc0, 1), 8); + vacc1 = __usat(math_asr_s32(vacc1, 1), 8); + vacc2 = __usat(math_asr_s32(vacc2, 1), 8); + vacc3 = __usat(math_asr_s32(vacc3, 1), 8); y[0] = (uint8_t) vacc0; y[1] = (uint8_t) vacc1; @@ -59,13 +59,13 @@ void xnn_qu8_vcvt_ukernel__armv6simd_x4( int32_t vacc1 = __smlawb(vmultiplier, vx13, vbias); const int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias); - vacc0 = __usat(asr_s32(vacc0, 1), 8); - vacc1 = __usat(asr_s32(vacc1, 1), 8); + vacc0 = __usat(math_asr_s32(vacc0, 1), 8); + vacc1 = __usat(math_asr_s32(vacc1, 1), 8); if (n & (2 * sizeof(uint8_t))) { y[0] = (uint8_t) vacc0; y[1] = (uint8_t) vacc1; - vacc0 = __usat(asr_s32(vacc2, 1), 8); + vacc0 = __usat(math_asr_s32(vacc2, 1), 8); y += 2; } if (n & (1 * sizeof(uint8_t))) { diff --git a/src/qu8-vcvt/gen/vcvt-armv6simd-x8.c b/src/qu8-vcvt/gen/vcvt-armv6simd-x8.c index cb24c6167..bd57725f4 100644 --- a/src/qu8-vcvt/gen/vcvt-armv6simd-x8.c +++ b/src/qu8-vcvt/gen/vcvt-armv6simd-x8.c @@ -45,14 +45,14 @@ void xnn_qu8_vcvt_ukernel__armv6simd_x8( int32_t vacc6 = __smlawt(vmultiplier, vx46, vbias); int32_t vacc7 = __smlawt(vmultiplier, vx57, vbias); - vacc0 = __usat(asr_s32(vacc0, 1), 8); - vacc1 = __usat(asr_s32(vacc1, 1), 8); - vacc2 = __usat(asr_s32(vacc2, 1), 8); - vacc3 = __usat(asr_s32(vacc3, 1), 8); - vacc4 = __usat(asr_s32(vacc4, 1), 8); - vacc5 = __usat(asr_s32(vacc5, 1), 8); - vacc6 = __usat(asr_s32(vacc6, 1), 8); - vacc7 = __usat(asr_s32(vacc7, 1), 8); + vacc0 = __usat(math_asr_s32(vacc0, 1), 8); + vacc1 = __usat(math_asr_s32(vacc1, 1), 8); + vacc2 = __usat(math_asr_s32(vacc2, 1), 8); + vacc3 = __usat(math_asr_s32(vacc3, 1), 8); + vacc4 = __usat(math_asr_s32(vacc4, 1), 8); + vacc5 = __usat(math_asr_s32(vacc5, 1), 8); + vacc6 = __usat(math_asr_s32(vacc6, 1), 8); + vacc7 = __usat(math_asr_s32(vacc7, 1), 8); y[0] = (uint8_t) vacc0; y[1] = (uint8_t) vacc1; @@ -76,10 +76,10 @@ void xnn_qu8_vcvt_ukernel__armv6simd_x8( int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias); int32_t vacc3 = __smlawt(vmultiplier, vx13, vbias); - vacc0 = __usat(asr_s32(vacc0, 1), 8); - vacc1 = __usat(asr_s32(vacc1, 1), 8); - vacc2 = __usat(asr_s32(vacc2, 1), 8); - vacc3 = __usat(asr_s32(vacc3, 1), 8); + vacc0 = __usat(math_asr_s32(vacc0, 1), 8); + vacc1 = __usat(math_asr_s32(vacc1, 1), 8); + vacc2 = __usat(math_asr_s32(vacc2, 1), 8); + vacc3 = __usat(math_asr_s32(vacc3, 1), 8); y[0] = (uint8_t) vacc0; y[1] = (uint8_t) vacc1; @@ -97,13 +97,13 @@ void xnn_qu8_vcvt_ukernel__armv6simd_x8( int32_t vacc1 = __smlawb(vmultiplier, vx13, vbias); const int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias); - vacc0 = __usat(asr_s32(vacc0, 1), 8); - vacc1 = __usat(asr_s32(vacc1, 1), 8); + vacc0 = __usat(math_asr_s32(vacc0, 1), 8); + vacc1 = __usat(math_asr_s32(vacc1, 1), 8); if (n & (2 * sizeof(uint8_t))) { y[0] = (uint8_t) vacc0; y[1] = (uint8_t) vacc1; - vacc0 = __usat(asr_s32(vacc2, 1), 8); + vacc0 = __usat(math_asr_s32(vacc2, 1), 8); y += 2; } if (n & (1 * sizeof(uint8_t))) { diff --git a/src/qu8-vcvt/gen/vcvt-scalar-x1.c b/src/qu8-vcvt/gen/vcvt-scalar-x1.c index b4fc8fa63..9c99ce346 100644 --- a/src/qu8-vcvt/gen/vcvt-scalar-x1.c +++ b/src/qu8-vcvt/gen/vcvt-scalar-x1.c @@ -25,7 +25,7 @@ void xnn_qu8_vcvt_ukernel__scalar_x1( int32_t vacc = *x++; vacc = vbias + vacc * vmultiplier; - int32_t vout = asr_s32(vacc, 8); + int32_t vout = math_asr_s32(vacc, 8); vout = math_max_s32(vout, 0); vout = math_min_s32(vout, 255); *y++ = (uint8_t) vout; diff --git a/src/qu8-vcvt/gen/vcvt-scalar-x2.c b/src/qu8-vcvt/gen/vcvt-scalar-x2.c index 1ceacd9b2..f5399fbf1 100644 --- a/src/qu8-vcvt/gen/vcvt-scalar-x2.c +++ b/src/qu8-vcvt/gen/vcvt-scalar-x2.c @@ -29,8 +29,8 @@ void xnn_qu8_vcvt_ukernel__scalar_x2( vacc0 = vbias + vacc0 * vmultiplier; vacc1 = vbias + vacc1 * vmultiplier; - int32_t vout0 = asr_s32(vacc0, 8); - int32_t vout1 = asr_s32(vacc1, 8); + int32_t vout0 = math_asr_s32(vacc0, 8); + int32_t vout1 = math_asr_s32(vacc1, 8); vout0 = math_max_s32(vout0, 0); vout1 = math_max_s32(vout1, 0); @@ -46,7 +46,7 @@ void xnn_qu8_vcvt_ukernel__scalar_x2( int32_t vacc = *x; vacc = vbias + vacc * vmultiplier; - int32_t vout = asr_s32(vacc, 8); + int32_t vout = math_asr_s32(vacc, 8); vout = math_max_s32(vout, 0); vout = math_min_s32(vout, 255); *y = (uint8_t) vout; diff --git a/src/qu8-vcvt/gen/vcvt-scalar-x4.c b/src/qu8-vcvt/gen/vcvt-scalar-x4.c index f1568973b..d24df8932 100644 --- a/src/qu8-vcvt/gen/vcvt-scalar-x4.c +++ b/src/qu8-vcvt/gen/vcvt-scalar-x4.c @@ -33,10 +33,10 @@ void xnn_qu8_vcvt_ukernel__scalar_x4( vacc2 = vbias + vacc2 * vmultiplier; vacc3 = vbias + vacc3 * vmultiplier; - int32_t vout0 = asr_s32(vacc0, 8); - int32_t vout1 = asr_s32(vacc1, 8); - int32_t vout2 = asr_s32(vacc2, 8); - int32_t vout3 = asr_s32(vacc3, 8); + int32_t vout0 = math_asr_s32(vacc0, 8); + int32_t vout1 = math_asr_s32(vacc1, 8); + int32_t vout2 = math_asr_s32(vacc2, 8); + int32_t vout3 = math_asr_s32(vacc3, 8); vout0 = math_max_s32(vout0, 0); vout1 = math_max_s32(vout1, 0); @@ -59,7 +59,7 @@ void xnn_qu8_vcvt_ukernel__scalar_x4( int32_t vacc = *x++; vacc = vbias + vacc * vmultiplier; - int32_t vout = asr_s32(vacc, 8); + int32_t vout = math_asr_s32(vacc, 8); vout = math_max_s32(vout, 0); vout = math_min_s32(vout, 255); *y++ = (uint8_t) vout; diff --git a/src/qu8-vlrelu/gen/vlrelu-armv6simd-x4.c b/src/qu8-vlrelu/gen/vlrelu-armv6simd-x4.c index 7cc3c3067..6f0487e7f 100644 --- a/src/qu8-vlrelu/gen/vlrelu-armv6simd-x4.c +++ b/src/qu8-vlrelu/gen/vlrelu-armv6simd-x4.c @@ -44,10 +44,10 @@ void xnn_qu8_vlrelu_ukernel__armv6simd_x4( int32_t vacc2 = __smlatt(vmultiplier02, vx02, vbias); int32_t vacc3 = __smlatt(vmultiplier13, vx13, vbias); - vacc0 = __usat(asr_s32(vacc0, 8), 8); - vacc1 = __usat(asr_s32(vacc1, 8), 8); - vacc2 = __usat(asr_s32(vacc2, 8), 8); - vacc3 = __usat(asr_s32(vacc3, 8), 8); + vacc0 = __usat(math_asr_s32(vacc0, 8), 8); + vacc1 = __usat(math_asr_s32(vacc1, 8), 8); + vacc2 = __usat(math_asr_s32(vacc2, 8), 8); + vacc3 = __usat(math_asr_s32(vacc3, 8), 8); y[0] = (uint8_t) vacc0; y[1] = (uint8_t) vacc1; @@ -70,13 +70,13 @@ void xnn_qu8_vlrelu_ukernel__armv6simd_x4( int32_t vacc1 = __smlabb(vmultiplier13, vx13, vbias); const int32_t vacc2 = __smlatt(vmultiplier02, vx02, vbias); - vacc0 = __usat(asr_s32(vacc0, 8), 8); - vacc1 = __usat(asr_s32(vacc1, 8), 8); + vacc0 = __usat(math_asr_s32(vacc0, 8), 8); + vacc1 = __usat(math_asr_s32(vacc1, 8), 8); if (n & (2 * sizeof(uint8_t))) { y[0] = (uint8_t) vacc0; y[1] = (uint8_t) vacc1; - vacc0 = __usat(asr_s32(vacc2, 8), 8); + vacc0 = __usat(math_asr_s32(vacc2, 8), 8); y += 2; } if (n & (1 * sizeof(uint8_t))) { diff --git a/src/qu8-vlrelu/gen/vlrelu-armv6simd-x8.c b/src/qu8-vlrelu/gen/vlrelu-armv6simd-x8.c index 565bb1e5f..8f3142080 100644 --- a/src/qu8-vlrelu/gen/vlrelu-armv6simd-x8.c +++ b/src/qu8-vlrelu/gen/vlrelu-armv6simd-x8.c @@ -55,14 +55,14 @@ void xnn_qu8_vlrelu_ukernel__armv6simd_x8( int32_t vacc6 = __smlatt(vmultiplier46, vx46, vbias); int32_t vacc7 = __smlatt(vmultiplier57, vx57, vbias); - vacc0 = __usat(asr_s32(vacc0, 8), 8); - vacc1 = __usat(asr_s32(vacc1, 8), 8); - vacc2 = __usat(asr_s32(vacc2, 8), 8); - vacc3 = __usat(asr_s32(vacc3, 8), 8); - vacc4 = __usat(asr_s32(vacc4, 8), 8); - vacc5 = __usat(asr_s32(vacc5, 8), 8); - vacc6 = __usat(asr_s32(vacc6, 8), 8); - vacc7 = __usat(asr_s32(vacc7, 8), 8); + vacc0 = __usat(math_asr_s32(vacc0, 8), 8); + vacc1 = __usat(math_asr_s32(vacc1, 8), 8); + vacc2 = __usat(math_asr_s32(vacc2, 8), 8); + vacc3 = __usat(math_asr_s32(vacc3, 8), 8); + vacc4 = __usat(math_asr_s32(vacc4, 8), 8); + vacc5 = __usat(math_asr_s32(vacc5, 8), 8); + vacc6 = __usat(math_asr_s32(vacc6, 8), 8); + vacc7 = __usat(math_asr_s32(vacc7, 8), 8); y[0] = (uint8_t) vacc0; y[1] = (uint8_t) vacc1; @@ -91,10 +91,10 @@ void xnn_qu8_vlrelu_ukernel__armv6simd_x8( int32_t vacc2 = __smlatt(vmultiplier02, vx02, vbias); int32_t vacc3 = __smlatt(vmultiplier13, vx13, vbias); - vacc0 = __usat(asr_s32(vacc0, 8), 8); - vacc1 = __usat(asr_s32(vacc1, 8), 8); - vacc2 = __usat(asr_s32(vacc2, 8), 8); - vacc3 = __usat(asr_s32(vacc3, 8), 8); + vacc0 = __usat(math_asr_s32(vacc0, 8), 8); + vacc1 = __usat(math_asr_s32(vacc1, 8), 8); + vacc2 = __usat(math_asr_s32(vacc2, 8), 8); + vacc3 = __usat(math_asr_s32(vacc3, 8), 8); y[0] = (uint8_t) vacc0; y[1] = (uint8_t) vacc1; @@ -117,13 +117,13 @@ void xnn_qu8_vlrelu_ukernel__armv6simd_x8( int32_t vacc1 = __smlabb(vmultiplier13, vx13, vbias); const int32_t vacc2 = __smlatt(vmultiplier02, vx02, vbias); - vacc0 = __usat(asr_s32(vacc0, 8), 8); - vacc1 = __usat(asr_s32(vacc1, 8), 8); + vacc0 = __usat(math_asr_s32(vacc0, 8), 8); + vacc1 = __usat(math_asr_s32(vacc1, 8), 8); if (n & (2 * sizeof(uint8_t))) { y[0] = (uint8_t) vacc0; y[1] = (uint8_t) vacc1; - vacc0 = __usat(asr_s32(vacc2, 8), 8); + vacc0 = __usat(math_asr_s32(vacc2, 8), 8); y += 2; } if (n & (1 * sizeof(uint8_t))) { diff --git a/src/qu8-vlrelu/gen/vlrelu-scalar-andxor-x1.c b/src/qu8-vlrelu/gen/vlrelu-scalar-andxor-x1.c index 33e9176d7..c0ce8a70b 100644 --- a/src/qu8-vlrelu/gen/vlrelu-scalar-andxor-x1.c +++ b/src/qu8-vlrelu/gen/vlrelu-scalar-andxor-x1.c @@ -25,10 +25,10 @@ void xnn_qu8_vlrelu_ukernel__scalar_andxor_x1( const int32_t vbias = params->scalar_andxor.bias; do { int32_t vacc = (int32_t) *x++ - vinput_zero_point; - const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & asr_s32(vacc, 31)); + const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & math_asr_s32(vacc, 31)); vacc = vbias + vacc * vmultiplier; - int32_t vout = asr_s32(vacc, 8); + int32_t vout = math_asr_s32(vacc, 8); vout = math_max_s32(vout, 0); vout = math_min_s32(vout, 255); *y++ = (uint8_t) vout; diff --git a/src/qu8-vlrelu/gen/vlrelu-scalar-andxor-x2.c b/src/qu8-vlrelu/gen/vlrelu-scalar-andxor-x2.c index 8e4f64fab..81eb91fdf 100644 --- a/src/qu8-vlrelu/gen/vlrelu-scalar-andxor-x2.c +++ b/src/qu8-vlrelu/gen/vlrelu-scalar-andxor-x2.c @@ -31,8 +31,8 @@ void xnn_qu8_vlrelu_ukernel__scalar_andxor_x2( vacc0 -= vinput_zero_point; vacc1 -= vinput_zero_point; - int32_t vmultiplier0 = asr_s32(vacc0, 31); - int32_t vmultiplier1 = asr_s32(vacc1, 31); + int32_t vmultiplier0 = math_asr_s32(vacc0, 31); + int32_t vmultiplier1 = math_asr_s32(vacc1, 31); vmultiplier0 &= vmultiplier_diff; vmultiplier1 &= vmultiplier_diff; @@ -43,8 +43,8 @@ void xnn_qu8_vlrelu_ukernel__scalar_andxor_x2( vacc0 = vbias + vacc0 * vmultiplier0; vacc1 = vbias + vacc1 * vmultiplier1; - int32_t vout0 = asr_s32(vacc0, 8); - int32_t vout1 = asr_s32(vacc1, 8); + int32_t vout0 = math_asr_s32(vacc0, 8); + int32_t vout1 = math_asr_s32(vacc1, 8); vout0 = math_max_s32(vout0, 0); vout1 = math_max_s32(vout1, 0); @@ -58,10 +58,10 @@ void xnn_qu8_vlrelu_ukernel__scalar_andxor_x2( } if XNN_UNLIKELY(n != 0) { int32_t vacc = (int32_t) *x++ - vinput_zero_point; - const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & asr_s32(vacc, 31)); + const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & math_asr_s32(vacc, 31)); vacc = vbias + vacc * vmultiplier; - int32_t vout = asr_s32(vacc, 8); + int32_t vout = math_asr_s32(vacc, 8); vout = math_max_s32(vout, 0); vout = math_min_s32(vout, 255); *y = (uint8_t) vout; diff --git a/src/qu8-vlrelu/gen/vlrelu-scalar-andxor-x4.c b/src/qu8-vlrelu/gen/vlrelu-scalar-andxor-x4.c index 7b52bb64e..da80ee8bb 100644 --- a/src/qu8-vlrelu/gen/vlrelu-scalar-andxor-x4.c +++ b/src/qu8-vlrelu/gen/vlrelu-scalar-andxor-x4.c @@ -35,10 +35,10 @@ void xnn_qu8_vlrelu_ukernel__scalar_andxor_x4( vacc2 -= vinput_zero_point; vacc3 -= vinput_zero_point; - int32_t vmultiplier0 = asr_s32(vacc0, 31); - int32_t vmultiplier1 = asr_s32(vacc1, 31); - int32_t vmultiplier2 = asr_s32(vacc2, 31); - int32_t vmultiplier3 = asr_s32(vacc3, 31); + int32_t vmultiplier0 = math_asr_s32(vacc0, 31); + int32_t vmultiplier1 = math_asr_s32(vacc1, 31); + int32_t vmultiplier2 = math_asr_s32(vacc2, 31); + int32_t vmultiplier3 = math_asr_s32(vacc3, 31); vmultiplier0 &= vmultiplier_diff; vmultiplier1 &= vmultiplier_diff; @@ -55,10 +55,10 @@ void xnn_qu8_vlrelu_ukernel__scalar_andxor_x4( vacc2 = vbias + vacc2 * vmultiplier2; vacc3 = vbias + vacc3 * vmultiplier3; - int32_t vout0 = asr_s32(vacc0, 8); - int32_t vout1 = asr_s32(vacc1, 8); - int32_t vout2 = asr_s32(vacc2, 8); - int32_t vout3 = asr_s32(vacc3, 8); + int32_t vout0 = math_asr_s32(vacc0, 8); + int32_t vout1 = math_asr_s32(vacc1, 8); + int32_t vout2 = math_asr_s32(vacc2, 8); + int32_t vout3 = math_asr_s32(vacc3, 8); vout0 = math_max_s32(vout0, 0); vout1 = math_max_s32(vout1, 0); @@ -79,10 +79,10 @@ void xnn_qu8_vlrelu_ukernel__scalar_andxor_x4( if XNN_UNLIKELY(n != 0) { do { int32_t vacc = (int32_t) *x++ - vinput_zero_point; - const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & asr_s32(vacc, 31)); + const int32_t vmultiplier = vmultiplier_base ^ (vmultiplier_diff & math_asr_s32(vacc, 31)); vacc = vbias + vacc * vmultiplier; - int32_t vout = asr_s32(vacc, 8); + int32_t vout = math_asr_s32(vacc, 8); vout = math_max_s32(vout, 0); vout = math_min_s32(vout, 255); *y++ = (uint8_t) vout; diff --git a/src/qu8-vlrelu/gen/vlrelu-scalar-select-x1.c b/src/qu8-vlrelu/gen/vlrelu-scalar-select-x1.c index 8db62d4ba..479336be4 100644 --- a/src/qu8-vlrelu/gen/vlrelu-scalar-select-x1.c +++ b/src/qu8-vlrelu/gen/vlrelu-scalar-select-x1.c @@ -28,7 +28,7 @@ void xnn_qu8_vlrelu_ukernel__scalar_select_x1( const int32_t vmultiplier = XNN_UNPREDICTABLE(vacc >= 0) ? vpositive_multiplier : vnegative_multiplier; vacc = vbias + vacc * vmultiplier; - int32_t vout = asr_s32(vacc, 8); + int32_t vout = math_asr_s32(vacc, 8); vout = math_max_s32(vout, 0); vout = math_min_s32(vout, 255); *y++ = (uint8_t) vout; diff --git a/src/qu8-vlrelu/gen/vlrelu-scalar-select-x2.c b/src/qu8-vlrelu/gen/vlrelu-scalar-select-x2.c index eb6164b26..2ba144e7f 100644 --- a/src/qu8-vlrelu/gen/vlrelu-scalar-select-x2.c +++ b/src/qu8-vlrelu/gen/vlrelu-scalar-select-x2.c @@ -37,8 +37,8 @@ void xnn_qu8_vlrelu_ukernel__scalar_select_x2( vacc0 = vbias + vacc0 * vmultiplier0; vacc1 = vbias + vacc1 * vmultiplier1; - int32_t vout0 = asr_s32(vacc0, 8); - int32_t vout1 = asr_s32(vacc1, 8); + int32_t vout0 = math_asr_s32(vacc0, 8); + int32_t vout1 = math_asr_s32(vacc1, 8); vout0 = math_max_s32(vout0, 0); vout1 = math_max_s32(vout1, 0); @@ -55,7 +55,7 @@ void xnn_qu8_vlrelu_ukernel__scalar_select_x2( const int32_t vmultiplier = XNN_UNPREDICTABLE(vacc >= 0) ? vpositive_multiplier : vnegative_multiplier; vacc = vbias + vacc * vmultiplier; - int32_t vout = asr_s32(vacc, 8); + int32_t vout = math_asr_s32(vacc, 8); vout = math_max_s32(vout, 0); vout = math_min_s32(vout, 255); *y = (uint8_t) vout; diff --git a/src/qu8-vlrelu/gen/vlrelu-scalar-select-x4.c b/src/qu8-vlrelu/gen/vlrelu-scalar-select-x4.c index 6d5f36165..4300bd2c0 100644 --- a/src/qu8-vlrelu/gen/vlrelu-scalar-select-x4.c +++ b/src/qu8-vlrelu/gen/vlrelu-scalar-select-x4.c @@ -45,10 +45,10 @@ void xnn_qu8_vlrelu_ukernel__scalar_select_x4( vacc2 = vbias + vacc2 * vmultiplier2; vacc3 = vbias + vacc3 * vmultiplier3; - int32_t vout0 = asr_s32(vacc0, 8); - int32_t vout1 = asr_s32(vacc1, 8); - int32_t vout2 = asr_s32(vacc2, 8); - int32_t vout3 = asr_s32(vacc3, 8); + int32_t vout0 = math_asr_s32(vacc0, 8); + int32_t vout1 = math_asr_s32(vacc1, 8); + int32_t vout2 = math_asr_s32(vacc2, 8); + int32_t vout3 = math_asr_s32(vacc3, 8); vout0 = math_max_s32(vout0, 0); vout1 = math_max_s32(vout1, 0); @@ -72,7 +72,7 @@ void xnn_qu8_vlrelu_ukernel__scalar_select_x4( const int32_t vmultiplier = XNN_UNPREDICTABLE(vacc >= 0) ? vpositive_multiplier : vnegative_multiplier; vacc = vbias + vacc * vmultiplier; - int32_t vout = asr_s32(vacc, 8); + int32_t vout = math_asr_s32(vacc, 8); vout = math_max_s32(vout, 0); vout = math_min_s32(vout, 255); *y++ = (uint8_t) vout; diff --git a/src/s16-window/gen/scalar-x1.c b/src/s16-window/gen/scalar-x1.c index fcf4323b7..623f08390 100644 --- a/src/s16-window/gen/scalar-x1.c +++ b/src/s16-window/gen/scalar-x1.c @@ -39,7 +39,7 @@ void xnn_s16_window_ukernel__scalar_x1( int32_t vout = ((int32_t) input[0] * (int32_t) w[0]); ++input; ++w; - vout = asr_s32(vout, shift); + vout = math_asr_s32(vout, shift); vout = math_max_s32(vout, INT16_MIN); vout = math_min_s32(vout, INT16_MAX); output[0] = (int16_t)(vout); diff --git a/src/s16-window/gen/scalar-x2.c b/src/s16-window/gen/scalar-x2.c index d0dfd5928..39a5b48ad 100644 --- a/src/s16-window/gen/scalar-x2.c +++ b/src/s16-window/gen/scalar-x2.c @@ -45,8 +45,8 @@ void xnn_s16_window_ukernel__scalar_x2( int32_t vout0 = (int32_t) vi0 * (int32_t) w0; int32_t vout1 = (int32_t) vi1 * (int32_t) w1; - vout0 = asr_s32(vout0, shift); - vout1 = asr_s32(vout1, shift); + vout0 = math_asr_s32(vout0, shift); + vout1 = math_asr_s32(vout1, shift); vout0 = math_max_s32(vout0, INT16_MIN); vout1 = math_max_s32(vout1, INT16_MIN); @@ -65,7 +65,7 @@ void xnn_s16_window_ukernel__scalar_x2( int32_t vout = ((int32_t) input[0] * (int32_t) w[0]); ++input; ++w; - vout = asr_s32(vout, shift); + vout = math_asr_s32(vout, shift); vout = math_max_s32(vout, INT16_MIN); vout = math_min_s32(vout, INT16_MAX); output[0] = (int16_t)(vout); diff --git a/src/s16-window/gen/scalar-x3.c b/src/s16-window/gen/scalar-x3.c index ab5901e8c..6bd194224 100644 --- a/src/s16-window/gen/scalar-x3.c +++ b/src/s16-window/gen/scalar-x3.c @@ -48,9 +48,9 @@ void xnn_s16_window_ukernel__scalar_x3( int32_t vout1 = (int32_t) vi1 * (int32_t) w1; int32_t vout2 = (int32_t) vi2 * (int32_t) w2; - vout0 = asr_s32(vout0, shift); - vout1 = asr_s32(vout1, shift); - vout2 = asr_s32(vout2, shift); + vout0 = math_asr_s32(vout0, shift); + vout1 = math_asr_s32(vout1, shift); + vout2 = math_asr_s32(vout2, shift); vout0 = math_max_s32(vout0, INT16_MIN); vout1 = math_max_s32(vout1, INT16_MIN); @@ -72,7 +72,7 @@ void xnn_s16_window_ukernel__scalar_x3( int32_t vout = ((int32_t) input[0] * (int32_t) w[0]); ++input; ++w; - vout = asr_s32(vout, shift); + vout = math_asr_s32(vout, shift); vout = math_max_s32(vout, INT16_MIN); vout = math_min_s32(vout, INT16_MAX); output[0] = (int16_t)(vout); diff --git a/src/s16-window/gen/scalar-x4.c b/src/s16-window/gen/scalar-x4.c index 100a30af6..081712590 100644 --- a/src/s16-window/gen/scalar-x4.c +++ b/src/s16-window/gen/scalar-x4.c @@ -51,10 +51,10 @@ void xnn_s16_window_ukernel__scalar_x4( int32_t vout2 = (int32_t) vi2 * (int32_t) w2; int32_t vout3 = (int32_t) vi3 * (int32_t) w3; - vout0 = asr_s32(vout0, shift); - vout1 = asr_s32(vout1, shift); - vout2 = asr_s32(vout2, shift); - vout3 = asr_s32(vout3, shift); + vout0 = math_asr_s32(vout0, shift); + vout1 = math_asr_s32(vout1, shift); + vout2 = math_asr_s32(vout2, shift); + vout3 = math_asr_s32(vout3, shift); vout0 = math_max_s32(vout0, INT16_MIN); vout1 = math_max_s32(vout1, INT16_MIN); @@ -79,7 +79,7 @@ void xnn_s16_window_ukernel__scalar_x4( int32_t vout = ((int32_t) input[0] * (int32_t) w[0]); ++input; ++w; - vout = asr_s32(vout, shift); + vout = math_asr_s32(vout, shift); vout = math_max_s32(vout, INT16_MIN); vout = math_min_s32(vout, INT16_MAX); output[0] = (int16_t)(vout); diff --git a/src/s16-window/scalar.c.in b/src/s16-window/scalar.c.in index d88835f5d..229cf9a8c 100644 --- a/src/s16-window/scalar.c.in +++ b/src/s16-window/scalar.c.in @@ -44,7 +44,7 @@ void xnn_s16_window_ukernel__scalar_x${CHANNEL_TILE}( int32_t vout${C} = (int32_t) vi${C} * (int32_t) w${C}; $for C in range(CHANNEL_TILE): - vout${C} = asr_s32(vout${C}, shift); + vout${C} = math_asr_s32(vout${C}, shift); $for C in range(CHANNEL_TILE): vout${C} = math_max_s32(vout${C}, INT16_MIN); @@ -63,7 +63,7 @@ void xnn_s16_window_ukernel__scalar_x${CHANNEL_TILE}( int32_t vout = ((int32_t) input[0] * (int32_t) w[0]); ++input; ++w; - vout = asr_s32(vout, shift); + vout = math_asr_s32(vout, shift); vout = math_max_s32(vout, INT16_MIN); vout = math_min_s32(vout, INT16_MAX); output[0] = (int16_t)(vout); diff --git a/src/s8-ibilinear/gen/scalar-c1.c b/src/s8-ibilinear/gen/scalar-c1.c index 4d3c19421..f2246e766 100644 --- a/src/s8-ibilinear/gen/scalar-c1.c +++ b/src/s8-ibilinear/gen/scalar-c1.c @@ -55,7 +55,7 @@ void xnn_s8_ibilinear_ukernel__scalar_c1( const int32_t vacc = (int32_t) ((uint32_t) vt << 11) + vd * valphav; - const int32_t vo = asr_s32(vacc + vrounding, 22); + const int32_t vo = math_asr_s32(vacc + vrounding, 22); *output++ = vo; diff --git a/src/s8-ibilinear/gen/scalar-c2.c b/src/s8-ibilinear/gen/scalar-c2.c index d435fa0ac..b1918ddb0 100644 --- a/src/s8-ibilinear/gen/scalar-c2.c +++ b/src/s8-ibilinear/gen/scalar-c2.c @@ -69,8 +69,8 @@ void xnn_s8_ibilinear_ukernel__scalar_c2( const int32_t vacc0 = (int32_t) ((uint32_t) vt0 << 11) + vd0 * valphav; const int32_t vacc1 = (int32_t) ((uint32_t) vt1 << 11) + vd1 * valphav; - const int32_t vo0 = asr_s32(vacc0 + vrounding, 22); - const int32_t vo1 = asr_s32(vacc1 + vrounding, 22); + const int32_t vo0 = math_asr_s32(vacc0 + vrounding, 22); + const int32_t vo1 = math_asr_s32(vacc1 + vrounding, 22); output[0] = (int8_t) vo0; output[1] = (int8_t) vo1; @@ -92,7 +92,7 @@ void xnn_s8_ibilinear_ukernel__scalar_c2( const int32_t vacc = (int32_t) ((uint32_t) vt << 11) + vd * valphav; - const int32_t vo = asr_s32(vacc + vrounding, 22); + const int32_t vo = math_asr_s32(vacc + vrounding, 22); *output++ = vo; } diff --git a/src/s8-ibilinear/gen/scalar-c4.c b/src/s8-ibilinear/gen/scalar-c4.c index 8ff553f03..b92c079df 100644 --- a/src/s8-ibilinear/gen/scalar-c4.c +++ b/src/s8-ibilinear/gen/scalar-c4.c @@ -89,10 +89,10 @@ void xnn_s8_ibilinear_ukernel__scalar_c4( const int32_t vacc2 = (int32_t) ((uint32_t) vt2 << 11) + vd2 * valphav; const int32_t vacc3 = (int32_t) ((uint32_t) vt3 << 11) + vd3 * valphav; - const int32_t vo0 = asr_s32(vacc0 + vrounding, 22); - const int32_t vo1 = asr_s32(vacc1 + vrounding, 22); - const int32_t vo2 = asr_s32(vacc2 + vrounding, 22); - const int32_t vo3 = asr_s32(vacc3 + vrounding, 22); + const int32_t vo0 = math_asr_s32(vacc0 + vrounding, 22); + const int32_t vo1 = math_asr_s32(vacc1 + vrounding, 22); + const int32_t vo2 = math_asr_s32(vacc2 + vrounding, 22); + const int32_t vo3 = math_asr_s32(vacc3 + vrounding, 22); output[0] = (int8_t) vo0; output[1] = (int8_t) vo1; @@ -116,7 +116,7 @@ void xnn_s8_ibilinear_ukernel__scalar_c4( const int32_t vacc = (int32_t) ((uint32_t) vt << 11) + vd * valphav; - const int32_t vo = asr_s32(vacc + vrounding, 22); + const int32_t vo = math_asr_s32(vacc + vrounding, 22); *output++ = vo; } diff --git a/src/s8-ibilinear/scalar.c.in b/src/s8-ibilinear/scalar.c.in index 266d7fefb..b29d08a7d 100644 --- a/src/s8-ibilinear/scalar.c.in +++ b/src/s8-ibilinear/scalar.c.in @@ -66,7 +66,7 @@ void xnn_${DATATYPE.lower()}_ibilinear_ukernel__scalar_c${CHANNEL_TILE}${"" if P const int32_t vacc${ABC[C]} = (int32_t) ((uint32_t) vt${ABC[C]} << 11) + vd${ABC[C]} * valphav; $for C in range(CHANNEL_TILE): - const int32_t vo${ABC[C]} = asr_s32(vacc${ABC[C]} + vrounding, 22); + const int32_t vo${ABC[C]} = math_asr_s32(vacc${ABC[C]} + vrounding, 22); $for C in range(CHANNEL_TILE): output[${C}] = (${XINT8_T}) vo${ABC[C]}; @@ -88,7 +88,7 @@ void xnn_${DATATYPE.lower()}_ibilinear_ukernel__scalar_c${CHANNEL_TILE}${"" if P const int32_t vacc = (int32_t) ((uint32_t) vt << 11) + vd * valphav; - const int32_t vo = asr_s32(vacc + vrounding, 22); + const int32_t vo = math_asr_s32(vacc + vrounding, 22); *output++ = vo; } @@ -109,7 +109,7 @@ void xnn_${DATATYPE.lower()}_ibilinear_ukernel__scalar_c${CHANNEL_TILE}${"" if P const int32_t vacc = (int32_t) ((uint32_t) vt << 11) + vd * valphav; - const int32_t vo = asr_s32(vacc + vrounding, 22); + const int32_t vo = math_asr_s32(vacc + vrounding, 22); *output++ = vo; diff --git a/src/u8-ibilinear/gen/scalar-c1.c b/src/u8-ibilinear/gen/scalar-c1.c index a17287fc6..de03b9a01 100644 --- a/src/u8-ibilinear/gen/scalar-c1.c +++ b/src/u8-ibilinear/gen/scalar-c1.c @@ -55,7 +55,7 @@ void xnn_u8_ibilinear_ukernel__scalar_c1( const int32_t vacc = (int32_t) ((uint32_t) vt << 11) + vd * valphav; - const int32_t vo = asr_s32(vacc + vrounding, 22); + const int32_t vo = math_asr_s32(vacc + vrounding, 22); *output++ = vo; diff --git a/src/u8-ibilinear/gen/scalar-c2.c b/src/u8-ibilinear/gen/scalar-c2.c index 33b18d821..5f398122c 100644 --- a/src/u8-ibilinear/gen/scalar-c2.c +++ b/src/u8-ibilinear/gen/scalar-c2.c @@ -69,8 +69,8 @@ void xnn_u8_ibilinear_ukernel__scalar_c2( const int32_t vacc0 = (int32_t) ((uint32_t) vt0 << 11) + vd0 * valphav; const int32_t vacc1 = (int32_t) ((uint32_t) vt1 << 11) + vd1 * valphav; - const int32_t vo0 = asr_s32(vacc0 + vrounding, 22); - const int32_t vo1 = asr_s32(vacc1 + vrounding, 22); + const int32_t vo0 = math_asr_s32(vacc0 + vrounding, 22); + const int32_t vo1 = math_asr_s32(vacc1 + vrounding, 22); output[0] = (uint8_t) vo0; output[1] = (uint8_t) vo1; @@ -92,7 +92,7 @@ void xnn_u8_ibilinear_ukernel__scalar_c2( const int32_t vacc = (int32_t) ((uint32_t) vt << 11) + vd * valphav; - const int32_t vo = asr_s32(vacc + vrounding, 22); + const int32_t vo = math_asr_s32(vacc + vrounding, 22); *output++ = vo; } diff --git a/src/u8-ibilinear/gen/scalar-c4.c b/src/u8-ibilinear/gen/scalar-c4.c index 318995666..438cfe3ce 100644 --- a/src/u8-ibilinear/gen/scalar-c4.c +++ b/src/u8-ibilinear/gen/scalar-c4.c @@ -89,10 +89,10 @@ void xnn_u8_ibilinear_ukernel__scalar_c4( const int32_t vacc2 = (int32_t) ((uint32_t) vt2 << 11) + vd2 * valphav; const int32_t vacc3 = (int32_t) ((uint32_t) vt3 << 11) + vd3 * valphav; - const int32_t vo0 = asr_s32(vacc0 + vrounding, 22); - const int32_t vo1 = asr_s32(vacc1 + vrounding, 22); - const int32_t vo2 = asr_s32(vacc2 + vrounding, 22); - const int32_t vo3 = asr_s32(vacc3 + vrounding, 22); + const int32_t vo0 = math_asr_s32(vacc0 + vrounding, 22); + const int32_t vo1 = math_asr_s32(vacc1 + vrounding, 22); + const int32_t vo2 = math_asr_s32(vacc2 + vrounding, 22); + const int32_t vo3 = math_asr_s32(vacc3 + vrounding, 22); output[0] = (uint8_t) vo0; output[1] = (uint8_t) vo1; @@ -116,7 +116,7 @@ void xnn_u8_ibilinear_ukernel__scalar_c4( const int32_t vacc = (int32_t) ((uint32_t) vt << 11) + vd * valphav; - const int32_t vo = asr_s32(vacc + vrounding, 22); + const int32_t vo = math_asr_s32(vacc + vrounding, 22); *output++ = vo; } diff --git a/src/xnnpack/math.h b/src/xnnpack/math.h index 982af7638..deefacf6f 100644 --- a/src/xnnpack/math.h +++ b/src/xnnpack/math.h @@ -155,7 +155,7 @@ XNN_INLINE static float math_nonsign_mask_f32() { #endif XNN_IGNORE_SHIFT_BASE_UB -XNN_INLINE static int32_t asr_s32(int32_t x, uint32_t n) { +XNN_INLINE static int32_t math_asr_s32(int32_t x, uint32_t n) { #ifdef XNN_USE_SHIFT_BASE_UB_WORKAROUND #if XNN_ARCH_X86_64 || XNN_ARCH_ARM64 return (int32_t) ((uint64_t) (int64_t) x >> n); @@ -168,7 +168,7 @@ XNN_INLINE static int32_t asr_s32(int32_t x, uint32_t n) { } XNN_IGNORE_SHIFT_BASE_UB -XNN_INLINE static int64_t asr_s64(int64_t x, uint32_t n) { +XNN_INLINE static int64_t math_asr_s64(int64_t x, uint32_t n) { #ifdef XNN_USE_SHIFT_BASE_UB_WORKAROUND return x >= 0 ? x >> n : ~(~x >> n); #else diff --git a/src/xnnpack/requantization.h b/src/xnnpack/requantization.h index 18ddb9c44..a556acd33 100644 --- a/src/xnnpack/requantization.h +++ b/src/xnnpack/requantization.h @@ -173,7 +173,7 @@ static inline int8_t xnn_qs8_requantize_rndnu( const int32_t max_less_zero_point = (int32_t) max - (int32_t) zero_point; const int64_t abs_prescaled_input = (int64_t) input * (int64_t) multiplier; - int32_t output = (int32_t) asr_s64(abs_prescaled_input + rounding, shift); + int32_t output = (int32_t) math_asr_s64(abs_prescaled_input + rounding, shift); output = math_max_s32(output, min_less_zero_point); output = math_min_s32(output, max_less_zero_point); return (int8_t) (output + (int32_t) zero_point); @@ -200,7 +200,7 @@ static inline uint8_t xnn_qu8_requantize_rndnu( const int32_t max_less_zero_point = (int32_t) max - (int32_t) zero_point; const int64_t abs_prescaled_input = (int64_t) input * (int64_t) multiplier; - int32_t output = (int32_t) asr_s64(abs_prescaled_input + rounding, shift); + int32_t output = (int32_t) math_asr_s64(abs_prescaled_input + rounding, shift); output = math_max_s32(output, min_less_zero_point); output = math_min_s32(output, max_less_zero_point); return (uint8_t) (output + (int32_t) zero_point); @@ -214,7 +214,7 @@ static inline uint8_t xnn_qu8_quantize_add( int32_t acc = params.scalar.bias + (int32_t) (uint32_t) a * params.scalar.a_multiplier + (int32_t) (uint32_t) b * params.scalar.b_multiplier; // Shift right with rounding away from zero. - acc = asr_s32(acc, params.scalar.shift); + acc = math_asr_s32(acc, params.scalar.shift); // Clamp and add output zero point. acc = math_max_s32(acc, params.scalar.output_min_less_zero_point); @@ -230,7 +230,7 @@ static inline int8_t xnn_qs8_quantize_add( int32_t acc = params.scalar.bias + (int32_t) a * params.scalar.a_multiplier + (int32_t) b * params.scalar.b_multiplier; // Shift right with rounding away from zero. - acc = asr_s32(acc, params.scalar.shift); + acc = math_asr_s32(acc, params.scalar.shift); // Clamp and add output zero point. acc = math_max_s32(acc, params.scalar.output_min_less_zero_point); diff --git a/test/ibilinear-microkernel-tester.h b/test/ibilinear-microkernel-tester.h index a11a7d267..34cc4acbd 100644 --- a/test/ibilinear-microkernel-tester.h +++ b/test/ibilinear-microkernel-tester.h @@ -229,7 +229,7 @@ class IBilinearMicrokernelTester { for (size_t c = 0; c < channels(); c++) { const int32_t alpha_h = packed_weights[i * 2 + 0]; const int32_t alpha_v = packed_weights[i * 2 + 1]; - const int32_t acc = asr_s32( + const int32_t acc = math_asr_s32( int32_t(indirection[i * 4 + 0][c + input_offset()]) * (2048 - alpha_h) * (2048 - alpha_v) + int32_t(indirection[i * 4 + 1][c + input_offset()]) * alpha_h * (2048 - alpha_v) + int32_t(indirection[i * 4 + 2][c + input_offset()]) * (2048 - alpha_h) * alpha_v + diff --git a/test/vcvt-microkernel-tester.h b/test/vcvt-microkernel-tester.h index 1609cbb21..0961fbda4 100644 --- a/test/vcvt-microkernel-tester.h +++ b/test/vcvt-microkernel-tester.h @@ -266,7 +266,7 @@ class VCvtMicrokernelTester { const int32_t multiplier = (int32_t) lrintf(-256.0f * scale()); for (size_t i = 0; i < batch_size(); i++) { const int32_t input_value = (input_zero_point() - input[i]) << 7; - int32_t output_value = asr_s32(input_value * multiplier + INT32_C(0x4000), 15) + output_zero_point(); + int32_t output_value = math_asr_s32(input_value * multiplier + INT32_C(0x4000), 15) + output_zero_point(); output_value = std::min<int32_t>(output_value, std::numeric_limits<int8_t>::max()); output_value = std::max<int32_t>(output_value, std::numeric_limits<int8_t>::min()); output_ref[i] = static_cast<int8_t>(output_value); @@ -345,7 +345,7 @@ class VCvtMicrokernelTester { const int32_t multiplier = (int32_t) lrintf(-256.0f * scale()); for (size_t i = 0; i < batch_size(); i++) { const int32_t input_value = (input_zero_point() - input[i]) << 7; - int32_t output_value = asr_s32(input_value * multiplier + INT32_C(0x4000), 15) + output_zero_point(); + int32_t output_value = math_asr_s32(input_value * multiplier + INT32_C(0x4000), 15) + output_zero_point(); output_value = std::min<int32_t>(output_value, std::numeric_limits<uint8_t>::max()); output_value = std::max<int32_t>(output_value, std::numeric_limits<uint8_t>::min()); output_ref[i] = static_cast<uint8_t>(output_value); diff --git a/test/vlrelu-microkernel-tester.h b/test/vlrelu-microkernel-tester.h index d73fdbf9b..ec9ed66ba 100644 --- a/test/vlrelu-microkernel-tester.h +++ b/test/vlrelu-microkernel-tester.h @@ -113,7 +113,7 @@ class VLReLUMicrokernelTester { for (size_t i = 0; i < batch_size(); i++) { const int32_t input_value = (input_zero_point() - input[i]) << 7; const int32_t multiplier = input_value <= 0 ? positive_multiplier : negative_multiplier; - int32_t output_value = asr_s32(input_value * multiplier + INT32_C(0x4000), 15) + output_zero_point(); + int32_t output_value = math_asr_s32(input_value * multiplier + INT32_C(0x4000), 15) + output_zero_point(); output_value = std::min<int32_t>(output_value, std::numeric_limits<int8_t>::max()); output_value = std::max<int32_t>(output_value, std::numeric_limits<int8_t>::min()); output_ref[i] = static_cast<int8_t>(output_value); @@ -158,7 +158,7 @@ class VLReLUMicrokernelTester { for (size_t i = 0; i < batch_size(); i++) { const int32_t input_value = (input_zero_point() - input[i]) << 7; const int32_t multiplier = input_value <= 0 ? positive_multiplier : negative_multiplier; - int32_t output_value = asr_s32(input_value * multiplier + INT32_C(0x4000), 15) + output_zero_point(); + int32_t output_value = math_asr_s32(input_value * multiplier + INT32_C(0x4000), 15) + output_zero_point(); output_value = std::min<int32_t>(output_value, std::numeric_limits<uint8_t>::max()); output_value = std::max<int32_t>(output_value, std::numeric_limits<uint8_t>::min()); output_ref[i] = static_cast<uint8_t>(output_value); |