diff options
author | Marat Dukhan <maratek@google.com> | 2020-09-21 17:00:27 -0700 |
---|---|---|
committer | XNNPACK Team <xnnpack-github-robot@google.com> | 2020-09-21 17:00:58 -0700 |
commit | 80a8ac59849bfdae8d2e1409f5642baa502c0b9e (patch) | |
tree | 2c7f222fd4eb33aabf81fe34fb2b5a5e145fd66d | |
parent | 9490613d1da6d21e7d03908800f9437a9375bfd0 (diff) | |
download | XNNPACK-80a8ac59849bfdae8d2e1409f5642baa502c0b9e.tar.gz |
AVX512F LUT-based with SCALEF Exp evaluation stubs
PiperOrigin-RevId: 332967630
-rw-r--r-- | BUILD.bazel | 2 | ||||
-rwxr-xr-x | CMakeLists.txt | 4 | ||||
-rw-r--r-- | eval/f32-exp.cc | 8 | ||||
-rw-r--r-- | src/math/exp-avx512f-lut16-p3-perm-scalef.c | 71 | ||||
-rw-r--r-- | src/math/exp-avx512f-lut32-p2-perm2-scalef.c | 75 | ||||
-rw-r--r-- | src/xnnpack/math-stubs.h | 2 |
6 files changed, 161 insertions, 1 deletions
diff --git a/BUILD.bazel b/BUILD.bazel index 70bb15ead..f63f01292 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -2618,7 +2618,9 @@ AVX512F_UKERNELS = [ "src/math/exp-avx512f-p5-scalef.c", "src/math/exp-avx512f-p5.c", "src/math/exp-avx512f-lut16-p3-perm.c", + "src/math/exp-avx512f-lut16-p3-perm-scalef.c", "src/math/exp-avx512f-lut32-p2-perm2.c", + "src/math/exp-avx512f-lut32-p2-perm2-scalef.c", "src/math/extexp-avx512f-p5.c", "src/math/sigmoid-avx512f-rr1-p5-div.c", "src/math/sigmoid-avx512f-rr1-p5-nr1fma.c", diff --git a/CMakeLists.txt b/CMakeLists.txt index b761a06d8..a47087bf8 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2092,10 +2092,12 @@ SET(XNNPACK_AVX512F_MICROKERNEL_SRCS src/f32-vunary/gen/vneg-avx512f-x32.c src/f32-vunary/gen/vsqr-avx512f-x16.c src/f32-vunary/gen/vsqr-avx512f-x32.c - src/math/exp-avx512f-p5-scalef.c src/math/exp-avx512f-p5.c + src/math/exp-avx512f-p5-scalef.c src/math/exp-avx512f-lut16-p3-perm.c + src/math/exp-avx512f-lut16-p3-perm-scalef.c src/math/exp-avx512f-lut32-p2-perm2.c + src/math/exp-avx512f-lut32-p2-perm2-scalef.c src/math/extexp-avx512f-p5.c src/math/sigmoid-avx512f-rr1-p5-div.c src/math/sigmoid-avx512f-rr1-p5-nr1fma.c diff --git a/eval/f32-exp.cc b/eval/f32-exp.cc index b61918032..dccb6912d 100644 --- a/eval/f32-exp.cc +++ b/eval/f32-exp.cc @@ -92,9 +92,15 @@ static void ExpError(benchmark::State& state, static void f32_exp__avx512f_lut16_p3_perm(benchmark::State& state) { ExpError(state, xnn_math_f32_exp__avx512f_lut16_p3_perm, 16); } + static void f32_exp__avx512f_lut16_p3_perm_scalef(benchmark::State& state) { + ExpError(state, xnn_math_f32_exp__avx512f_lut16_p3_perm_scalef, 16); + } static void f32_exp__avx512f_lut32_p2_perm2(benchmark::State& state) { ExpError(state, xnn_math_f32_exp__avx512f_lut32_p2_perm2, 16); } + static void f32_exp__avx512f_lut32_p2_perm2_scalef(benchmark::State& state) { + ExpError(state, xnn_math_f32_exp__avx512f_lut32_p2_perm2_scalef, 16); + } static void f32_exp__avx512f_p5(benchmark::State& state) { ExpError(state, xnn_math_f32_exp__avx512f_p5, 16); } @@ -109,7 +115,9 @@ static void ExpError(benchmark::State& state, BENCHMARK(f32_exp__avx2_lut8_p3_perm)->Unit(benchmark::kMillisecond)->Iterations(1); BENCHMARK(f32_exp__avx2_p5)->Unit(benchmark::kMillisecond)->Iterations(1); BENCHMARK(f32_exp__avx512f_lut16_p3_perm)->Unit(benchmark::kMillisecond)->Iterations(1); + BENCHMARK(f32_exp__avx512f_lut16_p3_perm_scalef)->Unit(benchmark::kMillisecond)->Iterations(1); BENCHMARK(f32_exp__avx512f_lut32_p2_perm2)->Unit(benchmark::kMillisecond)->Iterations(1); + BENCHMARK(f32_exp__avx512f_lut32_p2_perm2_scalef)->Unit(benchmark::kMillisecond)->Iterations(1); BENCHMARK(f32_exp__avx512f_p5)->Unit(benchmark::kMillisecond)->Iterations(1); BENCHMARK(f32_exp__avx512f_p5_scalef)->Unit(benchmark::kMillisecond)->Iterations(1); #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/math/exp-avx512f-lut16-p3-perm-scalef.c b/src/math/exp-avx512f-lut16-p3-perm-scalef.c new file mode 100644 index 000000000..07eb5a9ac --- /dev/null +++ b/src/math/exp-avx512f-lut16-p3-perm-scalef.c @@ -0,0 +1,71 @@ +// Copyright 2020 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include <assert.h> +#include <math.h> + +#include <immintrin.h> + +#include <xnnpack/math-stubs.h> + + +void xnn_math_f32_exp__avx512f_lut16_p3_perm_scalef( + size_t n, + const float* input, + float* output) +{ + assert(n % (16 * sizeof(float)) == 0); + + const __m512 vmagic_bias = _mm512_set1_ps(0x1.800000p19f); + const __m512 vlog2e = _mm512_set1_ps(0x1.715476p0f); + const __m512 vminus_ln2_hi = _mm512_set1_ps(-0x1.62e43p-1f); + const __m512 vminus_ln2_lo = _mm512_set1_ps(0x1.05c61p-29f); + + const __m512 vc2 = _mm512_set1_ps(0x1.00021Ep-1f); + const __m512 vc3 = _mm512_set1_ps(0x1.55559Ap-3f); + const __m512 vtable = _mm512_set_ps( + 0x1.EA4AFAp+0f, 0x1.D5818Ep+0f, 0x1.C199BEp+0f, 0x1.AE89FAp+0f, + 0x1.9C4918p+0f, 0x1.8ACE54p+0f, 0x1.7A1148p+0f, 0x1.6A09E6p+0f, + 0x1.5AB07Ep+0f, 0x1.4BFDAEp+0f, 0x1.3DEA64p+0f, 0x1.306FE0p+0f, + 0x1.2387A6p+0f, 0x1.172B84p+0f, 0x1.0B5586p+0f, 0x1.000000p+0f); + + for (; n != 0; n -= 16 * sizeof(float)) { + const __m512 vx = _mm512_loadu_ps(input); + + // Compute reduced argument n := round(x / log(2), 4). + // We do it by adding a large number (magic bias), which cause rounding of result to an 4 fractional bits, then + // subtracing the large number back. The first addition is combined with multiplication by log2e into a single + // FMA instruction. The trick with adding large number is valid only within certain bounds (|x| <= 2**18), but + // thats ok, because inputs outside of [-103.97207, 88.72283] underflow or overflow expf(x) anyway. We fixup + // the result for such inputs at the very end of the algorithm. + __m512 vn = _mm512_fmadd_ps(vx, vlog2e, vmagic_bias); + + // Use the low 4 bits of n (as integer) for table lookup. + const __m512 vl = _mm512_permutexvar_ps(_mm512_castps_si512(vn), vtable); + + // Subtract the large number back to get final n := round(x / log(2), 4). + vn = _mm512_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - n * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2_hi, vx); + vt = _mm512_fmadd_ps(vn, vminus_ln2_lo, vt); + + // Compute degree-3 polynomial approximation for exp(t) on [-log(2)/32, log(2)/32]. + // P = l * (1 + t * (1 + t * (c2 + t * c3))) + // = l + l * (t + t * (t * (c2 + t * c3))) + __m512 vp = _mm512_fmadd_ps(vt, vc3, vc2); + vp = _mm512_mul_ps(vp, vt); + vp = _mm512_fmadd_ps(vt, vp, vt); + vp = _mm512_fmadd_ps(vl, vp, vl); + + // Reconstruct the final value as f = exp2(floor(n)) * p. + const __m512 vf = _mm512_scalef_ps(vp, vn); + _mm512_storeu_ps(output, vf); + + input += 16; + output += 16; + } +} diff --git a/src/math/exp-avx512f-lut32-p2-perm2-scalef.c b/src/math/exp-avx512f-lut32-p2-perm2-scalef.c new file mode 100644 index 000000000..562343992 --- /dev/null +++ b/src/math/exp-avx512f-lut32-p2-perm2-scalef.c @@ -0,0 +1,75 @@ +// Copyright 2020 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include <assert.h> +#include <math.h> + +#include <immintrin.h> + +#include <xnnpack/math-stubs.h> + + +void xnn_math_f32_exp__avx512f_lut32_p2_perm2_scalef( + size_t n, + const float* input, + float* output) +{ + assert(n % (16 * sizeof(float)) == 0); + + const __m512 vmagic_bias = _mm512_set1_ps(0x1.800000p18f); + const __m512 vlog2e = _mm512_set1_ps(0x1.715476p0f); + const __m512 vminus_ln2_hi = _mm512_set1_ps(-0x1.62e43p-1f); + const __m512 vminus_ln2_lo = _mm512_set1_ps(0x1.05c61p-29f); + + const __m512 vc1 = _mm512_set1_ps(0x1.0000F6p-0f); + const __m512 vc2 = _mm512_set1_ps(0x1.000000p-1f); + const __m512 vtable_hi = _mm512_set_ps( + 0x1.F50766p+0f, 0x1.EA4AFAp+0f, 0x1.DFC974p+0f, 0x1.D5818Ep+0f, + 0x1.CB720Ep+0f, 0x1.C199BEp+0f, 0x1.B7F770p+0f, 0x1.AE89FAp+0f, + 0x1.A5503Cp+0f, 0x1.9C4918p+0f, 0x1.93737Cp+0f, 0x1.8ACE54p+0f, + 0x1.82589Ap+0f, 0x1.7A1148p+0f, 0x1.71F75Ep+0f, 0x1.6A09E6p+0f); + const __m512 vtable_lo = _mm512_set_ps( + 0x1.6247ECp+0f, 0x1.5AB07Ep+0f, 0x1.5342B6p+0f, 0x1.4BFDAEp+0f, + 0x1.44E086p+0f, 0x1.3DEA64p+0f, 0x1.371A74p+0f, 0x1.306FE0p+0f, + 0x1.29E9E0p+0f, 0x1.2387A6p+0f, 0x1.1D4874p+0f, 0x1.172B84p+0f, + 0x1.11301Ep+0f, 0x1.0B5586p+0f, 0x1.059B0Ep+0f, 0x1.000000p+0f); + + for (; n != 0; n -= 16 * sizeof(float)) { + const __m512 vx = _mm512_loadu_ps(input); + + // Compute reduced argument n := round(x / log(2), 5). + // We do it by adding a large number (magic bias), which cause rounding of result to 5 fractional bits, then + // subtracing the large number back. The first addition is combined with multiplication by log2e into a single FMA + // instruction. The trick with adding large number is valid only within certain bounds (|x| <= 2**17), but thats + // ok, because inputs outside of [-103.97207, 88.72283] underflow or overflow expf(x) anyway. We fixup the result + // for such inputs at the very end of the algorithm. + __m512 vn = _mm512_fmadd_ps(vx, vlog2e, vmagic_bias); + + // Use the low 5 bits of n (as integer) for table lookup. + const __m512 vl = _mm512_permutex2var_ps(vtable_lo, _mm512_castps_si512(vn), vtable_hi); + + // Subtract the large number back to get final n := round(x / log(2), 5). + vn = _mm512_sub_ps(vn, vmagic_bias); + + // Compute reduced argument t := x - n * log(2). + // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. + __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2_hi, vx); + vt = _mm512_fmadd_ps(vn, vminus_ln2_lo, vt); + + // Compute degree-2 polynomial approximation for exp(t) on [-log(2)/64, log(2)/64]. + // p = l * (1 + t * (c1 + t * c2)) + // = l + l * t * (c1 + t * c2) + __m512 vp = _mm512_fmadd_ps(vt, vc2, vc1); + vt = _mm512_mul_ps(vt, vl); + vp = _mm512_fmadd_ps(vt, vp, vl); + + // Reconstruct the final value as f = exp2(floor(n)) * p. + const __m512 vf = _mm512_scalef_ps(vp, vn); + _mm512_storeu_ps(output, vf); + + input += 16; + output += 16; + } +} diff --git a/src/xnnpack/math-stubs.h b/src/xnnpack/math-stubs.h index ecfb29fba..dc7ef65cf 100644 --- a/src/xnnpack/math-stubs.h +++ b/src/xnnpack/math-stubs.h @@ -93,7 +93,9 @@ DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_exp__avx2_lut8_p4_perm) DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_exp__avx512f_p5) DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_exp__avx512f_p5_scalef) DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_exp__avx512f_lut16_p3_perm) +DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_exp__avx512f_lut16_p3_perm_scalef) DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_exp__avx512f_lut32_p2_perm2) +DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_exp__avx512f_lut32_p2_perm2_scalef) DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_expminus__neonfma_p5) DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_expminus__neonfma_lut64_p2) |