aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarat Dukhan <maratek@google.com>2020-09-21 17:00:27 -0700
committerXNNPACK Team <xnnpack-github-robot@google.com>2020-09-21 17:00:58 -0700
commit80a8ac59849bfdae8d2e1409f5642baa502c0b9e (patch)
tree2c7f222fd4eb33aabf81fe34fb2b5a5e145fd66d
parent9490613d1da6d21e7d03908800f9437a9375bfd0 (diff)
downloadXNNPACK-80a8ac59849bfdae8d2e1409f5642baa502c0b9e.tar.gz
AVX512F LUT-based with SCALEF Exp evaluation stubs
PiperOrigin-RevId: 332967630
-rw-r--r--BUILD.bazel2
-rwxr-xr-xCMakeLists.txt4
-rw-r--r--eval/f32-exp.cc8
-rw-r--r--src/math/exp-avx512f-lut16-p3-perm-scalef.c71
-rw-r--r--src/math/exp-avx512f-lut32-p2-perm2-scalef.c75
-rw-r--r--src/xnnpack/math-stubs.h2
6 files changed, 161 insertions, 1 deletions
diff --git a/BUILD.bazel b/BUILD.bazel
index 70bb15ead..f63f01292 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -2618,7 +2618,9 @@ AVX512F_UKERNELS = [
"src/math/exp-avx512f-p5-scalef.c",
"src/math/exp-avx512f-p5.c",
"src/math/exp-avx512f-lut16-p3-perm.c",
+ "src/math/exp-avx512f-lut16-p3-perm-scalef.c",
"src/math/exp-avx512f-lut32-p2-perm2.c",
+ "src/math/exp-avx512f-lut32-p2-perm2-scalef.c",
"src/math/extexp-avx512f-p5.c",
"src/math/sigmoid-avx512f-rr1-p5-div.c",
"src/math/sigmoid-avx512f-rr1-p5-nr1fma.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b761a06d8..a47087bf8 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2092,10 +2092,12 @@ SET(XNNPACK_AVX512F_MICROKERNEL_SRCS
src/f32-vunary/gen/vneg-avx512f-x32.c
src/f32-vunary/gen/vsqr-avx512f-x16.c
src/f32-vunary/gen/vsqr-avx512f-x32.c
- src/math/exp-avx512f-p5-scalef.c
src/math/exp-avx512f-p5.c
+ src/math/exp-avx512f-p5-scalef.c
src/math/exp-avx512f-lut16-p3-perm.c
+ src/math/exp-avx512f-lut16-p3-perm-scalef.c
src/math/exp-avx512f-lut32-p2-perm2.c
+ src/math/exp-avx512f-lut32-p2-perm2-scalef.c
src/math/extexp-avx512f-p5.c
src/math/sigmoid-avx512f-rr1-p5-div.c
src/math/sigmoid-avx512f-rr1-p5-nr1fma.c
diff --git a/eval/f32-exp.cc b/eval/f32-exp.cc
index b61918032..dccb6912d 100644
--- a/eval/f32-exp.cc
+++ b/eval/f32-exp.cc
@@ -92,9 +92,15 @@ static void ExpError(benchmark::State& state,
static void f32_exp__avx512f_lut16_p3_perm(benchmark::State& state) {
ExpError(state, xnn_math_f32_exp__avx512f_lut16_p3_perm, 16);
}
+ static void f32_exp__avx512f_lut16_p3_perm_scalef(benchmark::State& state) {
+ ExpError(state, xnn_math_f32_exp__avx512f_lut16_p3_perm_scalef, 16);
+ }
static void f32_exp__avx512f_lut32_p2_perm2(benchmark::State& state) {
ExpError(state, xnn_math_f32_exp__avx512f_lut32_p2_perm2, 16);
}
+ static void f32_exp__avx512f_lut32_p2_perm2_scalef(benchmark::State& state) {
+ ExpError(state, xnn_math_f32_exp__avx512f_lut32_p2_perm2_scalef, 16);
+ }
static void f32_exp__avx512f_p5(benchmark::State& state) {
ExpError(state, xnn_math_f32_exp__avx512f_p5, 16);
}
@@ -109,7 +115,9 @@ static void ExpError(benchmark::State& state,
BENCHMARK(f32_exp__avx2_lut8_p3_perm)->Unit(benchmark::kMillisecond)->Iterations(1);
BENCHMARK(f32_exp__avx2_p5)->Unit(benchmark::kMillisecond)->Iterations(1);
BENCHMARK(f32_exp__avx512f_lut16_p3_perm)->Unit(benchmark::kMillisecond)->Iterations(1);
+ BENCHMARK(f32_exp__avx512f_lut16_p3_perm_scalef)->Unit(benchmark::kMillisecond)->Iterations(1);
BENCHMARK(f32_exp__avx512f_lut32_p2_perm2)->Unit(benchmark::kMillisecond)->Iterations(1);
+ BENCHMARK(f32_exp__avx512f_lut32_p2_perm2_scalef)->Unit(benchmark::kMillisecond)->Iterations(1);
BENCHMARK(f32_exp__avx512f_p5)->Unit(benchmark::kMillisecond)->Iterations(1);
BENCHMARK(f32_exp__avx512f_p5_scalef)->Unit(benchmark::kMillisecond)->Iterations(1);
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
diff --git a/src/math/exp-avx512f-lut16-p3-perm-scalef.c b/src/math/exp-avx512f-lut16-p3-perm-scalef.c
new file mode 100644
index 000000000..07eb5a9ac
--- /dev/null
+++ b/src/math/exp-avx512f-lut16-p3-perm-scalef.c
@@ -0,0 +1,71 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <math.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/math-stubs.h>
+
+
+void xnn_math_f32_exp__avx512f_lut16_p3_perm_scalef(
+ size_t n,
+ const float* input,
+ float* output)
+{
+ assert(n % (16 * sizeof(float)) == 0);
+
+ const __m512 vmagic_bias = _mm512_set1_ps(0x1.800000p19f);
+ const __m512 vlog2e = _mm512_set1_ps(0x1.715476p0f);
+ const __m512 vminus_ln2_hi = _mm512_set1_ps(-0x1.62e43p-1f);
+ const __m512 vminus_ln2_lo = _mm512_set1_ps(0x1.05c61p-29f);
+
+ const __m512 vc2 = _mm512_set1_ps(0x1.00021Ep-1f);
+ const __m512 vc3 = _mm512_set1_ps(0x1.55559Ap-3f);
+ const __m512 vtable = _mm512_set_ps(
+ 0x1.EA4AFAp+0f, 0x1.D5818Ep+0f, 0x1.C199BEp+0f, 0x1.AE89FAp+0f,
+ 0x1.9C4918p+0f, 0x1.8ACE54p+0f, 0x1.7A1148p+0f, 0x1.6A09E6p+0f,
+ 0x1.5AB07Ep+0f, 0x1.4BFDAEp+0f, 0x1.3DEA64p+0f, 0x1.306FE0p+0f,
+ 0x1.2387A6p+0f, 0x1.172B84p+0f, 0x1.0B5586p+0f, 0x1.000000p+0f);
+
+ for (; n != 0; n -= 16 * sizeof(float)) {
+ const __m512 vx = _mm512_loadu_ps(input);
+
+ // Compute reduced argument n := round(x / log(2), 4).
+ // We do it by adding a large number (magic bias), which cause rounding of result to an 4 fractional bits, then
+ // subtracing the large number back. The first addition is combined with multiplication by log2e into a single
+ // FMA instruction. The trick with adding large number is valid only within certain bounds (|x| <= 2**18), but
+ // thats ok, because inputs outside of [-103.97207, 88.72283] underflow or overflow expf(x) anyway. We fixup
+ // the result for such inputs at the very end of the algorithm.
+ __m512 vn = _mm512_fmadd_ps(vx, vlog2e, vmagic_bias);
+
+ // Use the low 4 bits of n (as integer) for table lookup.
+ const __m512 vl = _mm512_permutexvar_ps(_mm512_castps_si512(vn), vtable);
+
+ // Subtract the large number back to get final n := round(x / log(2), 4).
+ vn = _mm512_sub_ps(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - n * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2_hi, vx);
+ vt = _mm512_fmadd_ps(vn, vminus_ln2_lo, vt);
+
+ // Compute degree-3 polynomial approximation for exp(t) on [-log(2)/32, log(2)/32].
+ // P = l * (1 + t * (1 + t * (c2 + t * c3)))
+ // = l + l * (t + t * (t * (c2 + t * c3)))
+ __m512 vp = _mm512_fmadd_ps(vt, vc3, vc2);
+ vp = _mm512_mul_ps(vp, vt);
+ vp = _mm512_fmadd_ps(vt, vp, vt);
+ vp = _mm512_fmadd_ps(vl, vp, vl);
+
+ // Reconstruct the final value as f = exp2(floor(n)) * p.
+ const __m512 vf = _mm512_scalef_ps(vp, vn);
+ _mm512_storeu_ps(output, vf);
+
+ input += 16;
+ output += 16;
+ }
+}
diff --git a/src/math/exp-avx512f-lut32-p2-perm2-scalef.c b/src/math/exp-avx512f-lut32-p2-perm2-scalef.c
new file mode 100644
index 000000000..562343992
--- /dev/null
+++ b/src/math/exp-avx512f-lut32-p2-perm2-scalef.c
@@ -0,0 +1,75 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <math.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/math-stubs.h>
+
+
+void xnn_math_f32_exp__avx512f_lut32_p2_perm2_scalef(
+ size_t n,
+ const float* input,
+ float* output)
+{
+ assert(n % (16 * sizeof(float)) == 0);
+
+ const __m512 vmagic_bias = _mm512_set1_ps(0x1.800000p18f);
+ const __m512 vlog2e = _mm512_set1_ps(0x1.715476p0f);
+ const __m512 vminus_ln2_hi = _mm512_set1_ps(-0x1.62e43p-1f);
+ const __m512 vminus_ln2_lo = _mm512_set1_ps(0x1.05c61p-29f);
+
+ const __m512 vc1 = _mm512_set1_ps(0x1.0000F6p-0f);
+ const __m512 vc2 = _mm512_set1_ps(0x1.000000p-1f);
+ const __m512 vtable_hi = _mm512_set_ps(
+ 0x1.F50766p+0f, 0x1.EA4AFAp+0f, 0x1.DFC974p+0f, 0x1.D5818Ep+0f,
+ 0x1.CB720Ep+0f, 0x1.C199BEp+0f, 0x1.B7F770p+0f, 0x1.AE89FAp+0f,
+ 0x1.A5503Cp+0f, 0x1.9C4918p+0f, 0x1.93737Cp+0f, 0x1.8ACE54p+0f,
+ 0x1.82589Ap+0f, 0x1.7A1148p+0f, 0x1.71F75Ep+0f, 0x1.6A09E6p+0f);
+ const __m512 vtable_lo = _mm512_set_ps(
+ 0x1.6247ECp+0f, 0x1.5AB07Ep+0f, 0x1.5342B6p+0f, 0x1.4BFDAEp+0f,
+ 0x1.44E086p+0f, 0x1.3DEA64p+0f, 0x1.371A74p+0f, 0x1.306FE0p+0f,
+ 0x1.29E9E0p+0f, 0x1.2387A6p+0f, 0x1.1D4874p+0f, 0x1.172B84p+0f,
+ 0x1.11301Ep+0f, 0x1.0B5586p+0f, 0x1.059B0Ep+0f, 0x1.000000p+0f);
+
+ for (; n != 0; n -= 16 * sizeof(float)) {
+ const __m512 vx = _mm512_loadu_ps(input);
+
+ // Compute reduced argument n := round(x / log(2), 5).
+ // We do it by adding a large number (magic bias), which cause rounding of result to 5 fractional bits, then
+ // subtracing the large number back. The first addition is combined with multiplication by log2e into a single FMA
+ // instruction. The trick with adding large number is valid only within certain bounds (|x| <= 2**17), but thats
+ // ok, because inputs outside of [-103.97207, 88.72283] underflow or overflow expf(x) anyway. We fixup the result
+ // for such inputs at the very end of the algorithm.
+ __m512 vn = _mm512_fmadd_ps(vx, vlog2e, vmagic_bias);
+
+ // Use the low 5 bits of n (as integer) for table lookup.
+ const __m512 vl = _mm512_permutex2var_ps(vtable_lo, _mm512_castps_si512(vn), vtable_hi);
+
+ // Subtract the large number back to get final n := round(x / log(2), 5).
+ vn = _mm512_sub_ps(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - n * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2_hi, vx);
+ vt = _mm512_fmadd_ps(vn, vminus_ln2_lo, vt);
+
+ // Compute degree-2 polynomial approximation for exp(t) on [-log(2)/64, log(2)/64].
+ // p = l * (1 + t * (c1 + t * c2))
+ // = l + l * t * (c1 + t * c2)
+ __m512 vp = _mm512_fmadd_ps(vt, vc2, vc1);
+ vt = _mm512_mul_ps(vt, vl);
+ vp = _mm512_fmadd_ps(vt, vp, vl);
+
+ // Reconstruct the final value as f = exp2(floor(n)) * p.
+ const __m512 vf = _mm512_scalef_ps(vp, vn);
+ _mm512_storeu_ps(output, vf);
+
+ input += 16;
+ output += 16;
+ }
+}
diff --git a/src/xnnpack/math-stubs.h b/src/xnnpack/math-stubs.h
index ecfb29fba..dc7ef65cf 100644
--- a/src/xnnpack/math-stubs.h
+++ b/src/xnnpack/math-stubs.h
@@ -93,7 +93,9 @@ DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_exp__avx2_lut8_p4_perm)
DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_exp__avx512f_p5)
DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_exp__avx512f_p5_scalef)
DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_exp__avx512f_lut16_p3_perm)
+DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_exp__avx512f_lut16_p3_perm_scalef)
DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_exp__avx512f_lut32_p2_perm2)
+DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_exp__avx512f_lut32_p2_perm2_scalef)
DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_expminus__neonfma_p5)
DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_expminus__neonfma_lut64_p2)