aboutsummaryrefslogtreecommitdiff
path: root/math/include/mathlib.h
diff options
context:
space:
mode:
authorSzabolcs Nagy <szabolcs.nagy@arm.com>2019-08-09 16:24:59 +0100
committerSzabolcs Nagy <szabolcs.nagy@arm.com>2019-10-14 11:58:53 +0100
commitba75d0a0d8235119eef1305f0d9a2142fa4b11e0 (patch)
tree2e5fe02ff810134efff2ad534b0a8c2e61e075c0 /math/include/mathlib.h
parentc5cba8528da13fe0d647dbd0f80d0cf21434b224 (diff)
downloadarm-optimized-routines-ba75d0a0d8235119eef1305f0d9a2142fa4b11e0.tar.gz
Add vector powf
Essentially, the scalar powf algorithm is used for each element in the vector, just inlined for better scheduling and simpler special-case handling. The log polynomial is smaller, as less accuracy is enough. Worst-case error is 2.6 ulp.
Diffstat (limited to 'math/include/mathlib.h')
-rw-r--r--math/include/mathlib.h4
1 file changed, 4 insertions, 0 deletions
diff --git a/math/include/mathlib.h b/math/include/mathlib.h
index 405cf4a..1788502 100644
--- a/math/include/mathlib.h
+++ b/math/include/mathlib.h
@@ -29,6 +29,7 @@ float __s_cosf (float);
float __s_expf (float);
float __s_expf_1u (float);
float __s_logf (float);
+float __s_powf (float, float);
double __s_exp (double);
#if __aarch64__
@@ -48,6 +49,7 @@ __f32x4_t __v_cosf (__f32x4_t);
__f32x4_t __v_expf (__f32x4_t);
__f32x4_t __v_expf_1u (__f32x4_t);
__f32x4_t __v_logf (__f32x4_t);
+__f32x4_t __v_powf (__f32x4_t, __f32x4_t);
__f64x2_t __v_exp (__f64x2_t);
#if __GNUC__ >= 9 || __clang_major__ >= 8
@@ -59,6 +61,7 @@ __vpcs __f32x4_t __vn_cosf (__f32x4_t);
__vpcs __f32x4_t __vn_expf (__f32x4_t);
__vpcs __f32x4_t __vn_expf_1u (__f32x4_t);
__vpcs __f32x4_t __vn_logf (__f32x4_t);
+__vpcs __f32x4_t __vn_powf (__f32x4_t, __f32x4_t);
__vpcs __f64x2_t __vn_exp (__f64x2_t);
/* Vector functions following the vector PCS using ABI names. */
@@ -66,6 +69,7 @@ __vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4vv_powf (__f32x4_t, __f32x4_t);
__vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
#endif
#endif