author     Szabolcs Nagy <szabolcs.nagy@arm.com>    2020-01-10 15:10:45 +0000
committer  Szabolcs Nagy <szabolcs.nagy@arm.com>    2020-01-14 12:47:48 +0000
commit     a807c9bba198cc89ddd6cb177442e1e297c935b3 (patch)
tree       2d8578dfdc4d6d45257a4518f8244396e876cd7b /math
parent     099350affd8bd6eebf9d1e067b102530740b7270 (diff)
math: add vector pow
This implementation is a wrapper around the scalar pow with the appropriate call ABI. As such it is not expected to be faster than scalar calls; the new double-precision vector pow symbols are provided for completeness.
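For reference, a minimal usage sketch of the new symbols from C, assuming an AArch64 toolchain with GNU vector extensions; the __f64x2_t type and the _ZGVnN2vv_pow prototype come from math/include/mathlib.h below, while the test driver itself is not part of the patch:

/* Minimal usage sketch (not part of the patch): calls the new
   double-precision vector pow symbol under the vector PCS.  Assumes an
   AArch64 compiler with GNU vector extensions; types and prototypes are
   taken from math/include/mathlib.h.  */
#include <stdio.h>
#include "mathlib.h"

int
main (void)
{
  __f64x2_t x = {2.0, 3.0};
  __f64x2_t y = {10.0, 0.5};
  /* Each lane computes pow (x[i], y[i]); the wrapper loops over the lanes
     and calls the scalar pow, so results match the scalar routine.  */
  __f64x2_t z = _ZGVnN2vv_pow (x, y);
  printf ("%g %g\n", z[0], z[1]);	/* 1024 and sqrt (3) */
  return 0;
}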
Diffstat (limited to 'math')
-rw-r--r--  math/include/mathlib.h    4
-rw-r--r--  math/s_pow.c              6
-rw-r--r--  math/test/mathbench.c    28
-rwxr-xr-x  math/test/runulp.sh      15
-rw-r--r--  math/test/ulp.c           7
-rw-r--r--  math/v_math.h            30
-rw-r--r--  math/v_pow.c             27
-rw-r--r--  math/vn_pow.c            12
8 files changed, 129 insertions, 0 deletions
diff --git a/math/include/mathlib.h b/math/include/mathlib.h
index 254954a..4493008 100644
--- a/math/include/mathlib.h
+++ b/math/include/mathlib.h
@@ -36,6 +36,7 @@ double __s_sin (double);
double __s_cos (double);
double __s_exp (double);
double __s_log (double);
+double __s_pow (double, double);
#if __aarch64__
#if __GNUC__ >= 5
@@ -61,6 +62,7 @@ __f64x2_t __v_sin (__f64x2_t);
__f64x2_t __v_cos (__f64x2_t);
__f64x2_t __v_exp (__f64x2_t);
__f64x2_t __v_log (__f64x2_t);
+__f64x2_t __v_pow (__f64x2_t, __f64x2_t);
#if __GNUC__ >= 9 || __clang_major__ >= 8
#define __vpcs __attribute__((__aarch64_vector_pcs__))
@@ -78,6 +80,7 @@ __vpcs __f64x2_t __vn_sin (__f64x2_t);
__vpcs __f64x2_t __vn_cos (__f64x2_t);
__vpcs __f64x2_t __vn_exp (__f64x2_t);
__vpcs __f64x2_t __vn_log (__f64x2_t);
+__vpcs __f64x2_t __vn_pow (__f64x2_t, __f64x2_t);
/* Vector functions following the vector PCS using ABI names. */
__vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
@@ -90,6 +93,7 @@ __vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t);
#endif
#endif
diff --git a/math/s_pow.c b/math/s_pow.c
new file mode 100644
index 0000000..2e34c9f
--- /dev/null
+++ b/math/s_pow.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#define SCALAR 1
+#include "v_pow.c"
diff --git a/math/test/mathbench.c b/math/test/mathbench.c
index 8d3ff1d..33ceda3 100644
--- a/math/test/mathbench.c
+++ b/math/test/mathbench.c
@@ -128,6 +128,18 @@ xy_Z_powf (v_float x)
{
return _ZGVnN4vv_powf (x, x);
}
+
+__vpcs static v_double
+xy__vn_pow (v_double x)
+{
+ return __vn_pow (x, x);
+}
+
+__vpcs static v_double
+xy_Z_pow (v_double x)
+{
+ return _ZGVnN2vv_pow (x, x);
+}
#endif
static v_float
@@ -135,6 +147,12 @@ xy__v_powf (v_float x)
{
return __v_powf (x, x);
}
+
+static v_double
+xy__v_pow (v_double x)
+{
+ return __v_pow (x, x);
+}
#endif
static float
@@ -142,6 +160,12 @@ xy__s_powf (float x)
{
return __s_powf (x, x);
}
+
+static double
+xy__s_pow (double x)
+{
+ return __s_pow (x, x);
+}
#endif
static double
@@ -256,6 +280,7 @@ D (__s_sin, -3.1, 3.1)
D (__s_cos, -3.1, 3.1)
D (__s_exp, -9.9, 9.9)
D (__s_log, 0.01, 11.1)
+{"__s_pow", 'd', 0, 0.01, 11.1, {.d = xy__s_pow}},
F (__s_expf, -9.9, 9.9)
F (__s_expf_1u, -9.9, 9.9)
F (__s_exp2f, -9.9, 9.9)
@@ -270,6 +295,7 @@ VD (__v_sin, -3.1, 3.1)
VD (__v_cos, -3.1, 3.1)
VD (__v_exp, -9.9, 9.9)
VD (__v_log, 0.01, 11.1)
+{"__v_pow", 'd', 'v', 0.01, 11.1, {.vd = xy__v_pow}},
VF (__v_dummyf, 1.0, 2.0)
VF (__v_expf, -9.9, 9.9)
VF (__v_expf_1u, -9.9, 9.9)
@@ -285,6 +311,8 @@ VND (__vn_exp, -9.9, 9.9)
VND (_ZGVnN2v_exp, -9.9, 9.9)
VND (__vn_log, 0.01, 11.1)
VND (_ZGVnN2v_log, 0.01, 11.1)
+{"__vn_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy__vn_pow}},
+{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}},
VND (__vn_sin, -3.1, 3.1)
VND (_ZGVnN2v_sin, -3.1, 3.1)
VND (__vn_cos, -3.1, 3.1)
diff --git a/math/test/runulp.sh b/math/test/runulp.sh
index 44393b8..ea524ca 100755
--- a/math/test/runulp.sh
+++ b/math/test/runulp.sh
@@ -110,6 +110,15 @@ range_exp='
-633.3 -777.3 10000
'
+range_pow='
+ 0x1p-1 0x1p1 x 0x1p-10 0x1p10 50000
+ 0x1p-1 0x1p1 x -0x1p-10 -0x1p10 50000
+ 0x1p-500 0x1p500 x 0x1p-1 0x1p1 50000
+ 0x1p-500 0x1p500 x -0x1p-1 -0x1p1 50000
+ 0x1.ep-1 0x1.1p0 x 0x1p8 0x1p16 50000
+ 0x1.ep-1 0x1.1p0 x -0x1p8 -0x1p16 50000
+'
+
range_expf='
0 0xffff0000 10000
0x1p-14 0x1p8 500000
@@ -143,6 +152,7 @@ range_powf='
# error limits
L_exp=1.9
+L_pow=0.05
L_expf=1.49
L_expf_1u=0.4
L_exp2f=1.49
@@ -173,6 +183,11 @@ exp __v_exp $runv
exp __vn_exp $runvn
exp _ZGVnN2v_exp $runvn
+pow __s_pow $runs
+pow __v_pow $runv
+pow __vn_pow $runvn
+pow _ZGVnN2vv_pow $runvn
+
expf __s_expf $runs
expf __v_expf $runv
expf __vn_expf $runvn
diff --git a/math/test/ulp.c b/math/test/ulp.c
index b746080..444bbca 100644
--- a/math/test/ulp.c
+++ b/math/test/ulp.c
@@ -240,6 +240,7 @@ static double v_sin(double x) { return __v_sin(argd(x))[0]; }
static double v_cos(double x) { return __v_cos(argd(x))[0]; }
static double v_exp(double x) { return __v_exp(argd(x))[0]; }
static double v_log(double x) { return __v_log(argd(x))[0]; }
+static double v_pow(double x, double y) { return __v_pow(argd(x),argd(y))[0]; }
#ifdef __vpcs
static float vn_sinf(float x) { return __vn_sinf(argf(x))[0]; }
static float vn_cosf(float x) { return __vn_cosf(argf(x))[0]; }
@@ -253,6 +254,7 @@ static double vn_sin(double x) { return __vn_sin(argd(x))[0]; }
static double vn_cos(double x) { return __vn_cos(argd(x))[0]; }
static double vn_exp(double x) { return __vn_exp(argd(x))[0]; }
static double vn_log(double x) { return __vn_log(argd(x))[0]; }
+static double vn_pow(double x, double y) { return __vn_pow(argd(x),argd(y))[0]; }
static float Z_sinf(float x) { return _ZGVnN4v_sinf(argf(x))[0]; }
static float Z_cosf(float x) { return _ZGVnN4v_cosf(argf(x))[0]; }
static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; }
@@ -263,6 +265,7 @@ static double Z_sin(double x) { return _ZGVnN2v_sin(argd(x))[0]; }
static double Z_cos(double x) { return _ZGVnN2v_cos(argd(x))[0]; }
static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; }
static double Z_log(double x) { return _ZGVnN2v_log(argd(x))[0]; }
+static double Z_pow(double x, double y) { return _ZGVnN2vv_pow(argd(x),argd(y))[0]; }
#endif
#endif
@@ -334,6 +337,7 @@ static const struct fun fun[] = {
F (__s_cos, __s_cos, cosl, mpfr_cos, 1, 0, d1, 0)
F (__s_exp, __s_exp, expl, mpfr_exp, 1, 0, d1, 0)
F (__s_log, __s_log, logl, mpfr_log, 1, 0, d1, 0)
+ F (__s_pow, __s_pow, powl, mpfr_pow, 2, 0, d2, 0)
#if __aarch64__
F (__v_sinf, v_sinf, sin, mpfr_sin, 1, 1, f1, 1)
F (__v_cosf, v_cosf, cos, mpfr_cos, 1, 1, f1, 1)
@@ -347,6 +351,7 @@ static const struct fun fun[] = {
F (__v_cos, v_cos, cosl, mpfr_cos, 1, 0, d1, 1)
F (__v_exp, v_exp, expl, mpfr_exp, 1, 0, d1, 1)
F (__v_log, v_log, logl, mpfr_log, 1, 0, d1, 1)
+ F (__v_pow, v_pow, powl, mpfr_pow, 2, 0, d2, 1)
#ifdef __vpcs
F (__vn_sinf, vn_sinf, sin, mpfr_sin, 1, 1, f1, 1)
F (__vn_cosf, vn_cosf, cos, mpfr_cos, 1, 1, f1, 1)
@@ -360,6 +365,7 @@ static const struct fun fun[] = {
F (__vn_cos, vn_cos, cosl, mpfr_cos, 1, 0, d1, 1)
F (__vn_exp, vn_exp, expl, mpfr_exp, 1, 0, d1, 1)
F (__vn_log, vn_log, logl, mpfr_log, 1, 0, d1, 1)
+ F (__vn_pow, vn_pow, powl, mpfr_pow, 2, 0, d2, 1)
F (_ZGVnN4v_sinf, Z_sinf, sin, mpfr_sin, 1, 1, f1, 1)
F (_ZGVnN4v_cosf, Z_cosf, cos, mpfr_cos, 1, 1, f1, 1)
F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1)
@@ -370,6 +376,7 @@ static const struct fun fun[] = {
F (_ZGVnN2v_cos, Z_cos, cosl, mpfr_cos, 1, 0, d1, 1)
F (_ZGVnN2v_exp, Z_exp, expl, mpfr_exp, 1, 0, d1, 1)
F (_ZGVnN2v_log, Z_log, logl, mpfr_log, 1, 0, d1, 1)
+ F (_ZGVnN2vv_pow, Z_pow, powl, mpfr_pow, 2, 0, d2, 1)
#endif
#endif
#endif
diff --git a/math/v_math.h b/math/v_math.h
index 0861e98..3db22e5 100644
--- a/math/v_math.h
+++ b/math/v_math.h
@@ -249,6 +249,11 @@ v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y,
return f (x1, x2);
}
+static inline int
+v_lanes64 (void)
+{
+ return 1;
+}
static inline v_f64_t
v_f64 (f64_t x)
{
@@ -264,6 +269,16 @@ v_s64 (s64_t x)
{
return x;
}
+static inline f64_t
+v_get_f64 (v_f64_t x, int i)
+{
+ return x;
+}
+static inline void
+v_set_f64 (v_f64_t *x, int i, f64_t v)
+{
+ *x = v;
+}
/* true if any elements of a v_cond result is non-zero. */
static inline int
v_any_u64 (v_u64_t x)
@@ -506,6 +521,11 @@ v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y,
p[2] ? f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3]};
}
+static inline int
+v_lanes64 (void)
+{
+ return 2;
+}
static inline v_f64_t
v_f64 (f64_t x)
{
@@ -521,6 +541,16 @@ v_s64 (s64_t x)
{
return (v_s64_t){x, x};
}
+static inline f64_t
+v_get_f64 (v_f64_t x, int i)
+{
+ return x[i];
+}
+static inline void
+v_set_f64 (v_f64_t *x, int i, f64_t v)
+{
+ (*x)[i] = v;
+}
/* true if any elements of a v_cond result is non-zero. */
static inline int
v_any_u64 (v_u64_t x)
diff --git a/math/v_pow.c b/math/v_pow.c
new file mode 100644
index 0000000..a209d57
--- /dev/null
+++ b/math/v_pow.c
@@ -0,0 +1,27 @@
+/*
+ * Double-precision vector pow function.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#if V_SUPPORTED
+
+VPCS_ATTR
+v_f64_t
+V_NAME(pow) (v_f64_t x, v_f64_t y)
+{
+ v_f64_t z;
+ for (int lane = 0; lane < v_lanes64 (); lane++)
+ {
+ f64_t sx = v_get_f64 (x, lane);
+ f64_t sy = v_get_f64 (y, lane);
+ f64_t sz = pow (sx, sy);
+ v_set_f64 (&z, lane, sz);
+ }
+ return z;
+}
+VPCS_ALIAS
+#endif
diff --git a/math/vn_pow.c b/math/vn_pow.c
new file mode 100644
index 0000000..2609501
--- /dev/null
+++ b/math/vn_pow.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_pow.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#include "mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_pow, _ZGVnN2vv_pow)
+#include "v_pow.c"
+#endif