1 files changed, 855 insertions, 0 deletions
diff --git a/pl/math/v_math.h b/pl/math/v_math.h
new file mode 100644
index 0000000..a8fa091
--- /dev/null
+++ b/pl/math/v_math.h
@@ -0,0 +1,855 @@
+/*
+ * Vector math abstractions.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _V_MATH_H
+#define _V_MATH_H
+
+#ifndef WANT_VMATH
+/* Enable the build of vector math code.  */
+# define WANT_VMATH 1
+#endif
+#if WANT_VMATH
+
+/* The goal of this header is to allow vector (only Neon for now)
+   and scalar build of the same algorithm. */
+
+#if SCALAR
+#define V_NAME(x) __s_##x
+#elif VPCS && __aarch64__
+#define V_NAME(x) __vn_##x
+#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
+#else
+#define V_NAME(x) __v_##x
+#endif
+
+#ifndef VPCS_ATTR
+#define VPCS_ATTR
+#endif
+#ifndef VPCS_ALIAS
+#define VPCS_ALIAS
+#endif
+
+#include <stdint.h>
+#include "math_config.h"
+
+typedef float f32_t;
+typedef uint32_t u32_t;
+typedef int32_t s32_t;
+typedef double f64_t;
+typedef uint64_t u64_t;
+typedef int64_t s64_t;
+
+/* reinterpret as type1 from type2.  */
+static inline u32_t
+as_u32_f32 (f32_t x)
+{
+  union { f32_t f; u32_t u; } r = {x};
+  return r.u;
+}
+static inline f32_t
+as_f32_u32 (u32_t x)
+{
+  union { u32_t u; f32_t f; } r = {x};
+  return r.f;
+}
+static inline s32_t
+as_s32_u32 (u32_t x)
+{
+  union { u32_t u; s32_t i; } r = {x};
+  return r.i;
+}
+static inline u32_t
+as_u32_s32 (s32_t x)
+{
+  union { s32_t i; u32_t u; } r = {x};
+  return r.u;
+}
+static inline u64_t
+as_u64_f64 (f64_t x)
+{
+  union { f64_t f; u64_t u; } r = {x};
+  return r.u;
+}
+static inline f64_t
+as_f64_u64 (u64_t x)
+{
+  union { u64_t u; f64_t f; } r = {x};
+  return r.f;
+}
+static inline s64_t
+as_s64_u64 (u64_t x)
+{
+  union { u64_t u; s64_t i; } r = {x};
+  return r.i;
+}
+static inline u64_t
+as_u64_s64 (s64_t x)
+{
+  union { s64_t i; u64_t u; } r = {x};
+  return r.u;
+}
+
+#if SCALAR
+#define V_SUPPORTED 1
+typedef f32_t v_f32_t;
+typedef u32_t v_u32_t;
+typedef s32_t v_s32_t;
+typedef f64_t v_f64_t;
+typedef u64_t v_u64_t;
+typedef s64_t v_s64_t;
+
+static inline int
+v_lanes32 (void)
+{
+  return 1;
+}
+
+static inline v_f32_t
+v_f32 (f32_t x)
+{
+  return x;
+}
+static inline v_u32_t
+v_u32 (u32_t x)
+{
+  return x;
+}
+static inline v_s32_t
+v_s32 (s32_t x)
+{
+  return x;
+}
+
+static inline f32_t
+v_get_f32 (v_f32_t x, int i)
+{
+  return x;
+}
+static inline u32_t
+v_get_u32 (v_u32_t x, int i)
+{
+  return x;
+}
+static inline s32_t
+v_get_s32 (v_s32_t x, int i)
+{
+  return x;
+}
+
+static inline void
+v_set_f32 (v_f32_t *x, int i, f32_t v)
+{
+  *x = v;
+}
+static inline void
+v_set_u32 (v_u32_t *x, int i, u32_t v)
+{
+  *x = v;
+}
+static inline void
+v_set_s32 (v_s32_t *x, int i, s32_t v)
+{
+  *x = v;
+}
+
+/* true if any elements of a v_cond result is non-zero.  */
+static inline int
+v_any_u32 (v_u32_t x)
+{
+  return x != 0;
+}
+/* to wrap the result of relational operators.  */
+static inline v_u32_t
+v_cond_u32 (v_u32_t x)
+{
+  return x ? -1 : 0;
+}
+static inline v_f32_t
+v_abs_f32 (v_f32_t x)
+{
+  return __builtin_fabsf (x);
+}
+static inline v_u32_t
+v_bsl_u32 (v_u32_t m, v_u32_t x, v_u32_t y)
+{
+  return (y & ~m) | (x & m);
+}
+static inline v_u32_t
+v_cagt_f32 (v_f32_t x, v_f32_t y)
+{
+  return fabsf (x) > fabsf (y);
+}
+/* to wrap |x| >= |y|.  */
+static inline v_u32_t
+v_cage_f32 (v_f32_t x, v_f32_t y)
+{
+  return fabsf (x) >= fabsf (y);
+}
+static inline v_u32_t
+v_calt_f32 (v_f32_t x, v_f32_t y)
+{
+  return fabsf (x) < fabsf (y);
+}
+static inline v_f32_t
+v_div_f32 (v_f32_t x, v_f32_t y)
+{
+  return x / y;
+}
+static inline v_f32_t
+v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
+{
+  return __builtin_fmaf (x, y, z);
+}
+static inline v_f32_t
+v_round_f32 (v_f32_t x)
+{
+  return __builtin_roundf (x);
+}
+static inline v_s32_t
+v_round_s32 (v_f32_t x)
+{
+  return __builtin_lroundf (x); /* relies on -fno-math-errno.  */
+}
+static inline v_f32_t
+v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y)
+{
+  return p ? x : y;
+}
+static inline v_u32_t
+v_sel_u32 (v_u32_t p, v_u32_t x, v_u32_t y)
+{
+  return p ? x : y;
+}
+static inline v_f32_t
+v_sqrt_f32 (v_f32_t x)
+{
+  return __builtin_sqrtf (x);
+}
+/* convert to type1 from type2.  */
+static inline v_f32_t
+v_to_f32_s32 (v_s32_t x)
+{
+  return x;
+}
+static inline v_s32_t
+v_to_s32_f32 (v_f32_t x)
+{
+  return x;
+}
+static inline v_f32_t
+v_to_f32_u32 (v_u32_t x)
+{
+  return x;
+}
+/* reinterpret as type1 from type2.  */
+static inline v_u32_t
+v_as_u32_f32 (v_f32_t x)
+{
+  union { v_f32_t f; v_u32_t u; } r = {x};
+  return r.u;
+}
+static inline v_s32_t
+v_as_s32_f32 (v_f32_t x)
+{
+  union
+  {
+    v_f32_t f;
+    v_s32_t u;
+  } r = {x};
+  return r.u;
+}
+static inline v_f32_t
+v_as_f32_u32 (v_u32_t x)
+{
+  union { v_u32_t u; v_f32_t f; } r = {x};
+  return r.f;
+}
+static inline v_s32_t
+v_as_s32_u32 (v_u32_t x)
+{
+  union { v_u32_t u; v_s32_t i; } r = {x};
+  return r.i;
+}
+static inline v_u32_t
+v_as_u32_s32 (v_s32_t x)
+{
+  union { v_s32_t i; v_u32_t u; } r = {x};
+  return r.u;
+}
+static inline v_f32_t
+v_lookup_f32 (const f32_t *tab, v_u32_t idx)
+{
+  return tab[idx];
+}
+static inline v_u32_t
+v_lookup_u32 (const u32_t *tab, v_u32_t idx)
+{
+  return tab[idx];
+}
+static inline v_f32_t
+v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p)
+{
+  return f (x);
+}
+static inline v_f32_t
+v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y,
+	     v_u32_t p)
+{
+  return f (x1, x2);
+}
+
+static inline int
+v_lanes64 (void)
+{
+  return 1;
+}
+static inline v_f64_t
+v_f64 (f64_t x)
+{
+  return x;
+}
+static inline v_u64_t
+v_u64 (u64_t x)
+{
+  return x;
+}
+static inline v_s64_t
+v_s64 (s64_t x)
+{
+  return x;
+}
+static inline f64_t
+v_get_f64 (v_f64_t x, int i)
+{
+  return x;
+}
+static inline void
+v_set_f64 (v_f64_t *x, int i, f64_t v)
+{
+  *x = v;
+}
+/* true if any elements of a v_cond result is non-zero.  */
+static inline int
+v_any_u64 (v_u64_t x)
+{
+  return x != 0;
+}
+/* true if all elements of a v_cond result is non-zero.  */
+static inline int
+v_all_u64 (v_u64_t x)
+{
+  return x;
+}
+/* to wrap the result of relational operators.  */
+static inline v_u64_t
+v_cond_u64 (v_u64_t x)
+{
+  return x ? -1 : 0;
+}
+static inline v_f64_t
+v_abs_f64 (v_f64_t x)
+{
+  return __builtin_fabs (x);
+}
+static inline v_u64_t
+v_bsl_u64 (v_u64_t m, v_u64_t x, v_u64_t y)
+{
+  return (y & ~m) | (x & m);
+}
+static inline v_u64_t
+v_cagt_f64 (v_f64_t x, v_f64_t y)
+{
+  return fabs (x) > fabs (y);
+}
+static inline v_f64_t
+v_div_f64 (v_f64_t x, v_f64_t y)
+{
+  return x / y;
+}
+static inline v_f64_t
+v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z)
+{
+  return __builtin_fma (x, y, z);
+}
+static inline v_f64_t
+v_min_f64(v_f64_t x, v_f64_t y) {
+  return x < y ? x : y;
+}
+static inline v_f64_t
+v_round_f64 (v_f64_t x)
+{
+  return __builtin_round (x);
+}
+static inline v_f64_t
+v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y)
+{
+  return p ? x : y;
+}
+static inline v_f64_t
+v_sqrt_f64 (v_f64_t x)
+{
+  return __builtin_sqrt (x);
+}
+static inline v_s64_t
+v_round_s64 (v_f64_t x)
+{
+  return __builtin_lround (x); /* relies on -fno-math-errno.  */
+}
+static inline v_u64_t
+v_trunc_u64 (v_f64_t x)
+{
+  return __builtin_trunc (x);
+}
+/* convert to type1 from type2.  */
+static inline v_f64_t
+v_to_f64_s64 (v_s64_t x)
+{
+  return x;
+}
+static inline v_f64_t
+v_to_f64_u64 (v_u64_t x)
+{
+  return x;
+}
+
+static inline v_s64_t
+v_to_s64_f64 (v_f64_t x)
+{
+  return x;
+}
+/* reinterpret as type1 from type2.  */
+static inline v_u64_t
+v_as_u64_f64 (v_f64_t x)
+{
+  union { v_f64_t f; v_u64_t u; } r = {x};
+  return r.u;
+}
+static inline v_f64_t
+v_as_f64_u64 (v_u64_t x)
+{
+  union { v_u64_t u; v_f64_t f; } r = {x};
+  return r.f;
+}
+static inline v_s64_t
+v_as_s64_u64 (v_u64_t x)
+{
+  union { v_u64_t u; v_s64_t i; } r = {x};
+  return r.i;
+}
+static inline v_u64_t
+v_as_u64_s64 (v_s64_t x)
+{
+  union { v_s64_t i; v_u64_t u; } r = {x};
+  return r.u;
+}
+static inline v_f64_t
+v_lookup_f64 (const f64_t *tab, v_u64_t idx)
+{
+  return tab[idx];
+}
+static inline v_u64_t
+v_lookup_u64 (const u64_t *tab, v_u64_t idx)
+{
+  return tab[idx];
+}
+static inline v_f64_t
+v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p)
+{
+  return f (x);
+}
+static inline v_f64_t
+v_call2_f64 (f64_t (*f) (f64_t, f64_t), v_f64_t x1, v_f64_t x2, v_f64_t y,
+	     v_u64_t p)
+{
+  return f (x1, x2);
+}
+
+#elif __aarch64__
+#define V_SUPPORTED 1
+#include <arm_neon.h>
+typedef float32x4_t v_f32_t;
+typedef uint32x4_t v_u32_t;
+typedef int32x4_t v_s32_t;
+typedef float64x2_t v_f64_t;
+typedef uint64x2_t v_u64_t;
+typedef int64x2_t v_s64_t;
+
+static inline int
+v_lanes32 (void)
+{
+  return 4;
+}
+
+static inline v_f32_t
+v_f32 (f32_t x)
+{
+  return (v_f32_t){x, x, x, x};
+}
+static inline v_u32_t
+v_u32 (u32_t x)
+{
+  return (v_u32_t){x, x, x, x};
+}
+static inline v_s32_t
+v_s32 (s32_t x)
+{
+  return (v_s32_t){x, x, x, x};
+}
+
+static inline f32_t
+v_get_f32 (v_f32_t x, int i)
+{
+  return x[i];
+}
+static inline u32_t
+v_get_u32 (v_u32_t x, int i)
+{
+  return x[i];
+}
+static inline s32_t
+v_get_s32 (v_s32_t x, int i)
+{
+  return x[i];
+}
+
+static inline void
+v_set_f32 (v_f32_t *x, int i, f32_t v)
+{
+  (*x)[i] = v;
+}
+static inline void
+v_set_u32 (v_u32_t *x, int i, u32_t v)
+{
+  (*x)[i] = v;
+}
+static inline void
+v_set_s32 (v_s32_t *x, int i, s32_t v)
+{
+  (*x)[i] = v;
+}
+
+/* true if any elements of a v_cond result is non-zero.  */
+static inline int
+v_any_u32 (v_u32_t x)
+{
+  /* assume elements in x are either 0 or -1u.  */
+  return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0;
+}
+/* to wrap the result of relational operators.  */
+static inline v_u32_t
+v_cond_u32 (v_u32_t x)
+{
+  return x;
+}
+static inline v_f32_t
+v_abs_f32 (v_f32_t x)
+{
+  return vabsq_f32 (x);
+}
+static inline v_u32_t
+v_bsl_u32 (v_u32_t m, v_u32_t x, v_u32_t y)
+{
+  return vbslq_u32 (m, x, y);
+}
+static inline v_u32_t
+v_cagt_f32 (v_f32_t x, v_f32_t y)
+{
+  return vcagtq_f32 (x, y);
+}
+/* to wrap |x| >= |y|.  */
+static inline v_u32_t
+v_cage_f32 (v_f32_t x, v_f32_t y)
+{
+  return vcageq_f32 (x, y);
+}
+static inline v_u32_t
+v_calt_f32 (v_f32_t x, v_f32_t y)
+{
+  return vcaltq_f32 (x, y);
+}
+static inline v_f32_t
+v_div_f32 (v_f32_t x, v_f32_t y)
+{
+  return vdivq_f32 (x, y);
+}
+static inline v_f32_t
+v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
+{
+  return vfmaq_f32 (z, x, y);
+}
+static inline v_f32_t
+v_round_f32 (v_f32_t x)
+{
+  return vrndaq_f32 (x);
+}
+static inline v_s32_t
+v_round_s32 (v_f32_t x)
+{
+  return vcvtaq_s32_f32 (x);
+}
+static inline v_f32_t
+v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y)
+{
+  return vbslq_f32 (p, x, y);
+}
+static inline v_u32_t
+v_sel_u32 (v_u32_t p, v_u32_t x, v_u32_t y)
+{
+  return vbslq_u32 (p, x, y);
+}
+static inline v_f32_t
+v_sqrt_f32 (v_f32_t x)
+{
+  return vsqrtq_f32 (x);
+}
+/* convert to type1 from type2.  */
+static inline v_f32_t
+v_to_f32_s32 (v_s32_t x)
+{
+  return (v_f32_t){x[0], x[1], x[2], x[3]};
+}
+static inline v_s32_t
+v_to_s32_f32 (v_f32_t x)
+{
+  return vcvtq_s32_f32 (x);
+}
+static inline v_f32_t
+v_to_f32_u32 (v_u32_t x)
+{
+  return (v_f32_t){x[0], x[1], x[2], x[3]};
+}
+/* reinterpret as type1 from type2.  */
+static inline v_u32_t
+v_as_u32_f32 (v_f32_t x)
+{
+  union { v_f32_t f; v_u32_t u; } r = {x};
+  return r.u;
+}
+static inline v_s32_t
+v_as_s32_f32 (v_f32_t x)
+{
+  union
+  {
+    v_f32_t f;
+    v_s32_t u;
+  } r = {x};
+  return r.u;
+}
+static inline v_f32_t
+v_as_f32_u32 (v_u32_t x)
+{
+  union { v_u32_t u; v_f32_t f; } r = {x};
+  return r.f;
+}
+static inline v_s32_t
+v_as_s32_u32 (v_u32_t x)
+{
+  union { v_u32_t u; v_s32_t i; } r = {x};
+  return r.i;
+}
+static inline v_u32_t
+v_as_u32_s32 (v_s32_t x)
+{
+  union { v_s32_t i; v_u32_t u; } r = {x};
+  return r.u;
+}
+static inline v_f32_t
+v_lookup_f32 (const f32_t *tab, v_u32_t idx)
+{
+  return (v_f32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
+}
+static inline v_u32_t
+v_lookup_u32 (const u32_t *tab, v_u32_t idx)
+{
+  return (v_u32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
+}
+static inline v_f32_t
+v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p)
+{
+  return (v_f32_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1],
+		   p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]};
+}
+static inline v_f32_t
+v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y,
+	     v_u32_t p)
+{
+  return (
+    v_f32_t){p[0] ? f (x1[0], x2[0]) : y[0], p[1] ? f (x1[1], x2[1]) : y[1],
+	     p[2] ? f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3]};
+}
+
+static inline int
+v_lanes64 (void)
+{
+  return 2;
+}
+static inline v_f64_t
+v_f64 (f64_t x)
+{
+  return (v_f64_t){x, x};
+}
+static inline v_u64_t
+v_u64 (u64_t x)
+{
+  return (v_u64_t){x, x};
+}
+static inline v_s64_t
+v_s64 (s64_t x)
+{
+  return (v_s64_t){x, x};
+}
+static inline f64_t
+v_get_f64 (v_f64_t x, int i)
+{
+  return x[i];
+}
+static inline void
+v_set_f64 (v_f64_t *x, int i, f64_t v)
+{
+  (*x)[i] = v;
+}
+/* true if any elements of a v_cond result is non-zero.  */
+static inline int
+v_any_u64 (v_u64_t x)
+{
+  /* assume elements in x are either 0 or -1u.  */
+  return vpaddd_u64 (x) != 0;
+}
+/* true if all elements of a v_cond result is 1.  */
+static inline int
+v_all_u64 (v_u64_t x)
+{
+  /* assume elements in x are either 0 or -1u.  */
+  return vpaddd_s64 (vreinterpretq_s64_u64 (x)) == -2;
+}
+/* to wrap the result of relational operators.  */
+static inline v_u64_t
+v_cond_u64 (v_u64_t x)
+{
+  return x;
+}
+static inline v_f64_t
+v_abs_f64 (v_f64_t x)
+{
+  return vabsq_f64 (x);
+}
+static inline v_u64_t
+v_bsl_u64 (v_u64_t m, v_u64_t x, v_u64_t y)
+{
+  return vbslq_u64 (m, x, y);
+}
+static inline v_u64_t
+v_cagt_f64 (v_f64_t x, v_f64_t y)
+{
+  return vcagtq_f64 (x, y);
+}
+static inline v_f64_t
+v_div_f64 (v_f64_t x, v_f64_t y)
+{
+  return vdivq_f64 (x, y);
+}
+static inline v_f64_t
+v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z)
+{
+  return vfmaq_f64 (z, x, y);
+}
+static inline v_f64_t
+v_min_f64(v_f64_t x, v_f64_t y) {
+  return vminq_f64(x, y);
+}
+static inline v_f64_t
+v_round_f64 (v_f64_t x)
+{
+  return vrndaq_f64 (x);
+}
+static inline v_f64_t
+v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y)
+{
+  return vbslq_f64 (p, x, y);
+}
+static inline v_f64_t
+v_sqrt_f64 (v_f64_t x)
+{
+  return vsqrtq_f64 (x);
+}
+static inline v_s64_t
+v_round_s64 (v_f64_t x)
+{
+  return vcvtaq_s64_f64 (x);
+}
+static inline v_u64_t
+v_trunc_u64 (v_f64_t x)
+{
+  return vcvtq_u64_f64 (x);
+}
+/* convert to type1 from type2.  */
+static inline v_f64_t
+v_to_f64_s64 (v_s64_t x)
+{
+  return (v_f64_t){x[0], x[1]};
+}
+static inline v_f64_t
+v_to_f64_u64 (v_u64_t x)
+{
+  return (v_f64_t){x[0], x[1]};
+}
+static inline v_s64_t
+v_to_s64_f64 (v_f64_t x)
+{
+  return vcvtq_s64_f64 (x);
+}
+/* reinterpret as type1 from type2.  */
+static inline v_u64_t
+v_as_u64_f64 (v_f64_t x)
+{
+  union { v_f64_t f; v_u64_t u; } r = {x};
+  return r.u;
+}
+static inline v_f64_t
+v_as_f64_u64 (v_u64_t x)
+{
+  union { v_u64_t u; v_f64_t f; } r = {x};
+  return r.f;
+}
+static inline v_s64_t
+v_as_s64_u64 (v_u64_t x)
+{
+  union {  v_u64_t u; v_s64_t i; } r = {x};
+  return r.i;
+}
+static inline v_u64_t
+v_as_u64_s64 (v_s64_t x)
+{
+  union { v_s64_t i; v_u64_t u; } r = {x};
+  return r.u;
+}
+static inline v_f64_t
+v_lookup_f64 (const f64_t *tab, v_u64_t idx)
+{
+  return (v_f64_t){tab[idx[0]], tab[idx[1]]};
+}
+static inline v_u64_t
+v_lookup_u64 (const u64_t *tab, v_u64_t idx)
+{
+  return (v_u64_t){tab[idx[0]], tab[idx[1]]};
+}
+static inline v_f64_t
+v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p)
+{
+  return (v_f64_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1]};
+}
+static inline v_f64_t
+v_call2_f64 (f64_t (*f) (f64_t, f64_t), v_f64_t x1, v_f64_t x2, v_f64_t y,
+	     v_u64_t p)
+{
+  return (v_f64_t){p[0] ? f (x1[0], x2[0]) : y[0],
+		   p[1] ? f (x1[1], x2[1]) : y[1]};
+}
+#endif
+
+#endif
+#endif