From 074e1a55d6e9f3ede5b482f2ecba51e48776ab49 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 22 Nov 2017 16:54:13 -0800 Subject: Add some __aarch64__ functions --- NEON_2_SSE.h | 113 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) diff --git a/NEON_2_SSE.h b/NEON_2_SSE.h index 1252bd1..61dcaad 100644 --- a/NEON_2_SSE.h +++ b/NEON_2_SSE.h @@ -62,6 +62,7 @@ #include <nmmintrin.h> //SSE4.2 #endif +#include <math.h> //*************** functions and data attributes, compiler dependent ********************************* //*********************************************************************************** @@ -150,6 +151,11 @@ typedef __m128 float32x4_t; typedef __m128 float16x4_t; //not supported by IA, for compartibility typedef __m128 float16x8_t; //not supported by IA, for compartibility +#ifdef _NEON2SSE_64BIT +typedef __m64_128 float64x1_t; +typedef __m128d float64x2_t; +#endif + typedef __m128i int8x16_t; typedef __m128i int16x8_t; typedef __m128i int32x4_t; @@ -173,6 +179,9 @@ typedef float float32_t; #if !defined(__clang__) typedef float __fp16; #endif +#ifdef _NEON2SSE_64BIT +typedef double float64_t; +#endif typedef uint8_t poly8_t; typedef uint16_t poly16_t; @@ -861,6 +870,9 @@ uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0 uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.U16 q0,q0,q0 uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0 float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0 +#ifdef _NEON2SSE_64BIT +float64x2_t vmaxq_f64(float64x2_t a, float64x2_t b); // VMAX.F64 q0,q0,q0 +#endif //vmin -> Vr[i] := (Va[i] >= Vb[i]) ? 
Vb[i] : Va[i] int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0 int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0 @@ -876,6 +888,9 @@ uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0 uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.U16 q0,q0,q0 uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0 float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0 +#ifdef _NEON2SSE_64BIT +float64x2_t vminq_f64(float64x2_t a, float64x2_t b); // VMIN.F64 q0,q0,q0 +#endif //Pairwise addition //Pairwise add int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0 @@ -1225,6 +1240,9 @@ float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0 float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0] poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0] poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0] +#ifdef _NEON2SSE_64BIT +float64x2_t vld1q_f64(__transfersize(2) float64_t const * ptr); // VLD1.64 {d0, d1}, [r0] +#endif //Load a single lane from memory uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0] uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] @@ -1755,6 +1773,7 @@ int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32 int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32 uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32 +int32x4_t vcvtnq_s32_f32(float32x4_t a); // VCVTN.S32.F32 q0, q0 //Convert to float float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0 float32x2_t 
vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0 @@ -2003,6 +2022,10 @@ int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0 int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0 int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0 float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0 +#ifdef _NEON2SSE_64BIT +int64x2_t vabsq_s64(int64x2_t a); // VABS.S64 q0,q0 +float64x2_t vabsq_f64(float64x2_t a); // VABS.F64 q0,q0 +#endif //Saturating absolute: Vd[i] = sat(|Va[i]|) int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0 int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0 @@ -2245,6 +2268,15 @@ uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0 float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0 poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0 poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0 +//Sqrt +float32x4_t vrndnq_f32(float32x4_t a); // VRND.F32 q0,q0 +#ifdef _NEON2SSE_64BIT +float64x2_t vrndnq_f64(float64x2_t a); // VRND.F64 q0,q0 +#endif +float32x4_t vsqrtq_f32(float32x4_t a); // VSQRT.F32 q0,q0 +#ifdef _NEON2SSE_64BIT +float64x2_t vsqrtq_f64(float64x2_t a); // VSQRT.F64 q0,q0 +#endif //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -6137,6 +6169,11 @@ uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0 float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0 #define vmaxq_f32 _mm_max_ps +#ifdef _NEON2SSE_64BIT +float64x2_t vmaxq_f64(float64x2_t a, float64x2_t b); // VMAX.F64 q0,q0,q0 +#define vmaxq_f64 _mm_max_pd +#endif + //*************** Minimum: vmin -> Vr[i] := (Va[i] >= Vb[i]) ? 
Vb[i] : Va[i] ******************************** //*********************************************************************************************************** int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0 @@ -6221,6 +6258,11 @@ uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0 float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0 #define vminq_f32 _mm_min_ps +#ifdef _NEON2SSE_64BIT +float64x2_t vminq_f64(float64x2_t a, float64x2_t b); // VMIN.F64 q0,q0,q0 +#define vminq_f64 _mm_min_pd +#endif + //************* Pairwise addition operations. ************************************** //************************************************************************************ //Pairwise add - adds adjacent pairs of elements of two vectors, and places the results in the destination vector @@ -9288,6 +9330,14 @@ poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0] poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0] #define vld1_p16 vld1_u16 +#ifdef _NEON2SSE_64BIT +float64x2_t vld1q_f64(__transfersize(2) float64_t const * ptr); // VLD1.64 {d0, d1}, [r0] +_NEON2SSE_INLINE float64x2_t vld1q_f64(__transfersize(2) float64_t const * ptr) +{ + return _mm_loadu_pd(ptr); +} +#endif + //*********************************************************************************************************** //******* Lane load functions - insert the data at vector's given position (lane) ************************* //*********************************************************************************************************** @@ -12725,6 +12775,13 @@ _NEON2SSE_INLINE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) in return vcvtq_u32_f32(_mm_mul_ps(a,cconst128)); } + +int32x4_t vcvtnq_s32_f32(float32x4_t a); // VCVTN.S32.F32 q0, q0 +_NEON2SSE_INLINE int32x4_t vcvtnq_s32_f32(float32x4_t a) +{ + return _mm_cvtps_epi32(a); +} + //***************** Convert to float 
************************* //************************************************************* float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0 @@ -14562,6 +14619,22 @@ _NEON2SSE_INLINE float32x4_t vabsq_f32(float32x4_t a) // VABS.F32 q0,q0 return _mm_and_ps (a, *(__m128*)c7fffffff); } +#ifdef _NEON2SSE_64BIT +int64x2_t vabsq_s64(int64x2_t a); // VABS.S64 q0,q0 +_NEON2SSE_INLINE int64x2_t vabsq_s64(int64x2_t a) // VABS.S64 q0,q0 +{ + __m128i sign = _mm_srai_epi32 (_mm_shuffle_epi32 (a, 0xf5), 31); + return _mm_sub_epi64 (_mm_xor_si128 (a, sign), sign); +} + +float64x2_t vabsq_f64(float64x2_t a); // VABS.F64 q0,q0 +_NEON2SSE_INLINE float64x2_t vabsq_f64(float64x2_t a) // VABS.F64 q0,q0 +{ + _NEON2SSE_ALIGN_16 int64_t mask[2] = {0x7fffffffffffffffLL, 0x7fffffffffffffffLL}; + return _mm_and_pd (a, *(__m128d*)mask); +} +#endif + //****** Saturating absolute: Vd[i] = sat(|Va[i]|) ********************* //********************************************************************** //For signed-integer data types, the absolute value of the most negative value is not representable by the data type, saturation takes place @@ -16589,4 +16662,44 @@ uint32x4_t vreinterpretq_u32_p16 (poly16x8_t t); uint32x4_t vreinterpretq_u32_p8 (poly8x16_t t); #define vreinterpretq_u32_p8 +//************* Round ****************** +float32x4_t vrndnq_f32(float32x4_t a); +_NEON2SSE_INLINE float32x4_t vrndnq_f32(float32x4_t a) +{ +#ifdef USE_SSE4 + return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +#else + return (__m128){nearbyintf(a[0]), nearbyintf(a[1]), + nearbyintf(a[2]), nearbyintf(a[3])}; +#endif +} + +#ifdef _NEON2SSE_64BIT +float64x2_t vrndnq_f64(float64x2_t a); +_NEON2SSE_INLINE float64x2_t vrndnq_f64(float64x2_t a) +{ +#ifdef USE_SSE4 + return _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +#else + return (__m128d){nearbyint(a[0]), nearbyint(a[1])}; +#endif +} +#endif + +//************* Sqrt ****************** + +float32x4_t 
vsqrtq_f32(float32x4_t a); +_NEON2SSE_INLINE float32x4_t vsqrtq_f32(float32x4_t a) +{ + return _mm_sqrt_ps(a); +} + +#ifdef _NEON2SSE_64BIT +float64x2_t vsqrtq_f64(float64x2_t a); +_NEON2SSE_INLINE float64x2_t vsqrtq_f64(float64x2_t a) +{ + return _mm_sqrt_pd(a); +} +#endif + #endif /* NEON2SSE_H */ -- cgit v1.2.3