diff options
Diffstat (limited to 'fixedpoint/fixedpoint_neon.h')
-rw-r--r-- | fixedpoint/fixedpoint_neon.h | 156 |
1 file changed, 156 insertions, 0 deletions
diff --git a/fixedpoint/fixedpoint_neon.h b/fixedpoint/fixedpoint_neon.h index 8b23de2..92b349b 100644 --- a/fixedpoint/fixedpoint_neon.h +++ b/fixedpoint/fixedpoint_neon.h @@ -29,97 +29,194 @@ struct FixedPointRawTypeTraits<int32x4_t> { }; template <> +struct FixedPointRawTypeTraits<int16x8_t> { + typedef std::int16_t ScalarRawType; + static const int kLanes = 8; +}; + +template <> inline int32x4_t BitAnd(int32x4_t a, int32x4_t b) { return vandq_s32(a, b); } template <> +inline int16x8_t BitAnd(int16x8_t a, int16x8_t b) { + return vandq_s16(a, b); +} + +template <> inline int32x4_t BitOr(int32x4_t a, int32x4_t b) { return vorrq_s32(a, b); } template <> +inline int16x8_t BitOr(int16x8_t a, int16x8_t b) { + return vorrq_s16(a, b); +} + +template <> inline int32x4_t BitXor(int32x4_t a, int32x4_t b) { return veorq_s32(a, b); } template <> +inline int16x8_t BitXor(int16x8_t a, int16x8_t b) { + return veorq_s16(a, b); +} + +template <> inline int32x4_t BitNot(int32x4_t a) { return veorq_s32(a, vdupq_n_s32(-1)); } template <> +inline int16x8_t BitNot(int16x8_t a) { + return veorq_s16(a, vdupq_n_s16(-1)); +} + +template <> inline int32x4_t Add(int32x4_t a, int32x4_t b) { return vaddq_s32(a, b); } template <> +inline int16x8_t Add(int16x8_t a, int16x8_t b) { + return vaddq_s16(a, b); +} + +template <> inline int32x4_t Sub(int32x4_t a, int32x4_t b) { return vsubq_s32(a, b); } template <> +inline int16x8_t Sub(int16x8_t a, int16x8_t b) { + return vsubq_s16(a, b); +} + +template <> inline int32x4_t Neg(int32x4_t a) { return vnegq_s32(a); } template <> +inline int16x8_t Neg(int16x8_t a) { + return vnegq_s16(a); +} + +template <> inline int32x4_t ShiftLeft(int32x4_t a, int offset) { return vshlq_s32(a, vdupq_n_s32(offset)); } template <> +inline int16x8_t ShiftLeft(int16x8_t a, int offset) { + return vshlq_s16(a, vdupq_n_s16(offset)); +} + +template <> inline int32x4_t ShiftRight(int32x4_t a, int offset) { return vshlq_s32(a, vdupq_n_s32(-offset)); } template <> +inline 
int16x8_t ShiftRight(int16x8_t a, int offset) { + return vshlq_s16(a, vdupq_n_s16(-offset)); +} + +template <> inline int32x4_t SelectUsingMask(int32x4_t if_mask, int32x4_t then_val, int32x4_t else_val) { return vbslq_s32(vreinterpretq_u32_s32(if_mask), then_val, else_val); } template <> +inline int16x8_t SelectUsingMask(int16x8_t if_mask, int16x8_t then_val, + int16x8_t else_val) { + return vbslq_s16(vreinterpretq_u16_s16(if_mask), then_val, else_val); +} + +template <> inline int32x4_t MaskIfEqual(int32x4_t a, int32x4_t b) { return vreinterpretq_s32_u32(vceqq_s32(a, b)); } template <> +inline int16x8_t MaskIfEqual(int16x8_t a, int16x8_t b) { + return vreinterpretq_s16_u16(vceqq_s16(a, b)); +} + +template <> inline int32x4_t MaskIfNotEqual(int32x4_t a, int32x4_t b) { return BitNot(MaskIfEqual(a, b)); } template <> +inline int16x8_t MaskIfNotEqual(int16x8_t a, int16x8_t b) { + return BitNot(MaskIfEqual(a, b)); +} + +template <> inline int32x4_t MaskIfZero(int32x4_t a) { return MaskIfEqual(a, vdupq_n_s32(0)); } template <> +inline int16x8_t MaskIfZero(int16x8_t a) { + return MaskIfEqual(a, vdupq_n_s16(0)); +} + +template <> inline int32x4_t MaskIfNonZero(int32x4_t a) { return vreinterpretq_s32_u32(vtstq_s32(a, a)); } template <> +inline int16x8_t MaskIfNonZero(int16x8_t a) { + return vreinterpretq_s16_u16(vtstq_s16(a, a)); +} + +template <> inline int32x4_t MaskIfGreaterThan(int32x4_t a, int32x4_t b) { return vreinterpretq_s32_u32(vcgtq_s32(a, b)); } template <> +inline int16x8_t MaskIfGreaterThan(int16x8_t a, int16x8_t b) { + return vreinterpretq_s16_u16(vcgtq_s16(a, b)); +} + +template <> inline int32x4_t MaskIfGreaterThanOrEqual(int32x4_t a, int32x4_t b) { return vreinterpretq_s32_u32(vcgeq_s32(a, b)); } template <> +inline int16x8_t MaskIfGreaterThanOrEqual(int16x8_t a, int16x8_t b) { + return vreinterpretq_s16_u16(vcgeq_s16(a, b)); +} + +template <> inline int32x4_t MaskIfLessThan(int32x4_t a, int32x4_t b) { return vreinterpretq_s32_u32(vcltq_s32(a, b)); } 
template <> +inline int16x8_t MaskIfLessThan(int16x8_t a, int16x8_t b) { + return vreinterpretq_s16_u16(vcltq_s16(a, b)); +} + +template <> inline int32x4_t MaskIfLessThanOrEqual(int32x4_t a, int32x4_t b) { return vreinterpretq_s32_u32(vcleq_s32(a, b)); } template <> +inline int16x8_t MaskIfLessThanOrEqual(int16x8_t a, int16x8_t b) { + return vreinterpretq_s16_u16(vcleq_s16(a, b)); +} + +template <> inline bool All(int32x4_t a) { a = vandq_s32(a, vextq_s32(a, a, 1)); a = vandq_s32(a, vextq_s32(a, a, 2)); @@ -127,6 +224,14 @@ inline bool All(int32x4_t a) { } template <> +inline bool All(int16x8_t a) { + a = vandq_s16(a, vextq_s16(a, a, 1)); + a = vandq_s16(a, vextq_s16(a, a, 2)); + a = vandq_s16(a, vextq_s16(a, a, 4)); + return vgetq_lane_s16(a, 0); +} + +template <> inline bool Any(int32x4_t a) { a = vorrq_s32(a, vextq_s32(a, a, 1)); a = vorrq_s32(a, vextq_s32(a, a, 2)); @@ -134,16 +239,34 @@ inline bool Any(int32x4_t a) { } template <> +inline bool Any(int16x8_t a) { + a = vorrq_s16(a, vextq_s16(a, a, 1)); + a = vorrq_s16(a, vextq_s16(a, a, 2)); + a = vorrq_s16(a, vextq_s16(a, a, 4)); + return vgetq_lane_s16(a, 0); +} + +template <> inline int32x4_t RoundingHalfSum(int32x4_t a, int32x4_t b) { return vrhaddq_s32(a, b); } template <> +inline int16x8_t RoundingHalfSum(int16x8_t a, int16x8_t b) { + return vrhaddq_s16(a, b); +} + +template <> inline int32x4_t SaturatingRoundingDoublingHighMul(int32x4_t a, int32x4_t b) { return vqrdmulhq_s32(a, b); } template <> +inline int16x8_t SaturatingRoundingDoublingHighMul(int16x8_t a, int16x8_t b) { + return vqrdmulhq_s16(a, b); +} + +template <> inline int32x4_t RoundingDivideByPOT(int32x4_t x, int exponent) { const int32x4_t shift_vec = vdupq_n_s32(-exponent); const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift_vec), 31); @@ -151,6 +274,14 @@ inline int32x4_t RoundingDivideByPOT(int32x4_t x, int exponent) { return vrshlq_s32(fixed_up_x, shift_vec); } +template <> +inline int16x8_t RoundingDivideByPOT(int16x8_t x, int 
exponent) { + const int16x8_t shift_vec = vdupq_n_s16(-exponent); + const int16x8_t fixup = vshrq_n_s16(vandq_s16(x, shift_vec), 15); + const int16x8_t fixed_up_x = vqaddq_s16(x, fixup); + return vrshlq_s16(fixed_up_x, shift_vec); +} + template <int Exponent> struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int32x4_t, 1> { static int32x4_t eval(int32x4_t x) { return vqshlq_n_s32(x, Exponent); } @@ -165,11 +296,36 @@ struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int32x4_t, -1> { } }; +template <int Exponent> +struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int16x8_t, 1> { + static int16x8_t eval(int16x8_t x) { return vqshlq_n_s16(x, Exponent); } +}; + +template <int Exponent> +struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int16x8_t, -1> { + static int16x8_t eval(int16x8_t x) { + const int16x8_t fixup = vshrq_n_s16(x, 15); + const int16x8_t fixed_up_x = vqaddq_s16(x, fixup); + return vrshrq_n_s16(fixed_up_x, -Exponent); + } +}; + template <> inline int32x4_t Dup<int32x4_t>(std::int32_t x) { return vdupq_n_s32(x); } +template <> +inline int16x8_t Dup<int16x8_t>(std::int16_t x) { + return vdupq_n_s16(x); +} + +// So far this is only needed for int16. +template <> +inline int16x8_t SaturatingAdd(int16x8_t a, int16x8_t b) { + return vqaddq_s16(a, b); +} + } // end namespace gemmlowp #endif // GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_ |