author     ZVictoria <victoria.zhislina@intel.com>  2018-06-20 20:26:40 +0300
committer  ZVictoria <victoria.zhislina@intel.com>  2018-06-20 20:26:40 +0300
commit     a332e45af06be51d2c34059d80e792a6db4643b5 (patch)
tree       e892c5910678f8f3f6026869e25c99805ad16e14
parent     e19d71749ab5060b31d8187107a000450aa9b205 (diff)
download   neon_2_sse-a332e45af06be51d2c34059d80e792a6db4643b5.tar.gz
tiny performance improvement for vcge(q)_u16 (SSSE3), vcgt(q)_u16, vcgt(q)_u8
-rw-r--r--   NEON_2_SSE.h   49
1 file changed, 20 insertions(+), 29 deletions(-)
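
The hunks below drop the usual sign-bias trick for unsigned SSE comparisons (offset both operands by 0x80/0x8000 so a signed compare orders them correctly) in favour of a single saturating subtraction: _mm_subs_epu16(b, a) is zero exactly when a >= b, and _mm_subs_epu8(a, b) is nonzero exactly when a > b. The standalone sketch below is illustrative only (the function name and test values are not part of the patch); it turns the saturated difference into a lane mask with an equality test against zero (inverted for the strict greater-than case), which is valid for the full unsigned range, whereas a signed greater-than compare on the difference would misread lanes whose difference has the top bit set.

/* Sketch only: the saturating-subtraction comparison trick used by this patch.
   Function and variable names are illustrative, not from NEON_2_SSE.h. */
#include <emmintrin.h> /* SSE2 */
#include <stdint.h>
#include <stdio.h>

static __m128i vcge_u16_sketch(__m128i a, __m128i b)
{
    __m128i zero = _mm_setzero_si128();
    __m128i diff = _mm_subs_epu16(b, a);  /* saturates to 0 exactly when a >= b */
    return _mm_cmpeq_epi16(diff, zero);   /* 0xffff where a >= b, 0 elsewhere   */
}

int main(void)
{
    uint16_t av[8] = { 0, 1, 2, 40000, 65535, 7, 8, 9 };
    uint16_t bv[8] = { 0, 2, 1,     1, 40000, 7, 9, 8 };
    uint16_t rv[8];
    __m128i a = _mm_loadu_si128((const __m128i *)av);
    __m128i b = _mm_loadu_si128((const __m128i *)bv);
    _mm_storeu_si128((__m128i *)rv, vcge_u16_sketch(a, b));
    for (int i = 0; i < 8; ++i)
        printf("%u >= %u : %s\n", (unsigned)av[i], (unsigned)bv[i], rv[i] ? "yes" : "no");
    return 0;
}
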
diff --git a/NEON_2_SSE.h b/NEON_2_SSE.h
index 26ea519..7908508 100644
--- a/NEON_2_SSE.h
+++ b/NEON_2_SSE.h
@@ -5221,13 +5221,12 @@ _NEON2SSE_INLINE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0
cmp = _mm_max_epu16(a, b);
return _mm_cmpeq_epi16(cmp, a); //a>=b
#else
- __m128i c8000, as, bs, m1, m2;
- c8000 = _mm_set1_epi16 ((int16_t)0x8000);
- as = _mm_sub_epi16(a,c8000);
- bs = _mm_sub_epi16(b,c8000);
- m1 = _mm_cmpgt_epi16(as, bs);
- m2 = _mm_cmpeq_epi16 (as, bs);
- return _mm_or_si128 ( m1, m2);
+ __m128i as, mask;
+ __m128i zero = _mm_setzero_si128();
+ //a >= b if and only if the saturating difference b - a is zero
+ as = _mm_subs_epu16(b,a);
+ mask = _mm_cmpeq_epi16(as, zero); //0xffff where a >= b, valid for the full u16 range
+ return mask;
#endif
}
@@ -5457,22 +5456,20 @@ uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
_NEON2SSE_INLINE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b) // VCGT.U8 q0, q0, q0
{
//no unsigned chars comparison, only signed available,so need the trick
- __m128i c128, as, bs;
- c128 = _mm_set1_epi8 ((int8_t)128);
- as = _mm_sub_epi8(a,c128);
- bs = _mm_sub_epi8(b,c128);
- return _mm_cmpgt_epi8 (as, bs);
+ __m128i as, mask;
+ as = _mm_subs_epu8(a, b); //zero if and only if a <= b
+ mask = _mm_cmpeq_epi8(as, _mm_setzero_si128()); //0xff where a <= b
+ return _mm_xor_si128(mask, _mm_cmpeq_epi8(a, a)); //invert: 0xff where a > b
}
uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
_NEON2SSE_INLINE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b) // VCGT.s16 q0, q0, q0
{
//no unsigned short comparison, only signed available,so need the trick
- __m128i c8000, as, bs;
- c8000 = _mm_set1_epi16 ((int16_t)0x8000);
- as = _mm_sub_epi16(a,c8000);
- bs = _mm_sub_epi16(b,c8000);
- return _mm_cmpgt_epi16 ( as, bs);
+ __m128i as, mask;
+ as = _mm_subs_epu16(a, b); //zero if and only if a <= b
+ mask = _mm_cmpeq_epi16(as, _mm_setzero_si128()); //0xffff where a <= b
+ return _mm_xor_si128(mask, _mm_cmpeq_epi16(a, a)); //invert: 0xffff where a > b
}
uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
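
For reference, a scalar model of the strict unsigned greater-than used in the two hunks above; this is a sketch with illustrative names, assuming the per-lane semantics of _mm_subs_epu16:

/* Scalar model (illustrative): the saturating difference a - b is nonzero
   exactly when a > b, so the vector code only has to turn "nonzero" into an
   all-ones lane, e.g. by comparing against zero and inverting the result. */
#include <stdint.h>

static inline uint16_t vcgt_u16_lane_model(uint16_t a, uint16_t b)
{
    uint16_t diff = (a > b) ? (uint16_t)(a - b) : 0; /* _mm_subs_epu16(a, b) */
    return (uint16_t)(diff != 0 ? 0xFFFF : 0x0000);  /* one lane of the result mask */
}
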
@@ -5826,24 +5823,18 @@ _NEON2SSE_INLINE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b) // VABD.S32 q0,q0
uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
_NEON2SSE_INLINE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b) //no abs for unsigned
{
- __m128i cmp, difab, difba;
- cmp = vcgtq_u8(a,b);
- difab = _mm_sub_epi8(a,b);
- difba = _mm_sub_epi8 (b,a);
- difab = _mm_and_si128(cmp, difab);
- difba = _mm_andnot_si128(cmp, difba);
+ __m128i difab, difba;
+ difab = _mm_subs_epu8(a,b);
+ difba = _mm_subs_epu8 (b,a);
return _mm_or_si128(difab, difba);
}
uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.s16 q0,q0,q0
_NEON2SSE_INLINE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b)
{
- __m128i cmp, difab, difba;
- cmp = vcgtq_u16(a,b);
- difab = _mm_sub_epi16(a,b);
- difba = _mm_sub_epi16 (b,a);
- difab = _mm_and_si128(cmp, difab);
- difba = _mm_andnot_si128(cmp, difba);
+ __m128i difab, difba;
+ difab = _mm_subs_epu16(a,b);
+ difba = _mm_subs_epu16 (b,a);
return _mm_or_si128(difab, difba);
}
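
The two vabdq hunks rely on the fact that at most one of the saturating differences a - b and b - a is nonzero, and the nonzero one (if any) equals |a - b|, so ORing them yields the absolute difference without the compare-and-blend of the old code. A minimal sketch, with names of my own choosing:

/* Sketch only: unsigned absolute difference via two saturating subtractions. */
#include <emmintrin.h> /* SSE2 */

static inline __m128i vabd_u8_sketch(__m128i a, __m128i b)
{
    __m128i d_ab = _mm_subs_epu8(a, b); /* max(a - b, 0), per byte */
    __m128i d_ba = _mm_subs_epu8(b, a); /* max(b - a, 0), per byte */
    return _mm_or_si128(d_ab, d_ba);    /* one operand is zero, so OR = |a - b| */
}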