diff options
Diffstat (limited to 'vpx_dsp/arm/sad_neon.c')
-rw-r--r-- | vpx_dsp/arm/sad_neon.c | 896 |
1 files changed, 318 insertions, 578 deletions
diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index ad575d4aa..4dd87ddc0 100644 --- a/vpx_dsp/arm/sad_neon.c +++ b/vpx_dsp/arm/sad_neon.c @@ -17,635 +17,375 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" -uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride) { - const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); -#if defined(__ARM_FEATURE_DOTPROD) - const uint8x16_t sad_u8 = vabdq_u8(src_u8, ref_u8); - const uint32x4_t dp = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1)); - return horizontal_add_uint32x4(dp); -#else - uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8)); - abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); - return horizontal_add_uint16x8(abs); -#endif -} +static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint32x4_t sum_u32; + + int i = h; + do { + uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3; + uint8x16_t diff0, diff1, diff2, diff3; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + diff0 = vabdq_u8(s0, r0); + sum[0] = vpadalq_u8(sum[0], diff0); + + s1 = vld1q_u8(src_ptr + 16); + r1 = vld1q_u8(ref_ptr + 16); + diff1 = vabdq_u8(s1, r1); + sum[1] = vpadalq_u8(sum[1], diff1); + + s2 = vld1q_u8(src_ptr + 32); + r2 = vld1q_u8(ref_ptr + 32); + diff2 = vabdq_u8(s2, r2); + sum[2] = vpadalq_u8(sum[2], diff2); + + s3 = vld1q_u8(src_ptr + 48); + r3 = vld1q_u8(ref_ptr + 48); + diff3 = vabdq_u8(s3, r3); + sum[3] = vpadalq_u8(sum[3], diff3); -uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred) { - const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); - const uint8x16_t second_pred_u8 = vld1q_u8(second_pred); - const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8); -#if defined(__ARM_FEATURE_DOTPROD) - const uint8x16_t sad_u8 = vabdq_u8(src_u8, avg); - const uint32x4_t prod = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1)); - return horizontal_add_uint32x4(prod); -#else - uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(avg)); - abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg)); - return horizontal_add_uint16x8(abs); -#endif -} + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); -uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride) { -#if defined(__ARM_FEATURE_DOTPROD) - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t ref1_u8 = load_unaligned_u8q(ref_ptr, ref_stride); - const uint8x16_t src2_u8 = - load_unaligned_u8q(src_ptr + 4 * src_stride, src_stride); - const uint8x16_t ref2_u8 = - load_unaligned_u8q(ref_ptr + 4 * ref_stride, ref_stride); - const uint8x16_t sad1_u8 = vabdq_u8(src1_u8, ref1_u8); - const uint8x16_t sad2_u8 = vabdq_u8(src2_u8, ref2_u8); - prod = vdotq_u32(prod, sad1_u8, ones); - prod = vdotq_u32(prod, sad2_u8, ones); - return horizontal_add_uint32x4(prod); -#else - int i; - uint16x8_t abs = vdupq_n_u16(0); - for (i = 0; i < 8; i += 4) { - const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); - src_ptr += 4 * src_stride; - ref_ptr += 4 * ref_stride; - abs = vabal_u8(abs, vget_low_u8(src_u8), vget_low_u8(ref_u8)); - abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); - } + sum_u32 = vpaddlq_u16(sum[0]); + sum_u32 = vpadalq_u16(sum_u32, sum[1]); + sum_u32 = vpadalq_u16(sum_u32, sum[2]); + sum_u32 = vpadalq_u16(sum_u32, sum[3]); - return horizontal_add_uint16x8(abs); -#endif + return horizontal_add_uint32x4(sum_u32); } -uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred) { -#if defined(__ARM_FEATURE_DOTPROD) - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t ref1_u8 = load_unaligned_u8q(ref_ptr, ref_stride); - const uint8x16_t src2_u8 = - load_unaligned_u8q(src_ptr + 4 * src_stride, src_stride); - const uint8x16_t ref2_u8 = - load_unaligned_u8q(ref_ptr + 4 * ref_stride, ref_stride); - const uint8x16_t second_pred1_u8 = vld1q_u8(second_pred); - const uint8x16_t second_pred2_u8 = vld1q_u8(second_pred + 16); - const uint8x16_t avg1 = vrhaddq_u8(ref1_u8, second_pred1_u8); - const uint8x16_t avg2 = vrhaddq_u8(ref2_u8, second_pred2_u8); - const uint8x16_t sad1_u8 = vabdq_u8(src1_u8, avg1); - const uint8x16_t sad2_u8 = vabdq_u8(src2_u8, avg2); - prod = vdotq_u32(prod, sad1_u8, ones); - prod = vdotq_u32(prod, sad2_u8, ones); - return horizontal_add_uint32x4(prod); -#else - int i; - uint16x8_t abs = vdupq_n_u16(0); - for (i = 0; i < 8; i += 4) { - const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); - const uint8x16_t second_pred_u8 = vld1q_u8(second_pred); - const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8); - src_ptr += 4 * src_stride; - ref_ptr += 4 * ref_stride; - second_pred += 16; - abs = vabal_u8(abs, vget_low_u8(src_u8), vget_low_u8(avg)); - abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg)); - } +static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint32x4_t sum = vdupq_n_u32(0); - return horizontal_add_uint16x8(abs); -#endif -} + int i = h; + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + uint8x16_t r0 = vld1q_u8(ref_ptr); + uint8x16_t diff0 = vabdq_u8(s0, r0); + uint16x8_t sum0 = vpaddlq_u8(diff0); -#if defined(__ARM_FEATURE_DOTPROD) -static INLINE uint32x2_t sad8x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint32x2_t prod = vdup_n_u32(0); - const uint8x8_t ones = vdup_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x8_t a_u8 = vld1_u8(src_ptr); - const uint8x8_t b_u8 = vld1_u8(ref_ptr); - const uint8x8_t sad_u8 = vabd_u8(a_u8, b_u8); - src_ptr += src_stride; - ref_ptr += ref_stride; - prod = vdot_u32(prod, sad_u8, ones); - } - return prod; -} + uint8x16_t s1 = vld1q_u8(src_ptr + 16); + uint8x16_t r1 = vld1q_u8(ref_ptr + 16); + uint8x16_t diff1 = vabdq_u8(s1, r1); + uint16x8_t sum1 = vpaddlq_u8(diff1); + + sum = vpadalq_u16(sum, sum0); + sum = vpadalq_u16(sum, sum1); -static INLINE uint32x2_t sad8x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint32x2_t prod = vdup_n_u32(0); - const uint8x8_t ones = vdup_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x8_t a_u8 = vld1_u8(src_ptr); - const uint8x8_t b_u8 = vld1_u8(ref_ptr); - const uint8x8_t c_u8 = vld1_u8(second_pred); - const uint8x8_t avg = vrhadd_u8(b_u8, c_u8); - const uint8x8_t sad_u8 = vabd_u8(a_u8, avg); src_ptr += src_stride; ref_ptr += ref_stride; - second_pred += 8; - prod = vdot_u32(prod, sad_u8, ones); - } - return prod; + } while (--i != 0); + + return horizontal_add_uint32x4(sum); } -#define SAD8XN(n) \ - uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint32x2_t prod = \ - sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint32x2(prod); \ - } \ - \ - uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint32x2_t prod = \ - sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint32x2(prod); \ - } +static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum = vdupq_n_u16(0); -#else // !defined(__ARM_FEATURE_DOTPROD) -static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint16x8_t abs = vdupq_n_u16(0); + int i = h; + do { + uint8x16_t s = vld1q_u8(src_ptr); + uint8x16_t r = vld1q_u8(ref_ptr); - for (i = 0; i < height; ++i) { - const uint8x8_t a_u8 = vld1_u8(src_ptr); - const uint8x8_t b_u8 = vld1_u8(ref_ptr); - src_ptr += src_stride; - ref_ptr += ref_stride; - abs = vabal_u8(abs, a_u8, b_u8); - } - return abs; -} + uint8x16_t diff = vabdq_u8(s, r); + sum = vpadalq_u8(sum, diff); -static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint16x8_t abs = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x8_t a_u8 = vld1_u8(src_ptr); - const uint8x8_t b_u8 = vld1_u8(ref_ptr); - const uint8x8_t c_u8 = vld1_u8(second_pred); - const uint8x8_t avg = vrhadd_u8(b_u8, c_u8); src_ptr += src_stride; ref_ptr += ref_stride; - second_pred += 8; - abs = vabal_u8(abs, a_u8, avg); - } - return abs; -} + } while (--i != 0); -#define SAD8XN(n) \ - uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint16x8_t abs = sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint16x8(abs); \ - } \ - \ - uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint16x8_t abs = \ - sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint16x8(abs); \ - } -#endif // defined(__ARM_FEATURE_DOTPROD) - -SAD8XN(4) -SAD8XN(8) -SAD8XN(16) - -#if defined(__ARM_FEATURE_DOTPROD) -static INLINE uint32x4_t sad16x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x16_t src_u8 = vld1q_u8(src_ptr); - const uint8x16_t ref_u8 = vld1q_u8(ref_ptr); - const uint8x16_t sad_u8 = vabdq_u8(src_u8, ref_u8); - src_ptr += src_stride; - ref_ptr += ref_stride; - prod = vdotq_u32(prod, sad_u8, ones); - } - return prod; + return horizontal_add_uint16x8(sum); } -static INLINE uint32x4_t sad16x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x16_t a_u8 = vld1q_u8(src_ptr); - const uint8x16_t b_u8 = vld1q_u8(ref_ptr); - const uint8x16_t c_u8 = vld1q_u8(second_pred); - const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8); - const uint8x16_t sad_u8 = vabdq_u8(a_u8, avg); - src_ptr += src_stride; - ref_ptr += ref_stride; - second_pred += 16; - prod = vdotq_u32(prod, sad_u8, ones); - } - return prod; -} +static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum = vdupq_n_u16(0); -#define SAD16XN(n) \ - uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint32x4_t prod = \ - sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint32x4(prod); \ - } \ - \ - uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint32x4_t prod = \ - sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint32x4(prod); \ - } -#else // !defined(__ARM_FEATURE_DOTPROD) -static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint16x8_t abs = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x16_t a_u8 = vld1q_u8(src_ptr); - const uint8x16_t b_u8 = vld1q_u8(ref_ptr); - src_ptr += src_stride; - ref_ptr += ref_stride; - abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(b_u8)); - abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(b_u8)); - } - return abs; -} + int i = h; + do { + uint8x8_t s = vld1_u8(src_ptr); + uint8x8_t r = vld1_u8(ref_ptr); + + sum = vabal_u8(sum, s, r); -static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint16x8_t abs = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x16_t a_u8 = vld1q_u8(src_ptr); - const uint8x16_t b_u8 = vld1q_u8(ref_ptr); - const uint8x16_t c_u8 = vld1q_u8(second_pred); - const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8); src_ptr += src_stride; ref_ptr += ref_stride; - second_pred += 16; - abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(avg)); - abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(avg)); - } - return abs; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); } -#define SAD16XN(n) \ - uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint16x8_t abs = \ - sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint16x8(abs); \ - } \ - \ - uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint16x8_t abs = \ - sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint16x8(abs); \ - } -#endif // defined(__ARM_FEATURE_DOTPROD) - -SAD16XN(8) -SAD16XN(16) -SAD16XN(32) - -#if defined(__ARM_FEATURE_DOTPROD) -static INLINE uint32x4_t sad32x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x16_t a_lo = vld1q_u8(src_ptr); - const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); - const uint8x16_t b_lo = vld1q_u8(ref_ptr); - const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); - const uint8x16_t sad_lo_u8 = vabdq_u8(a_lo, b_lo); - const uint8x16_t sad_hi_u8 = vabdq_u8(a_hi, b_hi); - src_ptr += src_stride; - ref_ptr += ref_stride; - prod = vdotq_u32(prod, sad_lo_u8, ones); - prod = vdotq_u32(prod, sad_hi_u8, ones); - } - return prod; +static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h / 2; + do { + uint8x8_t s = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride); + + sum = vabal_u8(sum, s, r); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); } -static INLINE uint32x4_t sad32x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x16_t a_lo = vld1q_u8(src_ptr); - const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); - const uint8x16_t b_lo = vld1q_u8(ref_ptr); - const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); - const uint8x16_t c_lo = vld1q_u8(second_pred); - const uint8x16_t c_hi = vld1q_u8(second_pred + 16); - const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo); - const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi); - const uint8x16_t sad_lo_u8 = vabdq_u8(a_lo, avg_lo); - const uint8x16_t sad_hi_u8 = vabdq_u8(a_hi, avg_hi); - src_ptr += src_stride; - ref_ptr += ref_stride; - second_pred += 32; - prod = vdotq_u32(prod, sad_lo_u8, ones); - prod = vdotq_u32(prod, sad_hi_u8, ones); +#define SAD_WXH_NEON(w, h) \ + unsigned int vpx_sad##w##x##h##_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return sad##w##xh_neon(src, src_stride, ref, ref_stride, (h)); \ } - return prod; -} -#define SAD32XN(n) \ - uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint32x4_t prod = \ - sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint32x4(prod); \ - } \ - \ - uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint32x4_t prod = \ - sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint32x4(prod); \ +SAD_WXH_NEON(4, 4) +SAD_WXH_NEON(4, 8) + +SAD_WXH_NEON(8, 4) +SAD_WXH_NEON(8, 8) +SAD_WXH_NEON(8, 16) + +SAD_WXH_NEON(16, 8) +SAD_WXH_NEON(16, 16) +SAD_WXH_NEON(16, 32) + +SAD_WXH_NEON(32, 16) +SAD_WXH_NEON(32, 32) +SAD_WXH_NEON(32, 64) + +SAD_WXH_NEON(64, 32) +SAD_WXH_NEON(64, 64) + +#undef SAD_WXH_NEON + +#define SAD_SKIP_WXH_NEON(w, h) \ + unsigned int vpx_sad_skip_##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * \ + sad##w##xh_neon(src, 2 * src_stride, ref, 2 * ref_stride, (h) / 2); \ } -#else // defined(__ARM_FEATURE_DOTPROD) -static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint16x8_t abs = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x16_t a_lo = vld1q_u8(src_ptr); - const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); - const uint8x16_t b_lo = vld1q_u8(ref_ptr); - const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); +SAD_SKIP_WXH_NEON(4, 4) +SAD_SKIP_WXH_NEON(4, 8) + +SAD_SKIP_WXH_NEON(8, 4) +SAD_SKIP_WXH_NEON(8, 8) +SAD_SKIP_WXH_NEON(8, 16) + +SAD_SKIP_WXH_NEON(16, 8) +SAD_SKIP_WXH_NEON(16, 16) +SAD_SKIP_WXH_NEON(16, 32) + +SAD_SKIP_WXH_NEON(32, 16) +SAD_SKIP_WXH_NEON(32, 32) +SAD_SKIP_WXH_NEON(32, 64) + +SAD_SKIP_WXH_NEON(64, 32) +SAD_SKIP_WXH_NEON(64, 64) + +#undef SAD_SKIP_WXH_NEON + +static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint32x4_t sum_u32; + + int i = h; + do { + uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3; + uint8x16_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + p0 = vld1q_u8(second_pred); + avg0 = vrhaddq_u8(r0, p0); + diff0 = vabdq_u8(s0, avg0); + sum[0] = vpadalq_u8(sum[0], diff0); + + s1 = vld1q_u8(src_ptr + 16); + r1 = vld1q_u8(ref_ptr + 16); + p1 = vld1q_u8(second_pred + 16); + avg1 = vrhaddq_u8(r1, p1); + diff1 = vabdq_u8(s1, avg1); + sum[1] = vpadalq_u8(sum[1], diff1); + + s2 = vld1q_u8(src_ptr + 32); + r2 = vld1q_u8(ref_ptr + 32); + p2 = vld1q_u8(second_pred + 32); + avg2 = vrhaddq_u8(r2, p2); + diff2 = vabdq_u8(s2, avg2); + sum[2] = vpadalq_u8(sum[2], diff2); + + s3 = vld1q_u8(src_ptr + 48); + r3 = vld1q_u8(ref_ptr + 48); + p3 = vld1q_u8(second_pred + 48); + avg3 = vrhaddq_u8(r3, p3); + diff3 = vabdq_u8(s3, avg3); + sum[3] = vpadalq_u8(sum[3], diff3); + src_ptr += src_stride; ref_ptr += ref_stride; - abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(b_lo)); - abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(b_lo)); - abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(b_hi)); - abs = vabal_u8(abs, vget_high_u8(a_hi), vget_high_u8(b_hi)); - } - return abs; + second_pred += 64; + } while (--i != 0); + + sum_u32 = vpaddlq_u16(sum[0]); + sum_u32 = vpadalq_u16(sum_u32, sum[1]); + sum_u32 = vpadalq_u16(sum_u32, sum[2]); + sum_u32 = vpadalq_u16(sum_u32, sum[3]); + + return horizontal_add_uint32x4(sum_u32); } -static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint16x8_t abs = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x16_t a_lo = vld1q_u8(src_ptr); - const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); - const uint8x16_t b_lo = vld1q_u8(ref_ptr); - const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); - const uint8x16_t c_lo = vld1q_u8(second_pred); - const uint8x16_t c_hi = vld1q_u8(second_pred + 16); - const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo); - const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi); +static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + uint8x16_t r0 = vld1q_u8(ref_ptr); + uint8x16_t p0 = vld1q_u8(second_pred); + uint8x16_t avg0 = vrhaddq_u8(r0, p0); + uint8x16_t diff0 = vabdq_u8(s0, avg0); + uint16x8_t sum0 = vpaddlq_u8(diff0); + + uint8x16_t s1 = vld1q_u8(src_ptr + 16); + uint8x16_t r1 = vld1q_u8(ref_ptr + 16); + uint8x16_t p1 = vld1q_u8(second_pred + 16); + uint8x16_t avg1 = vrhaddq_u8(r1, p1); + uint8x16_t diff1 = vabdq_u8(s1, avg1); + uint16x8_t sum1 = vpaddlq_u8(diff1); + + sum = vpadalq_u16(sum, sum0); + sum = vpadalq_u16(sum, sum1); + src_ptr += src_stride; ref_ptr += ref_stride; second_pred += 32; - abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(avg_lo)); - abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(avg_lo)); - abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(avg_hi)); - abs = vabal_u8(abs, vget_high_u8(a_hi), vget_high_u8(avg_hi)); - } - return abs; -} + } while (--i != 0); -#define SAD32XN(n) \ - uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint16x8_t abs = \ - sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint16x8(abs); \ - } \ - \ - uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint16x8_t abs = \ - sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint16x8(abs); \ - } -#endif // defined(__ARM_FEATURE_DOTPROD) - -SAD32XN(16) -SAD32XN(32) -SAD32XN(64) - -#if defined(__ARM_FEATURE_DOTPROD) -static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(src_ptr); - const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); - const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); - const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); - const uint8x16_t b_0 = vld1q_u8(ref_ptr); - const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); - const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); - const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); - const uint8x16_t sad_0_u8 = vabdq_u8(a_0, b_0); - const uint8x16_t sad_1_u8 = vabdq_u8(a_1, b_1); - const uint8x16_t sad_2_u8 = vabdq_u8(a_2, b_2); - const uint8x16_t sad_3_u8 = vabdq_u8(a_3, b_3); - src_ptr += src_stride; - ref_ptr += ref_stride; - prod = vdotq_u32(prod, sad_0_u8, ones); - prod = vdotq_u32(prod, sad_1_u8, ones); - prod = vdotq_u32(prod, sad_2_u8, ones); - prod = vdotq_u32(prod, sad_3_u8, ones); - } - return prod; + return horizontal_add_uint32x4(sum); } -static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(src_ptr); - const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); - const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); - const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); - const uint8x16_t b_0 = vld1q_u8(ref_ptr); - const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); - const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); - const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); - const uint8x16_t c_0 = vld1q_u8(second_pred); - const uint8x16_t c_1 = vld1q_u8(second_pred + 16); - const uint8x16_t c_2 = vld1q_u8(second_pred + 32); - const uint8x16_t c_3 = vld1q_u8(second_pred + 48); - const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0); - const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1); - const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2); - const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3); - const uint8x16_t sad_0_u8 = vabdq_u8(a_0, avg_0); - const uint8x16_t sad_1_u8 = vabdq_u8(a_1, avg_1); - const uint8x16_t sad_2_u8 = vabdq_u8(a_2, avg_2); - const uint8x16_t sad_3_u8 = vabdq_u8(a_3, avg_3); +static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h; + do { + uint8x16_t s = vld1q_u8(src_ptr); + uint8x16_t r = vld1q_u8(ref_ptr); + uint8x16_t p = vld1q_u8(second_pred); + + uint8x16_t avg = vrhaddq_u8(r, p); + uint8x16_t diff = vabdq_u8(s, avg); + sum = vpadalq_u8(sum, diff); + src_ptr += src_stride; ref_ptr += ref_stride; - second_pred += 64; - prod = vdotq_u32(prod, sad_0_u8, ones); - prod = vdotq_u32(prod, sad_1_u8, ones); - prod = vdotq_u32(prod, sad_2_u8, ones); - prod = vdotq_u32(prod, sad_3_u8, ones); - } - return prod; + second_pred += 16; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); } -#else // !defined(__ARM_FEATURE_DOTPROD) -static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint16x8_t abs_0 = vdupq_n_u16(0); - uint16x8_t abs_1 = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(src_ptr); - const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); - const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); - const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); - const uint8x16_t b_0 = vld1q_u8(ref_ptr); - const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); - const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); - const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); + +static INLINE unsigned int sad8xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h; + do { + uint8x8_t s = vld1_u8(src_ptr); + uint8x8_t r = vld1_u8(ref_ptr); + uint8x8_t p = vld1_u8(second_pred); + + uint8x8_t avg = vrhadd_u8(r, p); + sum = vabal_u8(sum, s, avg); + src_ptr += src_stride; ref_ptr += ref_stride; - abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(b_0)); - abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(b_0)); - abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(b_1)); - abs_0 = vabal_u8(abs_0, vget_high_u8(a_1), vget_high_u8(b_1)); - abs_1 = vabal_u8(abs_1, vget_low_u8(a_2), vget_low_u8(b_2)); - abs_1 = vabal_u8(abs_1, vget_high_u8(a_2), vget_high_u8(b_2)); - abs_1 = vabal_u8(abs_1, vget_low_u8(a_3), vget_low_u8(b_3)); - abs_1 = vabal_u8(abs_1, vget_high_u8(a_3), vget_high_u8(b_3)); - } + second_pred += 8; + } while (--i != 0); - { - const uint32x4_t sum = vpaddlq_u16(abs_0); - return vpadalq_u16(sum, abs_1); - } + return horizontal_add_uint16x8(sum); } -static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint16x8_t abs_0 = vdupq_n_u16(0); - uint16x8_t abs_1 = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(src_ptr); - const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); - const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); - const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); - const uint8x16_t b_0 = vld1q_u8(ref_ptr); - const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); - const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); - const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); - const uint8x16_t c_0 = vld1q_u8(second_pred); - const uint8x16_t c_1 = vld1q_u8(second_pred + 16); - const uint8x16_t c_2 = vld1q_u8(second_pred + 32); - const uint8x16_t c_3 = vld1q_u8(second_pred + 48); - const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0); - const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1); - const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2); - const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3); - src_ptr += src_stride; - ref_ptr += ref_stride; - second_pred += 64; - abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(avg_0)); - abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(avg_0)); - abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(avg_1)); - abs_0 = vabal_u8(abs_0, vget_high_u8(a_1), vget_high_u8(avg_1)); - abs_1 = vabal_u8(abs_1, vget_low_u8(a_2), vget_low_u8(avg_2)); - abs_1 = vabal_u8(abs_1, vget_high_u8(a_2), vget_high_u8(avg_2)); - abs_1 = vabal_u8(abs_1, vget_low_u8(a_3), vget_low_u8(avg_3)); - abs_1 = vabal_u8(abs_1, vget_high_u8(a_3), vget_high_u8(avg_3)); - } +static INLINE unsigned int sad4xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum = vdupq_n_u16(0); - { - const uint32x4_t sum = vpaddlq_u16(abs_0); - return vpadalq_u16(sum, abs_1); - } + int i = h / 2; + do { + uint8x8_t s = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride); + uint8x8_t p = vld1_u8(second_pred); + + uint8x8_t avg = vrhadd_u8(r, p); + sum = vabal_u8(sum, s, avg); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + second_pred += 8; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); } -#endif // defined(__ARM_FEATURE_DOTPROD) - -#define SAD64XN(n) \ - uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint32x4_t abs = \ - sad64x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint32x4(abs); \ - } \ - \ - uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint32x4_t abs = \ - sad64x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint32x4(abs); \ + +#define SAD_WXH_AVG_NEON(w, h) \ + uint32_t vpx_sad##w##x##h##_avg_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \ + second_pred); \ } -SAD64XN(32) -SAD64XN(64) +SAD_WXH_AVG_NEON(4, 4) +SAD_WXH_AVG_NEON(4, 8) + +SAD_WXH_AVG_NEON(8, 4) +SAD_WXH_AVG_NEON(8, 8) +SAD_WXH_AVG_NEON(8, 16) + +SAD_WXH_AVG_NEON(16, 8) +SAD_WXH_AVG_NEON(16, 16) +SAD_WXH_AVG_NEON(16, 32) + +SAD_WXH_AVG_NEON(32, 16) +SAD_WXH_AVG_NEON(32, 32) +SAD_WXH_AVG_NEON(32, 64) + +SAD_WXH_AVG_NEON(64, 32) +SAD_WXH_AVG_NEON(64, 64) + +#undef SAD_WXH_AVG_NEON |