From 6fca4de48eaa110ada3ca849f445ca3754fb869a Mon Sep 17 00:00:00 2001 From: Wan-Teh Chang Date: Tue, 5 Dec 2023 14:29:37 -0800 Subject: Remove SSE code for 128x* blocks The maximum block size is 64x64 in VP9. Bug: webm:1819 Change-Id: If9802be9f81b51dbcdbc8a68d5afe48ca6d3d0e7 (cherry picked from commit c4c92080545970899488ab27944792a95c7131a2) --- vpx_dsp/arm/highbd_sse_neon.c | 51 ------------------------------------------ vpx_dsp/arm/sse_neon.c | 24 -------------------- vpx_dsp/arm/sse_neon_dotprod.c | 26 --------------------- vpx_dsp/x86/sse_avx2.c | 34 ---------------------------- vpx_dsp/x86/sse_sse4.c | 47 -------------------------------------- 5 files changed, 182 deletions(-) diff --git a/vpx_dsp/arm/highbd_sse_neon.c b/vpx_dsp/arm/highbd_sse_neon.c index 717ad6b19..ee76bed58 100644 --- a/vpx_dsp/arm/highbd_sse_neon.c +++ b/vpx_dsp/arm/highbd_sse_neon.c @@ -42,55 +42,6 @@ static INLINE void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref, *sse_acc1 = vmlal_u16(*sse_acc1, abs_diff_hi, abs_diff_hi); } -static INLINE int64_t highbd_sse_128xh_neon(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - int height) { - uint32x4_t sse[16]; - highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); - highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); - highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); - highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); - highbd_sse_8x1_init_neon(src + 4 * 8, ref + 4 * 8, &sse[8], &sse[9]); - highbd_sse_8x1_init_neon(src + 5 * 8, ref + 5 * 8, &sse[10], &sse[11]); - highbd_sse_8x1_init_neon(src + 6 * 8, ref + 6 * 8, &sse[12], &sse[13]); - highbd_sse_8x1_init_neon(src + 7 * 8, ref + 7 * 8, &sse[14], &sse[15]); - highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0], &sse[1]); - highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[2], &sse[3]); - highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[4], &sse[5]); - highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[6], &sse[7]); - highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[8], &sse[9]); - highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[10], &sse[11]); - highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[12], &sse[13]); - highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[14], &sse[15]); - - src += src_stride; - ref += ref_stride; - - while (--height != 0) { - highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); - highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); - highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); - highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); - highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[8], &sse[9]); - highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[10], &sse[11]); - highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[12], &sse[13]); - highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[14], &sse[15]); - highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0], &sse[1]); - highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[2], &sse[3]); - highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[4], &sse[5]); - highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[6], &sse[7]); - highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[8], &sse[9]); - highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[10], &sse[11]); - highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[12], &sse[13]); - highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[14], &sse[15]); - - src += src_stride; - ref += ref_stride; - } - - return horizontal_long_add_uint32x4_x16(sse); -} - static INLINE int64_t highbd_sse_64xh_neon(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int height) { @@ -279,8 +230,6 @@ int64_t vpx_highbd_sse_neon(const uint8_t *src8, int src_stride, return highbd_sse_32xh_neon(src, src_stride, ref, ref_stride, height); case 64: return highbd_sse_64xh_neon(src, src_stride, ref, ref_stride, height); - case 128: - return highbd_sse_128xh_neon(src, src_stride, ref, ref_stride, height); default: return highbd_sse_wxh_neon(src, src_stride, ref, ref_stride, width, height); diff --git a/vpx_dsp/arm/sse_neon.c b/vpx_dsp/arm/sse_neon.c index 0b4a6e504..f686dc350 100644 --- a/vpx_dsp/arm/sse_neon.c +++ b/vpx_dsp/arm/sse_neon.c @@ -84,29 +84,6 @@ static INLINE uint32_t sse_wxh_neon(const uint8_t *src, int src_stride, return horizontal_add_uint32x4(sse); } -static INLINE uint32_t sse_128xh_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - int height) { - uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; - - int i = height; - do { - sse_16x1_neon(src, ref, &sse[0]); - sse_16x1_neon(src + 16, ref + 16, &sse[1]); - sse_16x1_neon(src + 32, ref + 32, &sse[0]); - sse_16x1_neon(src + 48, ref + 48, &sse[1]); - sse_16x1_neon(src + 64, ref + 64, &sse[0]); - sse_16x1_neon(src + 80, ref + 80, &sse[1]); - sse_16x1_neon(src + 96, ref + 96, &sse[0]); - sse_16x1_neon(src + 112, ref + 112, &sse[1]); - - src += src_stride; - ref += ref_stride; - } while (--i != 0); - - return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); -} - static INLINE uint32_t sse_64xh_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int height) { @@ -203,7 +180,6 @@ int64_t vpx_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref, case 16: return sse_16xh_neon(src, src_stride, ref, ref_stride, height); case 32: return sse_32xh_neon(src, src_stride, ref, ref_stride, height); case 64: return sse_64xh_neon(src, src_stride, ref, ref_stride, height); - case 128: return sse_128xh_neon(src, src_stride, ref, ref_stride, height); default: return sse_wxh_neon(src, src_stride, ref, ref_stride, width, height); } diff --git a/vpx_dsp/arm/sse_neon_dotprod.c b/vpx_dsp/arm/sse_neon_dotprod.c index 0f11b7cbb..877777391 100644 --- a/vpx_dsp/arm/sse_neon_dotprod.c +++ b/vpx_dsp/arm/sse_neon_dotprod.c @@ -85,30 +85,6 @@ static INLINE uint32_t sse_wxh_neon_dotprod(const uint8_t *src, int src_stride, return horizontal_add_uint32x4(vcombine_u32(sse[0], sse[1])); } -static INLINE uint32_t sse_128xh_neon_dotprod(const uint8_t *src, - int src_stride, - const uint8_t *ref, - int ref_stride, int height) { - uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; - - int i = height; - do { - sse_16x1_neon_dotprod(src, ref, &sse[0]); - sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]); - sse_16x1_neon_dotprod(src + 32, ref + 32, &sse[0]); - sse_16x1_neon_dotprod(src + 48, ref + 48, &sse[1]); - sse_16x1_neon_dotprod(src + 64, ref + 64, &sse[0]); - sse_16x1_neon_dotprod(src + 80, ref + 80, &sse[1]); - sse_16x1_neon_dotprod(src + 96, ref + 96, &sse[0]); - sse_16x1_neon_dotprod(src + 112, ref + 112, &sse[1]); - - src += src_stride; - ref += ref_stride; - } while (--i != 0); - - return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); -} - static INLINE uint32_t sse_64xh_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int height) { @@ -214,8 +190,6 @@ int64_t vpx_sse_neon_dotprod(const uint8_t *src, int src_stride, return sse_32xh_neon_dotprod(src, src_stride, ref, ref_stride, height); case 64: return sse_64xh_neon_dotprod(src, src_stride, ref, ref_stride, height); - case 128: - return sse_128xh_neon_dotprod(src, src_stride, ref, ref_stride, height); default: return sse_wxh_neon_dotprod(src, src_stride, ref, ref_stride, width, height); diff --git a/vpx_dsp/x86/sse_avx2.c b/vpx_dsp/x86/sse_avx2.c index 975446775..917ff0ef1 100644 --- a/vpx_dsp/x86/sse_avx2.c +++ b/vpx_dsp/x86/sse_avx2.c @@ -168,18 +168,6 @@ int64_t vpx_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, } while (y < height); sse = summary_all_avx2(&sum); break; - case 128: - do { - sse_w32_avx2(&sum, a, b); - sse_w32_avx2(&sum, a + 32, b + 32); - sse_w32_avx2(&sum, a + 64, b + 64); - sse_w32_avx2(&sum, a + 96, b + 96); - a += a_stride; - b += b_stride; - y += 1; - } while (y < height); - sse = summary_all_avx2(&sum); - break; default: if ((width & 0x07) == 0) { do { @@ -333,28 +321,6 @@ int64_t vpx_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8, } while (y < height); sse = summary_4x64_avx2(sum); break; - case 128: - do { - int l = 0; - __m256i sum32 = _mm256_setzero_si256(); - do { - highbd_sse_w16_avx2(&sum32, a, b); - highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1); - highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2); - highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3); - highbd_sse_w16_avx2(&sum32, a + 16 * 4, b + 16 * 4); - highbd_sse_w16_avx2(&sum32, a + 16 * 5, b + 16 * 5); - highbd_sse_w16_avx2(&sum32, a + 16 * 6, b + 16 * 6); - highbd_sse_w16_avx2(&sum32, a + 16 * 7, b + 16 * 7); - a += a_stride; - b += b_stride; - l += 1; - } while (l < 16 && l < (height - y)); - summary_32_avx2(&sum32, &sum); - y += 16; - } while (y < height); - sse = summary_4x64_avx2(sum); - break; default: if (width & 0x7) { do { diff --git a/vpx_dsp/x86/sse_sse4.c b/vpx_dsp/x86/sse_sse4.c index 1c2744e2f..4a7585c57 100644 --- a/vpx_dsp/x86/sse_sse4.c +++ b/vpx_dsp/x86/sse_sse4.c @@ -128,22 +128,6 @@ int64_t vpx_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, } while (y < height); sse = summary_all_sse4(&sum); break; - case 128: - do { - sse_w16_sse4_1(&sum, a, b); - sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1); - sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2); - sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3); - sse_w16_sse4_1(&sum, a + 16 * 4, b + 16 * 4); - sse_w16_sse4_1(&sum, a + 16 * 5, b + 16 * 5); - sse_w16_sse4_1(&sum, a + 16 * 6, b + 16 * 6); - sse_w16_sse4_1(&sum, a + 16 * 7, b + 16 * 7); - a += a_stride; - b += b_stride; - y += 1; - } while (y < height); - sse = summary_all_sse4(&sum); - break; default: if (width & 0x07) { do { @@ -285,37 +269,6 @@ int64_t vpx_highbd_sse_sse4_1(const uint8_t *a8, int a_stride, _mm_storel_epi64((__m128i *)&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); break; - case 128: - do { - int l = 0; - __m128i sum32 = _mm_setzero_si128(); - do { - highbd_sse_w8_sse4_1(&sum32, a, b); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 8, b + 8 * 8); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 9, b + 8 * 9); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 10, b + 8 * 10); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 11, b + 8 * 11); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 12, b + 8 * 12); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 13, b + 8 * 13); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 14, b + 8 * 14); - highbd_sse_w8_sse4_1(&sum32, a + 8 * 15, b + 8 * 15); - a += a_stride; - b += b_stride; - l += 1; - } while (l < 8 && l < (height - y)); - summary_32_sse4(&sum32, &sum); - y += 8; - } while (y < height); - _mm_storel_epi64((__m128i *)&sse, - _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); - break; default: if (width & 0x7) { do { -- cgit v1.2.3