aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Wan-Teh Chang <wtc@google.com> 2023-12-05 14:29:37 -0800
committer James Zern <jzern@google.com> 2023-12-08 00:34:23 +0000
commit 6fca4de48eaa110ada3ca849f445ca3754fb869a (patch)
tree 9e829fdf374f382b8b3b82b3d44d1535396cbaee
parent 0d5811e4effb62672ec60aed5bd21553af158791 (diff)
download libvpx-6fca4de48eaa110ada3ca849f445ca3754fb869a.tar.gz
Remove SSE code for 128x* blocks
The maximum block size is 64x64 in VP9.

Bug: webm:1819
Change-Id: If9802be9f81b51dbcdbc8a68d5afe48ca6d3d0e7
(cherry picked from commit c4c92080545970899488ab27944792a95c7131a2)
-rw-r--r-- vpx_dsp/arm/highbd_sse_neon.c 51
-rw-r--r-- vpx_dsp/arm/sse_neon.c 24
-rw-r--r-- vpx_dsp/arm/sse_neon_dotprod.c 26
-rw-r--r-- vpx_dsp/x86/sse_avx2.c 34
-rw-r--r-- vpx_dsp/x86/sse_sse4.c 47
5 files changed, 0 insertions, 182 deletions
diff --git a/vpx_dsp/arm/highbd_sse_neon.c b/vpx_dsp/arm/highbd_sse_neon.c
index 717ad6b19..ee76bed58 100644
--- a/vpx_dsp/arm/highbd_sse_neon.c
+++ b/vpx_dsp/arm/highbd_sse_neon.c
@@ -42,55 +42,6 @@ static INLINE void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref,
*sse_acc1 = vmlal_u16(*sse_acc1, abs_diff_hi, abs_diff_hi);
}
-static INLINE int64_t highbd_sse_128xh_neon(const uint16_t *src, int src_stride,
- const uint16_t *ref, int ref_stride,
- int height) {
- uint32x4_t sse[16];
- highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
- highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
- highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
- highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
- highbd_sse_8x1_init_neon(src + 4 * 8, ref + 4 * 8, &sse[8], &sse[9]);
- highbd_sse_8x1_init_neon(src + 5 * 8, ref + 5 * 8, &sse[10], &sse[11]);
- highbd_sse_8x1_init_neon(src + 6 * 8, ref + 6 * 8, &sse[12], &sse[13]);
- highbd_sse_8x1_init_neon(src + 7 * 8, ref + 7 * 8, &sse[14], &sse[15]);
- highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0], &sse[1]);
- highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[2], &sse[3]);
- highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[4], &sse[5]);
- highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[6], &sse[7]);
- highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[8], &sse[9]);
- highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[10], &sse[11]);
- highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[12], &sse[13]);
- highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[14], &sse[15]);
-
- src += src_stride;
- ref += ref_stride;
-
- while (--height != 0) {
- highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
- highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
- highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
- highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
- highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[8], &sse[9]);
- highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[10], &sse[11]);
- highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[12], &sse[13]);
- highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[14], &sse[15]);
- highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0], &sse[1]);
- highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[2], &sse[3]);
- highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[4], &sse[5]);
- highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[6], &sse[7]);
- highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[8], &sse[9]);
- highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[10], &sse[11]);
- highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[12], &sse[13]);
- highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[14], &sse[15]);
-
- src += src_stride;
- ref += ref_stride;
- }
-
- return horizontal_long_add_uint32x4_x16(sse);
-}
-
static INLINE int64_t highbd_sse_64xh_neon(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
int height) {
@@ -279,8 +230,6 @@ int64_t vpx_highbd_sse_neon(const uint8_t *src8, int src_stride,
return highbd_sse_32xh_neon(src, src_stride, ref, ref_stride, height);
case 64:
return highbd_sse_64xh_neon(src, src_stride, ref, ref_stride, height);
- case 128:
- return highbd_sse_128xh_neon(src, src_stride, ref, ref_stride, height);
default:
return highbd_sse_wxh_neon(src, src_stride, ref, ref_stride, width,
height);
diff --git a/vpx_dsp/arm/sse_neon.c b/vpx_dsp/arm/sse_neon.c
index 0b4a6e504..f686dc350 100644
--- a/vpx_dsp/arm/sse_neon.c
+++ b/vpx_dsp/arm/sse_neon.c
@@ -84,29 +84,6 @@ static INLINE uint32_t sse_wxh_neon(const uint8_t *src, int src_stride,
return horizontal_add_uint32x4(sse);
}
-static INLINE uint32_t sse_128xh_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- int height) {
- uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
-
- int i = height;
- do {
- sse_16x1_neon(src, ref, &sse[0]);
- sse_16x1_neon(src + 16, ref + 16, &sse[1]);
- sse_16x1_neon(src + 32, ref + 32, &sse[0]);
- sse_16x1_neon(src + 48, ref + 48, &sse[1]);
- sse_16x1_neon(src + 64, ref + 64, &sse[0]);
- sse_16x1_neon(src + 80, ref + 80, &sse[1]);
- sse_16x1_neon(src + 96, ref + 96, &sse[0]);
- sse_16x1_neon(src + 112, ref + 112, &sse[1]);
-
- src += src_stride;
- ref += ref_stride;
- } while (--i != 0);
-
- return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1]));
-}
-
static INLINE uint32_t sse_64xh_neon(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
int height) {
@@ -203,7 +180,6 @@ int64_t vpx_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref,
case 16: return sse_16xh_neon(src, src_stride, ref, ref_stride, height);
case 32: return sse_32xh_neon(src, src_stride, ref, ref_stride, height);
case 64: return sse_64xh_neon(src, src_stride, ref, ref_stride, height);
- case 128: return sse_128xh_neon(src, src_stride, ref, ref_stride, height);
default:
return sse_wxh_neon(src, src_stride, ref, ref_stride, width, height);
}
diff --git a/vpx_dsp/arm/sse_neon_dotprod.c b/vpx_dsp/arm/sse_neon_dotprod.c
index 0f11b7cbb..877777391 100644
--- a/vpx_dsp/arm/sse_neon_dotprod.c
+++ b/vpx_dsp/arm/sse_neon_dotprod.c
@@ -85,30 +85,6 @@ static INLINE uint32_t sse_wxh_neon_dotprod(const uint8_t *src, int src_stride,
return horizontal_add_uint32x4(vcombine_u32(sse[0], sse[1]));
}
-static INLINE uint32_t sse_128xh_neon_dotprod(const uint8_t *src,
- int src_stride,
- const uint8_t *ref,
- int ref_stride, int height) {
- uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
-
- int i = height;
- do {
- sse_16x1_neon_dotprod(src, ref, &sse[0]);
- sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]);
- sse_16x1_neon_dotprod(src + 32, ref + 32, &sse[0]);
- sse_16x1_neon_dotprod(src + 48, ref + 48, &sse[1]);
- sse_16x1_neon_dotprod(src + 64, ref + 64, &sse[0]);
- sse_16x1_neon_dotprod(src + 80, ref + 80, &sse[1]);
- sse_16x1_neon_dotprod(src + 96, ref + 96, &sse[0]);
- sse_16x1_neon_dotprod(src + 112, ref + 112, &sse[1]);
-
- src += src_stride;
- ref += ref_stride;
- } while (--i != 0);
-
- return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1]));
-}
-
static INLINE uint32_t sse_64xh_neon_dotprod(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
int height) {
@@ -214,8 +190,6 @@ int64_t vpx_sse_neon_dotprod(const uint8_t *src, int src_stride,
return sse_32xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
case 64:
return sse_64xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
- case 128:
- return sse_128xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
default:
return sse_wxh_neon_dotprod(src, src_stride, ref, ref_stride, width,
height);
diff --git a/vpx_dsp/x86/sse_avx2.c b/vpx_dsp/x86/sse_avx2.c
index 975446775..917ff0ef1 100644
--- a/vpx_dsp/x86/sse_avx2.c
+++ b/vpx_dsp/x86/sse_avx2.c
@@ -168,18 +168,6 @@ int64_t vpx_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b,
} while (y < height);
sse = summary_all_avx2(&sum);
break;
- case 128:
- do {
- sse_w32_avx2(&sum, a, b);
- sse_w32_avx2(&sum, a + 32, b + 32);
- sse_w32_avx2(&sum, a + 64, b + 64);
- sse_w32_avx2(&sum, a + 96, b + 96);
- a += a_stride;
- b += b_stride;
- y += 1;
- } while (y < height);
- sse = summary_all_avx2(&sum);
- break;
default:
if ((width & 0x07) == 0) {
do {
@@ -333,28 +321,6 @@ int64_t vpx_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8,
} while (y < height);
sse = summary_4x64_avx2(sum);
break;
- case 128:
- do {
- int l = 0;
- __m256i sum32 = _mm256_setzero_si256();
- do {
- highbd_sse_w16_avx2(&sum32, a, b);
- highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1);
- highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2);
- highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3);
- highbd_sse_w16_avx2(&sum32, a + 16 * 4, b + 16 * 4);
- highbd_sse_w16_avx2(&sum32, a + 16 * 5, b + 16 * 5);
- highbd_sse_w16_avx2(&sum32, a + 16 * 6, b + 16 * 6);
- highbd_sse_w16_avx2(&sum32, a + 16 * 7, b + 16 * 7);
- a += a_stride;
- b += b_stride;
- l += 1;
- } while (l < 16 && l < (height - y));
- summary_32_avx2(&sum32, &sum);
- y += 16;
- } while (y < height);
- sse = summary_4x64_avx2(sum);
- break;
default:
if (width & 0x7) {
do {
diff --git a/vpx_dsp/x86/sse_sse4.c b/vpx_dsp/x86/sse_sse4.c
index 1c2744e2f..4a7585c57 100644
--- a/vpx_dsp/x86/sse_sse4.c
+++ b/vpx_dsp/x86/sse_sse4.c
@@ -128,22 +128,6 @@ int64_t vpx_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b,
} while (y < height);
sse = summary_all_sse4(&sum);
break;
- case 128:
- do {
- sse_w16_sse4_1(&sum, a, b);
- sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1);
- sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2);
- sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3);
- sse_w16_sse4_1(&sum, a + 16 * 4, b + 16 * 4);
- sse_w16_sse4_1(&sum, a + 16 * 5, b + 16 * 5);
- sse_w16_sse4_1(&sum, a + 16 * 6, b + 16 * 6);
- sse_w16_sse4_1(&sum, a + 16 * 7, b + 16 * 7);
- a += a_stride;
- b += b_stride;
- y += 1;
- } while (y < height);
- sse = summary_all_sse4(&sum);
- break;
default:
if (width & 0x07) {
do {
@@ -285,37 +269,6 @@ int64_t vpx_highbd_sse_sse4_1(const uint8_t *a8, int a_stride,
_mm_storel_epi64((__m128i *)&sse,
_mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
break;
- case 128:
- do {
- int l = 0;
- __m128i sum32 = _mm_setzero_si128();
- do {
- highbd_sse_w8_sse4_1(&sum32, a, b);
- highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1);
- highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2);
- highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3);
- highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4);
- highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5);
- highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6);
- highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7);
- highbd_sse_w8_sse4_1(&sum32, a + 8 * 8, b + 8 * 8);
- highbd_sse_w8_sse4_1(&sum32, a + 8 * 9, b + 8 * 9);
- highbd_sse_w8_sse4_1(&sum32, a + 8 * 10, b + 8 * 10);
- highbd_sse_w8_sse4_1(&sum32, a + 8 * 11, b + 8 * 11);
- highbd_sse_w8_sse4_1(&sum32, a + 8 * 12, b + 8 * 12);
- highbd_sse_w8_sse4_1(&sum32, a + 8 * 13, b + 8 * 13);
- highbd_sse_w8_sse4_1(&sum32, a + 8 * 14, b + 8 * 14);
- highbd_sse_w8_sse4_1(&sum32, a + 8 * 15, b + 8 * 15);
- a += a_stride;
- b += b_stride;
- l += 1;
- } while (l < 8 && l < (height - y));
- summary_32_sse4(&sum32, &sum);
- y += 8;
- } while (y < height);
- _mm_storel_epi64((__m128i *)&sse,
- _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
- break;
default:
if (width & 0x7) {
do {