diff options
author | Salome Thirot <salome.thirot@arm.com> | 2023-01-19 18:02:52 +0000 |
---|---|---|
committer | Salome Thirot <salome.thirot@arm.com> | 2023-01-23 12:03:20 +0000 |
commit | fcfb471ce2a413e760bdff805c5ae66778cb4169 (patch) | |
tree | c106a7169e167ba53e80781f2369bad913305ecb | |
parent | b7c22b3a9584d7d9c0a7b9b37a52bc595113b398 (diff) | |
download | libvpx-fcfb471ce2a413e760bdff805c5ae66778cb4169.tar.gz |
Refactor Neon subpel variance functions
Refactor the Neon implementation of the sub-pixel variance bilinear
filter helper functions - effectively backporting this libaom patch[1].
[1] https://aomedia-review.googlesource.com/c/aom/+/162462
Change-Id: I3dee32e8125250bbeffeb63d1fef5da559bacbf1
-rw-r--r-- | vpx_dsp/arm/subpel_variance_neon.c | 271 |
1 files changed, 135 insertions, 136 deletions
diff --git a/vpx_dsp/arm/subpel_variance_neon.c b/vpx_dsp/arm/subpel_variance_neon.c index a3befdc34..3fb0acd54 100644 --- a/vpx_dsp/arm/subpel_variance_neon.c +++ b/vpx_dsp/arm/subpel_variance_neon.c @@ -17,156 +17,155 @@ #include "vpx_dsp/variance.h" #include "vpx_dsp/arm/mem_neon.h" -static const uint8_t bilinear_filters[8][2] = { - { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, - { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, -}; - // Process a block exactly 4 wide and a multiple of 2 high. -static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - const uint8_t *filter) { - const uint8x8_t f0 = vdup_n_u8(filter[0]); - const uint8x8_t f1 = vdup_n_u8(filter[1]); - unsigned int i; - for (i = 0; i < output_height; i += 2) { - const uint8x8_t src_0 = load_unaligned_u8(src_ptr, src_pixels_per_line); - const uint8x8_t src_1 = - load_unaligned_u8(src_ptr + pixel_step, src_pixels_per_line); - const uint16x8_t a = vmull_u8(src_0, f0); - const uint16x8_t b = vmlal_u8(a, src_1, f1); - const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); - vst1_u8(output_ptr, out); - src_ptr += 2 * src_pixels_per_line; - output_ptr += 8; - } +static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride); + uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + vst1_u8(dst_ptr, blend_u8); + + src_ptr += 2 * src_stride; + dst_ptr += 2 * 4; + i -= 2; + } while (i != 0); } // Process a block exactly 8 wide and any height. -static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - const uint8_t *filter) { - const uint8x8_t f0 = vdup_n_u8(filter[0]); - const uint8x8_t f1 = vdup_n_u8(filter[1]); - unsigned int i; - for (i = 0; i < output_height; ++i) { - const uint8x8_t src_0 = vld1_u8(&src_ptr[0]); - const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]); - const uint16x8_t a = vmull_u8(src_0, f0); - const uint16x8_t b = vmlal_u8(a, src_1, f1); - const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); - vst1_u8(output_ptr, out); - src_ptr += src_pixels_per_line; - output_ptr += 8; - } +static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + uint8x8_t s0 = vld1_u8(src_ptr); + uint8x8_t s1 = vld1_u8(src_ptr + pixel_step); + uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + vst1_u8(dst_ptr, blend_u8); + + src_ptr += src_stride; + dst_ptr += 8; + } while (--i != 0); } // Process a block which is a mutiple of 16 wide and any height. -static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { - const uint8x8_t f0 = vdup_n_u8(filter[0]); - const uint8x8_t f1 = vdup_n_u8(filter[1]); - unsigned int i, j; - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 16) { - const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]); - const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]); - const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0); - const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1); - const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS); - const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0); - const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1); - const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS); - vst1q_u8(output_ptr + j, vcombine_u8(out_lo, out_hi)); - } - src_ptr += src_pixels_per_line; - output_ptr += output_width; - } +static void var_filter_block2d_bil_large(const uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int pixel_step, int dst_width, + int dst_height, int filter_offset) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint16x8_t blend_l = + vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1); + uint16x8_t blend_h = + vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1); + uint8x8_t out_lo = vrshrn_n_u16(blend_l, 3); + uint8x8_t out_hi = vrshrn_n_u16(blend_h, 3); + vst1q_u8(dst_ptr + j, vcombine_u8(out_lo, out_hi)); + + j += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); } -// 4xM filter writes an extra row to fdata because it processes two rows at a -// time. -#define SUB_PIXEL_VARIANCENXM(n, m) \ - uint32_t vpx_sub_pixel_variance##n##x##m##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ - uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \ - uint8_t temp1[n * m]; \ - \ - if (n == 4) { \ - var_filter_block2d_bil_w4(src_ptr, temp0, src_stride, 1, (m + 2), \ - bilinear_filters[x_offset]); \ - var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \ - bilinear_filters[y_offset]); \ - } else if (n == 8) { \ - var_filter_block2d_bil_w8(src_ptr, temp0, src_stride, 1, (m + 1), \ - bilinear_filters[x_offset]); \ - var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \ - bilinear_filters[y_offset]); \ - } else { \ - var_filter_block2d_bil_w16(src_ptr, temp0, src_stride, 1, (m + 1), n, \ - bilinear_filters[x_offset]); \ - var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \ - bilinear_filters[y_offset]); \ - } \ - return vpx_variance##n##x##m(temp1, n, ref_ptr, ref_stride, sse); \ +static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16, + dst_height, filter_offset); +} +static void var_filter_block2d_bil_w32(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32, + dst_height, filter_offset); +} +static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64, + dst_height, filter_offset); +} + +#define SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse) { \ + uint8_t tmp0[w * (h + padding)]; \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } -SUB_PIXEL_VARIANCENXM(4, 4) -SUB_PIXEL_VARIANCENXM(4, 8) -SUB_PIXEL_VARIANCENXM(8, 4) -SUB_PIXEL_VARIANCENXM(8, 8) -SUB_PIXEL_VARIANCENXM(8, 16) -SUB_PIXEL_VARIANCENXM(16, 8) -SUB_PIXEL_VARIANCENXM(16, 16) -SUB_PIXEL_VARIANCENXM(16, 32) -SUB_PIXEL_VARIANCENXM(32, 16) -SUB_PIXEL_VARIANCENXM(32, 32) -SUB_PIXEL_VARIANCENXM(32, 64) -SUB_PIXEL_VARIANCENXM(64, 32) -SUB_PIXEL_VARIANCENXM(64, 64) +// 4x<h> blocks are processed two rows at a time, so require an extra row of +// padding. +SUBPEL_VARIANCE_WXH_NEON(4, 4, 2) +SUBPEL_VARIANCE_WXH_NEON(4, 8, 2) + +SUBPEL_VARIANCE_WXH_NEON(8, 4, 1) +SUBPEL_VARIANCE_WXH_NEON(8, 8, 1) +SUBPEL_VARIANCE_WXH_NEON(8, 16, 1) + +SUBPEL_VARIANCE_WXH_NEON(16, 8, 1) +SUBPEL_VARIANCE_WXH_NEON(16, 16, 1) +SUBPEL_VARIANCE_WXH_NEON(16, 32, 1) + +SUBPEL_VARIANCE_WXH_NEON(32, 16, 1) +SUBPEL_VARIANCE_WXH_NEON(32, 32, 1) +SUBPEL_VARIANCE_WXH_NEON(32, 64, 1) + +SUBPEL_VARIANCE_WXH_NEON(64, 32, 1) +SUBPEL_VARIANCE_WXH_NEON(64, 64, 1) // 4xM filter writes an extra row to fdata because it processes two rows at a // time. -#define SUB_PIXEL_AVG_VARIANCENXM(n, m) \ - uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \ - uint8_t temp1[n * m]; \ - \ - if (n == 4) { \ - var_filter_block2d_bil_w4(src_ptr, temp0, src_stride, 1, (m + 2), \ - bilinear_filters[x_offset]); \ - var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \ - bilinear_filters[y_offset]); \ - } else if (n == 8) { \ - var_filter_block2d_bil_w8(src_ptr, temp0, src_stride, 1, (m + 1), \ - bilinear_filters[x_offset]); \ - var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \ - bilinear_filters[y_offset]); \ - } else { \ - var_filter_block2d_bil_w16(src_ptr, temp0, src_stride, 1, (m + 1), n, \ - bilinear_filters[x_offset]); \ - var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \ - bilinear_filters[y_offset]); \ - } \ - \ - vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n); \ - \ - return vpx_variance##n##x##m(temp0, n, ref_ptr, ref_stride, sse); \ +#define SUB_PIXEL_AVG_VARIANCENXM(n, m) \ + uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \ + uint8_t temp1[n * m]; \ + \ + if (n == 4) { \ + var_filter_block2d_bil_w4(src_ptr, temp0, src_stride, 1, (m + 2), \ + x_offset); \ + var_filter_block2d_bil_w4(temp0, temp1, n, n, m, y_offset); \ + } else if (n == 8) { \ + var_filter_block2d_bil_w8(src_ptr, temp0, src_stride, 1, (m + 1), \ + x_offset); \ + var_filter_block2d_bil_w8(temp0, temp1, n, n, m, y_offset); \ + } else { \ + var_filter_block2d_bil_large(src_ptr, temp0, src_stride, 1, n, (m + 1), \ + x_offset); \ + var_filter_block2d_bil_large(temp0, temp1, n, n, n, m, y_offset); \ + } \ + \ + vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n); \ + \ + return vpx_variance##n##x##m(temp0, n, ref_ptr, ref_stride, sse); \ } SUB_PIXEL_AVG_VARIANCENXM(4, 4) |