Specialize Neon subpel variance by filter value for large blocks

The optimal implementation of the bilinear interpolation depends on the filter values being used. For both horizontal and vertical interpolation this can simplify to just taking the source values, or averaging two neighbouring source values - which can be computed more easily than a bilinear interpolation with arbitrary filter values.

This patch introduces tests to find the optimal bilinear interpolation implementation based on the filter values being used. This new specialization is only used for larger block sizes (width >= 16) as we need to be doing enough work to make the cost of finding the optimal implementation worth it.

This is a backport of this libaom change[1].

After this change, the only differences between the code in libvpx and libaom are due to libvpx being compiled with ISO C90, which forbids mixing declarations and code [-Wdeclaration-after-statement].

[1] https://aomedia-review.googlesource.com/c/aom/+/162463

Change-Id: Ia818e148f6fd126656e8411d59c184b55dd43094
author: Salome Thirot <salome.thirot@arm.com> 2023-01-20 10:35:34 +0000
committer: Salome Thirot <salome.thirot@arm.com> 2023-01-23 13:11:59 +0000
commit: ae5b60cb4730639fc7742df577600ce71ddb5936 (patch)
tree: 7ee84c38b9f55440322effee7d1311c7b8f6cd85
parent: fcfb471ce2a413e760bdff805c5ae66778cb4169 (diff)
download: libvpx-ae5b60cb4730639fc7742df577600ce71ddb5936.tar.gz
1 files changed, 87 insertions, 8 deletions
diff --git a/vpx_dsp/arm/subpel_variance_neon.c b/vpx_dsp/arm/subpel_variance_neon.c
index 3fb0acd54..60650b703 100644
--- a/vpx_dsp/arm/subpel_variance_neon.c
+++ b/vpx_dsp/arm/subpel_variance_neon.c
@@ -107,6 +107,30 @@ static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                dst_height, filter_offset);
 }
 
+static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr,
+                                   int src_stride, int pixel_step,
+                                   int dst_width, int dst_height) {
+  int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16 wide).
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0 = vld1q_u8(src_ptr + j);
+      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+      uint8x16_t avg = vrhaddq_u8(s0, s1);
+      vst1q_u8(dst_ptr + j, avg);
+
+      j += 16;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
 #define SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                          \
   unsigned int vpx_sub_pixel_variance##w##x##h##_neon(                   \
       const uint8_t *src, int src_stride, int xoffset, int yoffset,      \
@@ -119,6 +143,61 @@ static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr,
     return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
   }
 
+#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                   \
+  unsigned int vpx_sub_pixel_variance##w##x##h##_neon(                        \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
+      const uint8_t *ref, int ref_stride, unsigned int *sse) {                \
+    if (xoffset == 0) {                                                       \
+      if (yoffset == 0) {                                                     \
+        return vpx_variance##w##x##h##_neon(src, src_stride, ref, ref_stride, \
+                                            sse);                             \
+      } else if (yoffset == 4) {                                              \
+        uint8_t tmp[w * h];                                                   \
+        var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h);       \
+        return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse);    \
+      } else {                                                                \
+        uint8_t tmp[w * h];                                                   \
+        var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h,      \
+                                    yoffset);                                 \
+        return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse);    \
+      }                                                                       \
+    } else if (xoffset == 4) {                                                \
+      uint8_t tmp0[w * (h + padding)];                                        \
+      if (yoffset == 0) {                                                     \
+        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h);               \
+        return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse);   \
+      } else if (yoffset == 4) {                                              \
+        uint8_t tmp1[w * (h + padding)];                                      \
+        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));   \
+        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                       \
+        return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);   \
+      } else {                                                                \
+        uint8_t tmp1[w * (h + padding)];                                      \
+        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));   \
+        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);            \
+        return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);   \
+      }                                                                       \
+    } else {                                                                  \
+      uint8_t tmp0[w * (h + padding)];                                        \
+      if (yoffset == 0) {                                                     \
+        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset);    \
+        return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse);   \
+      } else if (yoffset == 4) {                                              \
+        uint8_t tmp1[w * h];                                                  \
+        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding),  \
+                                    xoffset);                                 \
+        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                       \
+        return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);   \
+      } else {                                                                \
+        uint8_t tmp1[w * h];                                                  \
+        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding),  \
+                                    xoffset);                                 \
+        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);            \
+        return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);   \
+      }                                                                       \
+    }                                                                         \
+  }
+
 // 4x<h> blocks are processed two rows at a time, so require an extra row of
 // padding.
 SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
@@ -128,16 +207,16 @@ SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
 SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
 SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
 
-SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
-SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
-SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
 
-SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
-SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
-SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
 
-SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
-SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
 
 // 4xM filter writes an extra row to fdata because it processes two rows at a
 // time.
author	Salome Thirot <salome.thirot@arm.com>	2023-01-20 10:35:34 +0000
committer	Salome Thirot <salome.thirot@arm.com>	2023-01-23 13:11:59 +0000
commit	ae5b60cb4730639fc7742df577600ce71ddb5936 (patch)
tree	7ee84c38b9f55440322effee7d1311c7b8f6cd85
parent	fcfb471ce2a413e760bdff805c5ae66778cb4169 (diff)
download	libvpx-ae5b60cb4730639fc7742df577600ce71ddb5936.tar.gz