author | Ilya Kurdyukov <jpegqs@gmail.com> | 2021-11-13 18:22:14 +0700
---|---|---
committer | Ilya Kurdyukov <jpegqs@gmail.com> | 2021-11-17 04:07:50 +0000
commit | 4a5a0a9a795782f33b8eb04461f9a5dfc9a146e1 (patch) |
tree | 8fb73137864fa9a90085edd8b39f68c3f5774096 |
parent | 2d2637547d7ee5f89cb1f6bfd1956b5ad8e29b77 (diff) |
download | libvpx-4a5a0a9a795782f33b8eb04461f9a5dfc9a146e1.tar.gz |
faster vp8_regular_quantize_b_sse4_1
Gives 10% faster VP8 encoding in simple tests.
This patch still needs testing on a wider range of datasets and
encoder settings to confirm that the speedup holds on most data.
Change-Id: If8e04819623e78fff126c413db66c964c0b4c11a
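
Before this patch (the left-hand side of the diff below), the sixteen coefficients were tested serially in zig-zag order by the unrolled SELECT_EOB macro. After the patch, all sixteen comparisons happen at once and the result is a bitmask that get_lsb() walks. Here is a hypothetical scalar model of the selection rule being vectorized — illustrative only, not libvpx code; vp8_eob_model is a made-up name, and x, y, zigzag, boost mirror x_minus_zbin*, y*, zig_zag_mask and b->zrun_zbin_boost in the diff:

```c
#include <stdint.h>

/* Returns the value the patch finally stores through *d->eob. */
static int vp8_eob_model(const int16_t x[16],      /* |coeff| - zbin, raster order */
                         const int16_t y[16],      /* quantized values, raster order */
                         const uint8_t zigzag[16], /* scan position -> raster index */
                         const int16_t boost[16])  /* zero-run zbin boost table */
{
  int i, eob = -1;
  for (i = 0; i < 16; ++i) {
    /* The boost index restarts after every kept coefficient, so it equals
     * the zero run since the last one kept: i - eob - 1. */
    if (x[zigzag[i]] >= boost[i - eob - 1] && y[zigzag[i]] != 0) eob = i;
  }
  return eob + 1;
}
```

The SIMD version computes the same answer in bulk: each pass of the for (;;) loop compares all sixteen positions against the current boost window with _mm_cmpgt_epi16(), packs the results into a 16-bit mask, and jumps to the lowest passing position with get_lsb(mask), so the loop iterates once per accepted coefficient instead of once per lane.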
-rw-r--r-- | vp8/encoder/x86/quantize_sse4.c | 95 |
-rw-r--r-- | vpx_ports/bitops.h | 23 |
2 files changed, 78 insertions(+), 40 deletions(-)
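
The subtlest part of the C change below is the alignr/blend/shuffle sequence that puts x_minus_zbin0/1 into zig-zag scan order without per-element table lookups (16-bit lane 7 of each half belongs to the other half, hence the two blends). A throwaway check along these lines — an assumption: this is standalone test code, not part of the tree; compile with -msse4.1 — confirms that the two shuffle constants reproduce VP8's zig-zag order:

```c
#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <smmintrin.h> /* SSE4.1 (pulls in SSSE3 for alignr/shuffle) */

int main(void) {
  /* VP8 zig-zag order, same values as the patch's zig_zag_mask table. */
  static const int16_t expect[16] = { 0, 1,  4,  8,  5, 2,  3,  6,
                                      9, 12, 13, 10, 7, 11, 14, 15 };
  int16_t in[16], out[16];
  int i;
  for (i = 0; i < 16; ++i) in[i] = (int16_t)i; /* raster-index markers */
  {
    const __m128i zig_zag_i16_0 =
        _mm_setr_epi8(0, 1, 2, 3, 8, 9, 14, 15, 10, 11, 4, 5, 6, 7, 12, 13);
    const __m128i zig_zag_i16_1 =
        _mm_setr_epi8(0, 1, 6, 7, 8, 9, 2, 3, 14, 15, 4, 5, 10, 11, 12, 13);
    __m128i x0 = _mm_loadu_si128((const __m128i *)in);
    __m128i x1 = _mm_loadu_si128((const __m128i *)(in + 8));
    /* Same steps as the patch: rotate the high half by one 16-bit lane,
     * then swap lane 7 between the halves, then shuffle each half. */
    __m128i t1 = _mm_alignr_epi8(x1, x1, 2);
    __m128i t0 = _mm_blend_epi16(x0, t1, 0x80);
    t1 = _mm_blend_epi16(t1, x0, 0x80);
    _mm_storeu_si128((__m128i *)out, _mm_shuffle_epi8(t0, zig_zag_i16_0));
    _mm_storeu_si128((__m128i *)(out + 8), _mm_shuffle_epi8(t1, zig_zag_i16_1));
  }
  assert(memcmp(out, expect, sizeof(out)) == 0);
  return 0;
}
```

With in[i] = i, the stored lanes are exactly the raster indices in scan order, so they must match the patch's zig_zag_mask table.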
diff --git a/vp8/encoder/x86/quantize_sse4.c b/vp8/encoder/x86/quantize_sse4.c
index 389c16705..6d03365fc 100644
--- a/vp8/encoder/x86/quantize_sse4.c
+++ b/vp8/encoder/x86/quantize_sse4.c
@@ -11,28 +11,14 @@
 #include <smmintrin.h> /* SSE4.1 */
 
 #include "./vp8_rtcd.h"
-#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */
 #include "vp8/encoder/block.h"
-
-#define SELECT_EOB(i, z, x, y, q)                         \
-  do {                                                    \
-    short boost = *zbin_boost_ptr;                        \
-    /* Technically _mm_extract_epi16() returns an int: */ \
-    /* https://bugs.llvm.org/show_bug.cgi?id=41657 */     \
-    short x_z = (short)_mm_extract_epi16(x, z);           \
-    short y_z = (short)_mm_extract_epi16(y, z);           \
-    int cmp = (x_z < boost) | (y_z == 0);                 \
-    zbin_boost_ptr++;                                     \
-    if (cmp) break;                                       \
-    q = _mm_insert_epi16(q, y_z, z);                      \
-    eob = i;                                              \
-    zbin_boost_ptr = b->zrun_zbin_boost;                  \
-  } while (0)
+#include "vpx_ports/bitops.h" /* get_lsb */
 
 void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
-  char eob = 0;
+  int eob = -1;
   short *zbin_boost_ptr = b->zrun_zbin_boost;
-
+  __m128i zbin_boost0 = _mm_load_si128((__m128i *)(zbin_boost_ptr));
+  __m128i zbin_boost1 = _mm_load_si128((__m128i *)(zbin_boost_ptr + 8));
   __m128i x0, x1, y0, y1, x_minus_zbin0, x_minus_zbin1, dqcoeff0, dqcoeff1;
   __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
   __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
@@ -47,8 +33,12 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
   __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));
   __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
   __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
-  __m128i qcoeff0 = _mm_setzero_si128();
-  __m128i qcoeff1 = _mm_setzero_si128();
+  __m128i qcoeff0, qcoeff1, t0, t1, x_shuf0, x_shuf1;
+  uint32_t mask, ymask;
+  DECLARE_ALIGNED(16, static const uint8_t,
+                  zig_zag_mask[16]) = { 0, 1,  4,  8,  5, 2,  3,  6,
+                                        9, 12, 13, 10, 7, 11, 14, 15 };
+  DECLARE_ALIGNED(16, uint16_t, qcoeff[16]) = { 0 };
 
   /* Duplicate to all lanes. */
   zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
@@ -88,23 +78,52 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
   y0 = _mm_sign_epi16(y0, z0);
   y1 = _mm_sign_epi16(y1, z1);
 
-  /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */
-  SELECT_EOB(1, 0, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(2, 1, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(3, 4, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(4, 0, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(5, 5, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(6, 2, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(7, 3, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(8, 6, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(9, 1, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(10, 4, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(11, 5, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(12, 2, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(13, 7, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(14, 3, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(15, 6, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(16, 7, x_minus_zbin1, y1, qcoeff1);
+  {
+    const __m128i zig_zag_i16_0 =
+        _mm_setr_epi8(0, 1, 2, 3, 8, 9, 14, 15, 10, 11, 4, 5, 6, 7, 12, 13);
+    const __m128i zig_zag_i16_1 =
+        _mm_setr_epi8(0, 1, 6, 7, 8, 9, 2, 3, 14, 15, 4, 5, 10, 11, 12, 13);
+
+    /* The first part of the zig zag needs a value
+     * from x_minus_zbin1 and vice versa. */
+    t1 = _mm_alignr_epi8(x_minus_zbin1, x_minus_zbin1, 2);
+    t0 = _mm_blend_epi16(x_minus_zbin0, t1, 0x80);
+    t1 = _mm_blend_epi16(t1, x_minus_zbin0, 0x80);
+    x_shuf0 = _mm_shuffle_epi8(t0, zig_zag_i16_0);
+    x_shuf1 = _mm_shuffle_epi8(t1, zig_zag_i16_1);
+  }
+
+  /* Check if y is nonzero and put it in zig zag order. */
+  t0 = _mm_packs_epi16(y0, y1);
+  t0 = _mm_cmpeq_epi8(t0, _mm_setzero_si128());
+  t0 = _mm_shuffle_epi8(t0, _mm_load_si128((const __m128i *)zig_zag_mask));
+  ymask = _mm_movemask_epi8(t0) ^ 0xffff;
+
+  for (;;) {
+    t0 = _mm_cmpgt_epi16(zbin_boost0, x_shuf0);
+    t1 = _mm_cmpgt_epi16(zbin_boost1, x_shuf1);
+    t0 = _mm_packs_epi16(t0, t1);
+    mask = _mm_movemask_epi8(t0);
+    mask = ~mask & ymask;
+    if (!mask) break;
+    /* |eob| will contain the index of the next found element where:
+     * boost[i - old_eob - 1] <= x[zigzag[i]] && y[zigzag[i]] != 0 */
+    eob = get_lsb(mask);
+    /* Need to clear the mask from processed elements so that
+     * they are no longer counted in the next iteration. */
+    ymask &= ~1U << eob;
+    /* It's safe to read ahead of this buffer if struct VP8_COMP has at
+     * least 32 bytes before the zrun_zbin_boost_* fields (it has 384).
+     * Any data read outside of the buffer is masked by the updated |ymask|. */
+    zbin_boost0 = _mm_loadu_si128((__m128i *)(zbin_boost_ptr - eob - 1));
+    zbin_boost1 = _mm_loadu_si128((__m128i *)(zbin_boost_ptr - eob + 7));
+    qcoeff[zig_zag_mask[eob]] = 0xffff;
+  }
+
+  qcoeff0 = _mm_load_si128((__m128i *)(qcoeff));
+  qcoeff1 = _mm_load_si128((__m128i *)(qcoeff + 8));
+  qcoeff0 = _mm_and_si128(qcoeff0, y0);
+  qcoeff1 = _mm_and_si128(qcoeff1, y1);
 
   _mm_store_si128((__m128i *)(d->qcoeff), qcoeff0);
   _mm_store_si128((__m128i *)(d->qcoeff + 8), qcoeff1);
@@ -115,5 +134,5 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
   _mm_store_si128((__m128i *)(d->dqcoeff), dqcoeff0);
   _mm_store_si128((__m128i *)(d->dqcoeff + 8), dqcoeff1);
 
-  *d->eob = eob;
+  *d->eob = eob + 1;
 }
diff --git a/vpx_ports/bitops.h b/vpx_ports/bitops.h
index 5b2f31cd1..1b5cdaa6d 100644
--- a/vpx_ports/bitops.h
+++ b/vpx_ports/bitops.h
@@ -26,20 +26,32 @@ extern "C" {
 #endif
 
-// These versions of get_msb() are only valid when n != 0 because all
-// of the optimized versions are undefined when n == 0:
+// These versions of get_lsb() and get_msb() are only valid when n != 0
+// because all of the optimized versions are undefined when n == 0:
 // https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
 
 // use GNU builtins where available.
 #if defined(__GNUC__) && \
     ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+static INLINE int get_lsb(unsigned int n) {
+  assert(n != 0);
+  return __builtin_ctz(n);
+}
+
 static INLINE int get_msb(unsigned int n) {
   assert(n != 0);
   return 31 ^ __builtin_clz(n);
 }
 #elif defined(USE_MSC_INTRINSICS)
+#pragma intrinsic(_BitScanForward)
 #pragma intrinsic(_BitScanReverse)
 
+static INLINE int get_lsb(unsigned int n) {
+  unsigned long first_set_bit;  // NOLINT(runtime/int)
+  _BitScanForward(&first_set_bit, n);
+  return first_set_bit;
+}
+
 static INLINE int get_msb(unsigned int n) {
   unsigned long first_set_bit;
   assert(n != 0);
@@ -48,6 +60,13 @@ static INLINE int get_msb(unsigned int n) {
 }
 #undef USE_MSC_INTRINSICS
 #else
+static INLINE int get_lsb(unsigned int n) {
+  int i;
+  assert(n != 0);
+  for (i = 0; i < 32 && !(n & 1); ++i) n >>= 1;
+  return i;
+}
+
 // Returns (int)floor(log2(n)). n must be > 0.
 static INLINE int get_msb(unsigned int n) {
   int log = 0;
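
On the bitops.h side, get_lsb() now has three implementations: __builtin_ctz() for GCC/Clang, _BitScanForward() for MSVC, and a portable shift loop. A minimal sketch of an equivalence check, assuming a GCC/Clang host (get_lsb_portable is a local copy of the patch's fallback, not a libvpx symbol):

```c
#include <assert.h>

/* Portable fallback, copied from the patch. */
static int get_lsb_portable(unsigned int n) {
  int i;
  for (i = 0; i < 32 && !(n & 1); ++i) n >>= 1;
  return i;
}

int main(void) {
  unsigned int n;
  /* Every single-bit value agrees with the builtin... */
  for (n = 1; n != 0; n <<= 1) {
    assert(get_lsb_portable(n) == __builtin_ctz(n));
  }
  /* ...and for multi-bit values the lowest set bit wins (0x90 -> bit 4). */
  assert(get_lsb_portable(0x90u) == 4 && __builtin_ctz(0x90u) == 4);
  return 0;
}
```

Both __builtin_ctz() here and the 31 ^ __builtin_clz(n) in the existing get_msb() compile to single bit-scan instructions on x86, which is what keeps the mask-driven loop in quantize_sse4.c cheap.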