author | Ilya Kurdyukov <jpegqs@gmail.com> | 2021-11-13 18:22:14 +0700
---|---|---
committer | Ilya Kurdyukov <jpegqs@gmail.com> | 2021-11-17 04:07:50 +0000
commit | 4a5a0a9a795782f33b8eb04461f9a5dfc9a146e1 (patch) |
tree | 8fb73137864fa9a90085edd8b39f68c3f5774096 |
parent | 2d2637547d7ee5f89cb1f6bfd1956b5ad8e29b77 (diff) |
download | libvpx-4a5a0a9a795782f33b8eb04461f9a5dfc9a146e1.tar.gz |
faster vp8_regular_quantize_b_sse4_1
Gives 10% faster VP8 encoding in simple tests.
This patch still needs testing on a wider range of datasets and
encoder settings to confirm that the speedup holds on most data.
Change-Id: If8e04819623e78fff126c413db66c964c0b4c11a
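
Before this patch (the left-hand side of the diff below), the sixteen coefficients were tested serially in zig-zag order by the unrolled SELECT_EOB macro. After the patch, all sixteen comparisons happen at once and the result is a bitmask that get_lsb() walks. Here is a hypothetical scalar model of the selection rule being vectorized — illustrative only, not libvpx code; vp8_eob_model is a made-up name, and x, y, zigzag, boost mirror x_minus_zbin*, y*, zig_zag_mask and b->zrun_zbin_boost in the diff:

```c
#include <stdint.h>

/* Returns the value the patch finally stores through *d->eob. */
static int vp8_eob_model(const int16_t x[16],      /* |coeff| - zbin, raster order */
                         const int16_t y[16],      /* quantized values, raster order */
                         const uint8_t zigzag[16], /* scan position -> raster index */
                         const int16_t boost[16])  /* zero-run zbin boost table */
{
  int i, eob = -1;
  for (i = 0; i < 16; ++i) {
    /* The boost index restarts after every kept coefficient, so it equals
     * the zero run since the last one kept: i - eob - 1. */
    if (x[zigzag[i]] >= boost[i - eob - 1] && y[zigzag[i]] != 0) eob = i;
  }
  return eob + 1;
}
```

The SIMD version computes the same answer in bulk: each pass of the for (;;) loop compares all sixteen positions against the current boost window with _mm_cmpgt_epi16(), packs the results into a 16-bit mask, and jumps to the lowest passing position with get_lsb(mask), so the loop iterates once per accepted coefficient instead of once per lane.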
-rw-r--r-- | vp8/encoder/x86/quantize_sse4.c | 95 |
-rw-r--r-- | vpx_ports/bitops.h | 23 |
2 files changed, 78 insertions(+), 40 deletions(-)
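
The subtlest part of the C change below is the alignr/blend/shuffle sequence that puts x_minus_zbin0/1 into zig-zag scan order without per-element table lookups (16-bit lane 7 of each half belongs to the other half, hence the two blends). A throwaway check along these lines — an assumption: this is standalone test code, not part of the tree; compile with -msse4.1 — confirms that the two shuffle constants reproduce VP8's zig-zag order:

```c
#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <smmintrin.h> /* SSE4.1 (pulls in SSSE3 for alignr/shuffle) */

int main(void) {
  /* VP8 zig-zag order, same values as the patch's zig_zag_mask table. */
  static const int16_t expect[16] = { 0, 1,  4,  8,  5, 2,  3,  6,
                                      9, 12, 13, 10, 7, 11, 14, 15 };
  int16_t in[16], out[16];
  int i;
  for (i = 0; i < 16; ++i) in[i] = (int16_t)i; /* raster-index markers */
  {
    const __m128i zig_zag_i16_0 =
        _mm_setr_epi8(0, 1, 2, 3, 8, 9, 14, 15, 10, 11, 4, 5, 6, 7, 12, 13);
    const __m128i zig_zag_i16_1 =
        _mm_setr_epi8(0, 1, 6, 7, 8, 9, 2, 3, 14, 15, 4, 5, 10, 11, 12, 13);
    __m128i x0 = _mm_loadu_si128((const __m128i *)in);
    __m128i x1 = _mm_loadu_si128((const __m128i *)(in + 8));
    /* Same steps as the patch: rotate the high half by one 16-bit lane,
     * then swap lane 7 between the halves, then shuffle each half. */
    __m128i t1 = _mm_alignr_epi8(x1, x1, 2);
    __m128i t0 = _mm_blend_epi16(x0, t1, 0x80);
    t1 = _mm_blend_epi16(t1, x0, 0x80);
    _mm_storeu_si128((__m128i *)out, _mm_shuffle_epi8(t0, zig_zag_i16_0));
    _mm_storeu_si128((__m128i *)(out + 8), _mm_shuffle_epi8(t1, zig_zag_i16_1));
  }
  assert(memcmp(out, expect, sizeof(out)) == 0);
  return 0;
}
```

With in[i] = i, the stored lanes are exactly the raster indices in scan order, so they must match the patch's zig_zag_mask table.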
diff --git a/vp8/encoder/x86/quantize_sse4.c b/vp8/encoder/x86/quantize_sse4.c
index 389c16705..6d03365fc 100644
--- a/vp8/encoder/x86/quantize_sse4.c
+++ b/vp8/encoder/x86/quantize_sse4.c
@@ -11,28 +11,14 @@
 #include <smmintrin.h> /* SSE4.1 */
 
 #include "./vp8_rtcd.h"
-#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */
 #include "vp8/encoder/block.h"
-
-#define SELECT_EOB(i, z, x, y, q)                         \
-  do {                                                    \
-    short boost = *zbin_boost_ptr;                        \
-    /* Technically _mm_extract_epi16() returns an int: */ \
-    /* https://bugs.llvm.org/show_bug.cgi?id=41657 */     \
-    short x_z = (short)_mm_extract_epi16(x, z);           \
-    short y_z = (short)_mm_extract_epi16(y, z);           \
-    int cmp = (x_z < boost) | (y_z == 0);                 \
-    zbin_boost_ptr++;                                     \
-    if (cmp) break;                                       \
-    q = _mm_insert_epi16(q, y_z, z);                      \
-    eob = i;                                              \
-    zbin_boost_ptr = b->zrun_zbin_boost;                  \
-  } while (0)
+#include "vpx_ports/bitops.h" /* get_lsb */
 
 void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
-  char eob = 0;
+  int eob = -1;
   short *zbin_boost_ptr = b->zrun_zbin_boost;
-
+  __m128i zbin_boost0 = _mm_load_si128((__m128i *)(zbin_boost_ptr));
+  __m128i zbin_boost1 = _mm_load_si128((__m128i *)(zbin_boost_ptr + 8));
   __m128i x0, x1, y0, y1, x_minus_zbin0, x_minus_zbin1, dqcoeff0, dqcoeff1;
   __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
   __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
@@ -47,8 +33,12 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
   __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));
   __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
   __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
-  __m128i qcoeff0 = _mm_setzero_si128();
-  __m128i qcoeff1 = _mm_setzero_si128();
+  __m128i qcoeff0, qcoeff1, t0, t1, x_shuf0, x_shuf1;
+  uint32_t mask, ymask;
+  DECLARE_ALIGNED(16, static const uint8_t,
+                  zig_zag_mask[16]) = { 0, 1,  4,  8,  5, 2,  3,  6,
+                                        9, 12, 13, 10, 7, 11, 14, 15 };
+  DECLARE_ALIGNED(16, uint16_t, qcoeff[16]) = { 0 };
 
   /* Duplicate to all lanes. */
   zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
@@ -88,23 +78,52 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
   y0 = _mm_sign_epi16(y0, z0);
   y1 = _mm_sign_epi16(y1, z1);
 
-  /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */
-  SELECT_EOB(1, 0, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(2, 1, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(3, 4, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(4, 0, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(5, 5, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(6, 2, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(7, 3, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(8, 6, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(9, 1, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(10, 4, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(11, 5, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(12, 2, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(13, 7, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(14, 3, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(15, 6, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(16, 7, x_minus_zbin1, y1, qcoeff1);
+  {
+    const __m128i zig_zag_i16_0 =
+        _mm_setr_epi8(0, 1, 2, 3, 8, 9, 14, 15, 10, 11, 4, 5, 6, 7, 12, 13);
+    const __m128i zig_zag_i16_1 =
+        _mm_setr_epi8(0, 1, 6, 7, 8, 9, 2, 3, 14, 15, 4, 5, 10, 11, 12, 13);
+
+    /* The first part of the zig zag needs a value
+     * from x_minus_zbin1 and vice versa. */
+    t1 = _mm_alignr_epi8(x_minus_zbin1, x_minus_zbin1, 2);
+    t0 = _mm_blend_epi16(x_minus_zbin0, t1, 0x80);
+    t1 = _mm_blend_epi16(t1, x_minus_zbin0, 0x80);
+    x_shuf0 = _mm_shuffle_epi8(t0, zig_zag_i16_0);
+    x_shuf1 = _mm_shuffle_epi8(t1, zig_zag_i16_1);
+  }
+
+  /* Check if y is nonzero and put it in zig zag order. */
+  t0 = _mm_packs_epi16(y0, y1);
+  t0 = _mm_cmpeq_epi8(t0, _mm_setzero_si128());
+  t0 = _mm_shuffle_epi8(t0, _mm_load_si128((const __m128i *)zig_zag_mask));
+  ymask = _mm_movemask_epi8(t0) ^ 0xffff;
+
+  for (;;) {
+    t0 = _mm_cmpgt_epi16(zbin_boost0, x_shuf0);
+    t1 = _mm_cmpgt_epi16(zbin_boost1, x_shuf1);
+    t0 = _mm_packs_epi16(t0, t1);
+    mask = _mm_movemask_epi8(t0);
+    mask = ~mask & ymask;
+    if (!mask) break;
+    /* |eob| will contain the index of the next found element where:
+     * boost[i - old_eob - 1] <= x[zigzag[i]] && y[zigzag[i]] != 0 */
+    eob = get_lsb(mask);
+    /* Need to clear the mask from processed elements so that
+     * they are no longer counted in the next iteration. */
+    ymask &= ~1U << eob;
+    /* It's safe to read ahead of this buffer if struct VP8_COMP has at
+     * least 32 bytes before the zrun_zbin_boost_* fields (it has 384).
+     * Any data read outside of the buffer is masked by the updated |ymask|. */
+    zbin_boost0 = _mm_loadu_si128((__m128i *)(zbin_boost_ptr - eob - 1));
+    zbin_boost1 = _mm_loadu_si128((__m128i *)(zbin_boost_ptr - eob + 7));
+    qcoeff[zig_zag_mask[eob]] = 0xffff;
+  }
+
+  qcoeff0 = _mm_load_si128((__m128i *)(qcoeff));
+  qcoeff1 = _mm_load_si128((__m128i *)(qcoeff + 8));
+  qcoeff0 = _mm_and_si128(qcoeff0, y0);
+  qcoeff1 = _mm_and_si128(qcoeff1, y1);
 
   _mm_store_si128((__m128i *)(d->qcoeff), qcoeff0);
   _mm_store_si128((__m128i *)(d->qcoeff + 8), qcoeff1);
@@ -115,5 +134,5 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
   _mm_store_si128((__m128i *)(d->dqcoeff), dqcoeff0);
   _mm_store_si128((__m128i *)(d->dqcoeff + 8), dqcoeff1);
 
-  *d->eob = eob;
+  *d->eob = eob + 1;
 }
diff --git a/vpx_ports/bitops.h b/vpx_ports/bitops.h
index 5b2f31cd1..1b5cdaa6d 100644
--- a/vpx_ports/bitops.h
+++ b/vpx_ports/bitops.h
@@ -26,20 +26,32 @@ extern "C" {
 #endif
 
-// These versions of get_msb() are only valid when n != 0 because all
-// of the optimized versions are undefined when n == 0:
+// These versions of get_lsb() and get_msb() are only valid when n != 0
+// because all of the optimized versions are undefined when n == 0:
 // https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
 
 // use GNU builtins where available.
 #if defined(__GNUC__) && \
     ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+static INLINE int get_lsb(unsigned int n) {
+  assert(n != 0);
+  return __builtin_ctz(n);
+}
+
 static INLINE int get_msb(unsigned int n) {
   assert(n != 0);
   return 31 ^ __builtin_clz(n);
 }
 #elif defined(USE_MSC_INTRINSICS)
+#pragma intrinsic(_BitScanForward)
 #pragma intrinsic(_BitScanReverse)
 
+static INLINE int get_lsb(unsigned int n) {
+  unsigned long first_set_bit;  // NOLINT(runtime/int)
+  _BitScanForward(&first_set_bit, n);
+  return first_set_bit;
+}
+
 static INLINE int get_msb(unsigned int n) {
   unsigned long first_set_bit;
   assert(n != 0);
@@ -48,6 +60,13 @@ static INLINE int get_msb(unsigned int n) {
 }
 #undef USE_MSC_INTRINSICS
 #else
+static INLINE int get_lsb(unsigned int n) {
+  int i;
+  assert(n != 0);
+  for (i = 0; i < 32 && !(n & 1); ++i) n >>= 1;
+  return i;
+}
+
 // Returns (int)floor(log2(n)). n must be > 0.
 static INLINE int get_msb(unsigned int n) {
   int log = 0;
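
On the bitops.h side, get_lsb() now has three implementations: __builtin_ctz() for GCC/Clang, _BitScanForward() for MSVC, and a portable shift loop. A minimal sketch of an equivalence check, assuming a GCC/Clang host (get_lsb_portable is a local copy of the patch's fallback, not a libvpx symbol):

```c
#include <assert.h>

/* Portable fallback, copied from the patch. */
static int get_lsb_portable(unsigned int n) {
  int i;
  for (i = 0; i < 32 && !(n & 1); ++i) n >>= 1;
  return i;
}

int main(void) {
  unsigned int n;
  /* Every single-bit value agrees with the builtin... */
  for (n = 1; n != 0; n <<= 1) {
    assert(get_lsb_portable(n) == __builtin_ctz(n));
  }
  /* ...and for multi-bit values the lowest set bit wins (0x90 -> bit 4). */
  assert(get_lsb_portable(0x90u) == 4 && __builtin_ctz(0x90u) == 4);
  return 0;
}
```

Both __builtin_ctz() here and the 31 ^ __builtin_clz(n) in the existing get_msb() compile to single bit-scan instructions on x86, which is what keeps the mask-driven loop in quantize_sse4.c cheap.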