1 files changed, 39 insertions, 12 deletions
diff --git a/vpx_dsp/x86/quantize_sse2.h b/vpx_dsp/x86/quantize_sse2.h
index 27bfb4e41..82c755a0c 100644
--- a/vpx_dsp/x86/quantize_sse2.h
+++ b/vpx_dsp/x86/quantize_sse2.h
@@ -15,26 +15,53 @@
 
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_block.h"
 
-static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin,
-                                 const int16_t *round_ptr, __m128i *round,
-                                 const int16_t *quant_ptr, __m128i *quant,
+static INLINE void load_b_values(const struct macroblock_plane *const mb_plane,
+                                 __m128i *zbin, __m128i *round, __m128i *quant,
                                  const int16_t *dequant_ptr, __m128i *dequant,
-                                 const int16_t *shift_ptr, __m128i *shift) {
-  *zbin = _mm_load_si128((const __m128i *)zbin_ptr);
-  *round = _mm_load_si128((const __m128i *)round_ptr);
-  *quant = _mm_load_si128((const __m128i *)quant_ptr);
+                                 __m128i *shift) {
+  *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin);
+  *round = _mm_load_si128((const __m128i *)mb_plane->round);
+  *quant = _mm_load_si128((const __m128i *)mb_plane->quant);
   *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1));
   *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
-  *shift = _mm_load_si128((const __m128i *)shift_ptr);
+  *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift);
 }
 
-static INLINE void load_fp_values(const int16_t *round_ptr, __m128i *round,
-                                  const int16_t *quant_ptr, __m128i *quant,
+static INLINE void load_b_values32x32(
+    const struct macroblock_plane *const mb_plane, __m128i *zbin,
+    __m128i *round, __m128i *quant, const int16_t *dequant_ptr,
+    __m128i *dequant, __m128i *shift) {
+  const __m128i one = _mm_set1_epi16(1);
+  // The 32x32 variant halves zbin and round.
+  *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin);
+  // Shift with rounding.
+  *zbin = _mm_add_epi16(*zbin, one);
+  *zbin = _mm_srli_epi16(*zbin, 1);
+  // SSE2 has no packed "greater *or equal*" integer comparison. Subtract 1
+  // from zbin so that a strict "greater" comparison is equivalent.
+  *zbin = _mm_sub_epi16(*zbin, one);
+
+  *round = _mm_load_si128((const __m128i *)mb_plane->round);
+  *round = _mm_add_epi16(*round, one);
+  *round = _mm_srli_epi16(*round, 1);
+
+  *quant = _mm_load_si128((const __m128i *)mb_plane->quant);
+  *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+  *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift);
+  // I suspect this is not technically OK: quant_shift can be up to 1 << 16, so
+  // shifting it left again would overflow a 16-bit lane. The test is not
+  // comprehensive enough to catch that, and "it's been that way forever".
+  *shift = _mm_slli_epi16(*shift, 1);
+}
+
+static INLINE void load_fp_values(const struct macroblock_plane *mb_plane,
+                                  __m128i *round, __m128i *quant,
                                   const int16_t *dequant_ptr,
                                   __m128i *dequant) {
-  *round = _mm_load_si128((const __m128i *)round_ptr);
-  *quant = _mm_load_si128((const __m128i *)quant_ptr);
+  *round = _mm_load_si128((const __m128i *)mb_plane->round_fp);
+  *quant = _mm_load_si128((const __m128i *)mb_plane->quant_fp);
   *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
 }