1 file changed, 35 insertions, 40 deletions
diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c
index c2b55fcba..96d061436 100644
--- a/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -11,11 +11,13 @@
 #include <arm_neon.h>
 #include <assert.h>
 #include <math.h>
+#include <stdint.h>
 
 #include "./vpx_config.h"
 #include "vpx_mem/vpx_mem.h"
 
 #include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_scan.h"
 #include "vp9/common/vp9_seg_common.h"
 
 #include "vp9/encoder/vp9_encoder.h"
@@ -50,7 +52,7 @@ static VPX_FORCE_INLINE int16x8_t get_max_lane_eob(const int16_t *iscan_ptr,
 }
 
 static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
-#ifdef __aarch64__
+#if VPX_ARCH_AARCH64
   return (uint16_t)vmaxvq_s16(v_eobmax);
 #else
   const int16x4_t v_eobmax_3210 =
@@ -65,23 +67,21 @@ static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
       vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
 
   return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
-#endif  // __aarch64__
+#endif  // VPX_ARCH_AARCH64
 }
 
-static VPX_FORCE_INLINE void load_fp_values(const int16_t *round_ptr,
-                                            const int16_t *quant_ptr,
-                                            const int16_t *dequant_ptr,
-                                            int16x8_t *round, int16x8_t *quant,
-                                            int16x8_t *dequant) {
-  *round = vld1q_s16(round_ptr);
-  *quant = vld1q_s16(quant_ptr);
+static VPX_FORCE_INLINE void load_fp_values(
+    const struct macroblock_plane *mb_plane, const int16_t *dequant_ptr,
+    int16x8_t *round, int16x8_t *quant, int16x8_t *dequant) {
+  *round = vld1q_s16(mb_plane->round_fp);
+  *quant = vld1q_s16(mb_plane->quant_fp);
   *dequant = vld1q_s16(dequant_ptr);
 }
 
 static VPX_FORCE_INLINE void update_fp_values(int16x8_t *v_round,
                                               int16x8_t *v_quant,
                                               int16x8_t *v_dequant) {
-#ifdef __aarch64__
+#if VPX_ARCH_AARCH64
   *v_round = vdupq_laneq_s16(*v_round, 1);
   *v_quant = vdupq_laneq_s16(*v_quant, 1);
   *v_dequant = vdupq_laneq_s16(*v_dequant, 1);
@@ -117,27 +117,26 @@ static VPX_FORCE_INLINE void quantize_fp_8(
   *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask);
 }
 
-void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
-                          const int16_t *round_ptr, const int16_t *quant_ptr,
+void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                          const struct macroblock_plane *mb_plane,
                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                           const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                          const int16_t *scan, const int16_t *iscan) {
+                          const struct ScanOrder *const scan_order) {
   // Quantization pass: All coefficients with index >= zero_flag are
   // skippable. Note: zero_flag can be zero.
   int i;
   int16x8_t v_eobmax = vdupq_n_s16(-1);
   int16x8_t v_round, v_quant, v_dequant;
-  (void)scan;
+  const int16_t *iscan = scan_order->iscan;
 
-  load_fp_values(round_ptr, quant_ptr, dequant_ptr, &v_round, &v_quant,
-                 &v_dequant);
+  load_fp_values(mb_plane, dequant_ptr, &v_round, &v_quant, &v_dequant);
   // process dc and the first seven ac coeffs
   quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr, iscan, qcoeff_ptr,
                 dqcoeff_ptr, &v_eobmax);
 
   // now process the rest of the ac coeffs
   update_fp_values(&v_round, &v_quant, &v_dequant);
-  for (i = 8; i < count; i += 8) {
+  for (i = 8; i < n_coeffs; i += 8) {
     quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr + i, iscan + i,
                   qcoeff_ptr + i, dqcoeff_ptr + i, &v_eobmax);
   }
@@ -186,23 +185,22 @@ static VPX_FORCE_INLINE void quantize_fp_32x32_8(
   *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask);
 }
 
-void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
-                                const int16_t *round_ptr,
-                                const int16_t *quant_ptr,
+void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                const struct macroblock_plane *mb_plane,
                                 tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                 const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                                const int16_t *scan, const int16_t *iscan) {
+                                const struct ScanOrder *const scan_order) {
   int16x8_t eob_max = vdupq_n_s16(-1);
   // ROUND_POWER_OF_TWO(round_ptr[], 1)
-  int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1);
-  int16x8_t quant = vld1q_s16(quant_ptr);
+  int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round_fp), 1);
+  int16x8_t quant = vld1q_s16(mb_plane->quant_fp);
   int16x8_t dequant = vld1q_s16(dequant_ptr);
   // dequant >> 2 is used similar to zbin as a threshold.
   int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2);
   int i;
+  const int16_t *iscan = scan_order->iscan;
 
-  (void)scan;
-  (void)count;
+  (void)n_coeffs;
 
   // Process dc and the first seven ac coeffs.
   quantize_fp_32x32_8(&round, &quant, &dequant, &dequant_thresh, coeff_ptr,
@@ -258,23 +256,21 @@ highbd_quantize_fp_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
 }
 
 void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                                 const int16_t *round_ptr,
-                                 const int16_t *quant_ptr,
+                                 const struct macroblock_plane *mb_plane,
                                  tran_low_t *qcoeff_ptr,
                                  tran_low_t *dqcoeff_ptr,
                                  const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                                 const int16_t *scan, const int16_t *iscan) {
+                                 const struct ScanOrder *const scan_order) {
   const int16x4_t v_zero = vdup_n_s16(0);
-  const int16x4_t v_quant = vld1_s16(quant_ptr);
+  const int16x4_t v_quant = vld1_s16(mb_plane->quant_fp);
   const int16x4_t v_dequant = vld1_s16(dequant_ptr);
-  const int16x4_t v_round = vld1_s16(round_ptr);
+  const int16x4_t v_round = vld1_s16(mb_plane->round_fp);
   int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero);
   int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15);
   int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero);
   uint16x4_t v_mask_lo, v_mask_hi;
   int16x8_t v_eobmax = vdupq_n_s16(-1);
-
-  (void)scan;
+  const int16_t *iscan = scan_order->iscan;
 
   // DC and first 3 AC
   v_mask_lo = highbd_quantize_fp_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
@@ -349,22 +345,21 @@ highbd_quantize_fp_32x32_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
 }
 
 void vp9_highbd_quantize_fp_32x32_neon(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr,
-    const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-    const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan,
-    const int16_t *iscan) {
-  const int16x4_t v_quant = vld1_s16(quant_ptr);
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+    const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const struct ScanOrder *const scan_order) {
+  const int16x4_t v_quant = vld1_s16(mb_plane->quant_fp);
   const int16x4_t v_dequant = vld1_s16(dequant_ptr);
   const int16x4_t v_zero = vdup_n_s16(0);
   const int16x4_t v_round =
-      vqrdmulh_n_s16(vld1_s16(round_ptr), (int16_t)(1 << 14));
+      vqrdmulh_n_s16(vld1_s16(mb_plane->round_fp), (int16_t)(1 << 14));
   int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero);
   int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15);
   int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero);
   uint16x4_t v_mask_lo, v_mask_hi;
   int16x8_t v_eobmax = vdupq_n_s16(-1);
-
-  (void)scan;
+  const int16_t *iscan = scan_order->iscan;
 
   // DC and first 3 AC
   v_mask_lo =