diff options
Diffstat (limited to 'vp9/encoder/arm/neon/vp9_quantize_neon.c')
-rw-r--r-- | vp9/encoder/arm/neon/vp9_quantize_neon.c | 75 |
1 file changed, 35 insertions, 40 deletions
diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c index c2b55fcba..96d061436 100644 --- a/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -11,11 +11,13 @@ #include <arm_neon.h> #include <assert.h> #include <math.h> +#include <stdint.h> #include "./vpx_config.h" #include "vpx_mem/vpx_mem.h" #include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_scan.h" #include "vp9/common/vp9_seg_common.h" #include "vp9/encoder/vp9_encoder.h" @@ -50,7 +52,7 @@ static VPX_FORCE_INLINE int16x8_t get_max_lane_eob(const int16_t *iscan_ptr, } static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) { -#ifdef __aarch64__ +#if VPX_ARCH_AARCH64 return (uint16_t)vmaxvq_s16(v_eobmax); #else const int16x4_t v_eobmax_3210 = @@ -65,23 +67,21 @@ static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) { vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); return (uint16_t)vget_lane_s16(v_eobmax_final, 0); -#endif // __aarch64__ +#endif // VPX_ARCH_AARCH64 } -static VPX_FORCE_INLINE void load_fp_values(const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *dequant_ptr, - int16x8_t *round, int16x8_t *quant, - int16x8_t *dequant) { - *round = vld1q_s16(round_ptr); - *quant = vld1q_s16(quant_ptr); +static VPX_FORCE_INLINE void load_fp_values( + const struct macroblock_plane *mb_plane, const int16_t *dequant_ptr, + int16x8_t *round, int16x8_t *quant, int16x8_t *dequant) { + *round = vld1q_s16(mb_plane->round_fp); + *quant = vld1q_s16(mb_plane->quant_fp); *dequant = vld1q_s16(dequant_ptr); } static VPX_FORCE_INLINE void update_fp_values(int16x8_t *v_round, int16x8_t *v_quant, int16x8_t *v_dequant) { -#ifdef __aarch64__ +#if VPX_ARCH_AARCH64 *v_round = vdupq_laneq_s16(*v_round, 1); *v_quant = vdupq_laneq_s16(*v_quant, 1); *v_dequant = vdupq_laneq_s16(*v_dequant, 1); @@ -117,27 +117,26 @@ static VPX_FORCE_INLINE void quantize_fp_8( *v_eobmax = 
get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask); } -void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, - const int16_t *round_ptr, const int16_t *quant_ptr, +void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. int i; int16x8_t v_eobmax = vdupq_n_s16(-1); int16x8_t v_round, v_quant, v_dequant; - (void)scan; + const int16_t *iscan = scan_order->iscan; - load_fp_values(round_ptr, quant_ptr, dequant_ptr, &v_round, &v_quant, - &v_dequant); + load_fp_values(mb_plane, dequant_ptr, &v_round, &v_quant, &v_dequant); // process dc and the first seven ac coeffs quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &v_eobmax); // now process the rest of the ac coeffs update_fp_values(&v_round, &v_quant, &v_dequant); - for (i = 8; i < count; i += 8) { + for (i = 8; i < n_coeffs; i += 8) { quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr + i, iscan + i, qcoeff_ptr + i, dqcoeff_ptr + i, &v_eobmax); } @@ -186,23 +185,22 @@ static VPX_FORCE_INLINE void quantize_fp_32x32_8( *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask); } -void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, - const int16_t *round_ptr, - const int16_t *quant_ptr, +void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int16x8_t eob_max = vdupq_n_s16(-1); // ROUND_POWER_OF_TWO(round_ptr[], 1) - int16x8_t 
round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); - int16x8_t quant = vld1q_s16(quant_ptr); + int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round_fp), 1); + int16x8_t quant = vld1q_s16(mb_plane->quant_fp); int16x8_t dequant = vld1q_s16(dequant_ptr); // dequant >> 2 is used similar to zbin as a threshold. int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2); int i; + const int16_t *iscan = scan_order->iscan; - (void)scan; - (void)count; + (void)n_coeffs; // Process dc and the first seven ac coeffs. quantize_fp_32x32_8(&round, &quant, &dequant, &dequant_thresh, coeff_ptr, @@ -258,23 +256,21 @@ highbd_quantize_fp_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, } void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, - const int16_t *quant_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const int16x4_t v_zero = vdup_n_s16(0); - const int16x4_t v_quant = vld1_s16(quant_ptr); + const int16x4_t v_quant = vld1_s16(mb_plane->quant_fp); const int16x4_t v_dequant = vld1_s16(dequant_ptr); - const int16x4_t v_round = vld1_s16(round_ptr); + const int16x4_t v_round = vld1_s16(mb_plane->round_fp); int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero); int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15); int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero); uint16x4_t v_mask_lo, v_mask_hi; int16x8_t v_eobmax = vdupq_n_s16(-1); - - (void)scan; + const int16_t *iscan = scan_order->iscan; // DC and first 3 AC v_mask_lo = highbd_quantize_fp_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, @@ -349,22 +345,21 @@ highbd_quantize_fp_32x32_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, } void vp9_highbd_quantize_fp_32x32_neon( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, - const int16_t 
*quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { - const int16x4_t v_quant = vld1_s16(quant_ptr); + const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const int16x4_t v_quant = vld1_s16(mb_plane->quant_fp); const int16x4_t v_dequant = vld1_s16(dequant_ptr); const int16x4_t v_zero = vdup_n_s16(0); const int16x4_t v_round = - vqrdmulh_n_s16(vld1_s16(round_ptr), (int16_t)(1 << 14)); + vqrdmulh_n_s16(vld1_s16(mb_plane->round_fp), (int16_t)(1 << 14)); int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero); int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15); int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero); uint16x4_t v_mask_lo, v_mask_hi; int16x8_t v_eobmax = vdupq_n_s16(-1); - - (void)scan; + const int16_t *iscan = scan_order->iscan; // DC and first 3 AC v_mask_lo = |