about | summary | refs | log | tree | commit | diff
path: root/vp9/encoder/arm/neon/vp9_quantize_neon.c
diff options
context:
space:
mode:
Diffstat (limited to 'vp9/encoder/arm/neon/vp9_quantize_neon.c')
-rw-r--r-- vp9/encoder/arm/neon/vp9_quantize_neon.c 75
1 files changed, 35 insertions, 40 deletions
diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c
index c2b55fcba..96d061436 100644
--- a/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -11,11 +11,13 @@
#include <arm_neon.h>
#include <assert.h>
#include <math.h>
+#include <stdint.h>
#include "./vpx_config.h"
#include "vpx_mem/vpx_mem.h"
#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_scan.h"
#include "vp9/common/vp9_seg_common.h"
#include "vp9/encoder/vp9_encoder.h"
@@ -50,7 +52,7 @@ static VPX_FORCE_INLINE int16x8_t get_max_lane_eob(const int16_t *iscan_ptr,
}
static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
-#ifdef __aarch64__
+#if VPX_ARCH_AARCH64
return (uint16_t)vmaxvq_s16(v_eobmax);
#else
const int16x4_t v_eobmax_3210 =
@@ -65,23 +67,21 @@ static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
-#endif // __aarch64__
+#endif // VPX_ARCH_AARCH64
}
-static VPX_FORCE_INLINE void load_fp_values(const int16_t *round_ptr,
- const int16_t *quant_ptr,
- const int16_t *dequant_ptr,
- int16x8_t *round, int16x8_t *quant,
- int16x8_t *dequant) {
- *round = vld1q_s16(round_ptr);
- *quant = vld1q_s16(quant_ptr);
+static VPX_FORCE_INLINE void load_fp_values(
+ const struct macroblock_plane *mb_plane, const int16_t *dequant_ptr,
+ int16x8_t *round, int16x8_t *quant, int16x8_t *dequant) {
+ *round = vld1q_s16(mb_plane->round_fp);
+ *quant = vld1q_s16(mb_plane->quant_fp);
*dequant = vld1q_s16(dequant_ptr);
}
static VPX_FORCE_INLINE void update_fp_values(int16x8_t *v_round,
int16x8_t *v_quant,
int16x8_t *v_dequant) {
-#ifdef __aarch64__
+#if VPX_ARCH_AARCH64
*v_round = vdupq_laneq_s16(*v_round, 1);
*v_quant = vdupq_laneq_s16(*v_quant, 1);
*v_dequant = vdupq_laneq_s16(*v_dequant, 1);
@@ -117,27 +117,26 @@ static VPX_FORCE_INLINE void quantize_fp_8(
*v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask);
}
-void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
- const int16_t *round_ptr, const int16_t *quant_ptr,
+void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const struct macroblock_plane *mb_plane,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
+ const struct ScanOrder *const scan_order) {
// Quantization pass: All coefficients with index >= zero_flag are
// skippable. Note: zero_flag can be zero.
int i;
int16x8_t v_eobmax = vdupq_n_s16(-1);
int16x8_t v_round, v_quant, v_dequant;
- (void)scan;
+ const int16_t *iscan = scan_order->iscan;
- load_fp_values(round_ptr, quant_ptr, dequant_ptr, &v_round, &v_quant,
- &v_dequant);
+ load_fp_values(mb_plane, dequant_ptr, &v_round, &v_quant, &v_dequant);
// process dc and the first seven ac coeffs
quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr, iscan, qcoeff_ptr,
dqcoeff_ptr, &v_eobmax);
// now process the rest of the ac coeffs
update_fp_values(&v_round, &v_quant, &v_dequant);
- for (i = 8; i < count; i += 8) {
+ for (i = 8; i < n_coeffs; i += 8) {
quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr + i, iscan + i,
qcoeff_ptr + i, dqcoeff_ptr + i, &v_eobmax);
}
@@ -186,23 +185,22 @@ static VPX_FORCE_INLINE void quantize_fp_32x32_8(
*v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask);
}
-void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
- const int16_t *round_ptr,
- const int16_t *quant_ptr,
+void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const struct macroblock_plane *mb_plane,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
+ const struct ScanOrder *const scan_order) {
int16x8_t eob_max = vdupq_n_s16(-1);
// ROUND_POWER_OF_TWO(round_ptr[], 1)
- int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1);
- int16x8_t quant = vld1q_s16(quant_ptr);
+ int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round_fp), 1);
+ int16x8_t quant = vld1q_s16(mb_plane->quant_fp);
int16x8_t dequant = vld1q_s16(dequant_ptr);
// dequant >> 2 is used similar to zbin as a threshold.
int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2);
int i;
+ const int16_t *iscan = scan_order->iscan;
- (void)scan;
- (void)count;
+ (void)n_coeffs;
// Process dc and the first seven ac coeffs.
quantize_fp_32x32_8(&round, &quant, &dequant, &dequant_thresh, coeff_ptr,
@@ -258,23 +256,21 @@ highbd_quantize_fp_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
}
void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *round_ptr,
- const int16_t *quant_ptr,
+ const struct macroblock_plane *mb_plane,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
+ const struct ScanOrder *const scan_order) {
const int16x4_t v_zero = vdup_n_s16(0);
- const int16x4_t v_quant = vld1_s16(quant_ptr);
+ const int16x4_t v_quant = vld1_s16(mb_plane->quant_fp);
const int16x4_t v_dequant = vld1_s16(dequant_ptr);
- const int16x4_t v_round = vld1_s16(round_ptr);
+ const int16x4_t v_round = vld1_s16(mb_plane->round_fp);
int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero);
int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15);
int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero);
uint16x4_t v_mask_lo, v_mask_hi;
int16x8_t v_eobmax = vdupq_n_s16(-1);
-
- (void)scan;
+ const int16_t *iscan = scan_order->iscan;
// DC and first 3 AC
v_mask_lo = highbd_quantize_fp_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
@@ -349,22 +345,21 @@ highbd_quantize_fp_32x32_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
}
void vp9_highbd_quantize_fp_32x32_neon(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr,
- const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
- const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan,
- const int16_t *iscan) {
- const int16x4_t v_quant = vld1_s16(quant_ptr);
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const struct ScanOrder *const scan_order) {
+ const int16x4_t v_quant = vld1_s16(mb_plane->quant_fp);
const int16x4_t v_dequant = vld1_s16(dequant_ptr);
const int16x4_t v_zero = vdup_n_s16(0);
const int16x4_t v_round =
- vqrdmulh_n_s16(vld1_s16(round_ptr), (int16_t)(1 << 14));
+ vqrdmulh_n_s16(vld1_s16(mb_plane->round_fp), (int16_t)(1 << 14));
int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero);
int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15);
int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero);
uint16x4_t v_mask_lo, v_mask_hi;
int16x8_t v_eobmax = vdupq_n_s16(-1);
-
- (void)scan;
+ const int16_t *iscan = scan_order->iscan;
// DC and first 3 AC
v_mask_lo =