aboutsummaryrefslogtreecommitdiff
path: root/vpx_dsp/arm/fdct_neon.h
diff options
context:
space:
mode:
Diffstat (limited to 'vpx_dsp/arm/fdct_neon.h')
-rw-r--r--vpx_dsp/arm/fdct_neon.h119
1 files changed, 119 insertions, 0 deletions
diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h
index 193594e3d..16f5c5fc0 100644
--- a/vpx_dsp/arm/fdct_neon.h
+++ b/vpx_dsp/arm/fdct_neon.h
@@ -177,6 +177,45 @@ static INLINE void butterfly_one_coeff_s32_fast(
*sub_hi = vqrdmulhq_s32(vsubq_s32(a_hi, b_hi), c);
}
+// Computes fdct_round_shift((a +/- b) * c) on a full vector (lo/hi halves).
+// More accurate variant: the products are formed in 64 bits, so it takes
+// 32-bit values and returns 32-bit values narrowed from the 64-bit results
+// with a rounding right shift by DCT_CONST_BITS.
+static INLINE void butterfly_one_coeff_s32_s64_narrow(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo,
+ int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ // ac holds the 64-bit products a * c, two lanes per array element:
+ // ac: vget_low_s32(a_lo) * c, vget_high_s32(a_lo) * c,
+ // vget_low_s32(a_hi) * c, vget_high_s32(a_hi) * c
+ int64x2_t ac[4];
+ int64x2_t sum[4];  // a * c + b * c, i.e. (a + b) * c
+ int64x2_t diff[4];  // a * c - b * c, i.e. (a - b) * c
+
+ ac[0] = vmull_n_s32(vget_low_s32(a_lo), constant);
+ ac[1] = vmull_n_s32(vget_high_s32(a_lo), constant);
+ ac[2] = vmull_n_s32(vget_low_s32(a_hi), constant);
+ ac[3] = vmull_n_s32(vget_high_s32(a_hi), constant);
+
+ sum[0] = vmlal_n_s32(ac[0], vget_low_s32(b_lo), constant);
+ sum[1] = vmlal_n_s32(ac[1], vget_high_s32(b_lo), constant);
+ sum[2] = vmlal_n_s32(ac[2], vget_low_s32(b_hi), constant);
+ sum[3] = vmlal_n_s32(ac[3], vget_high_s32(b_hi), constant);
+ *add_lo = vcombine_s32(vrshrn_n_s64(sum[0], DCT_CONST_BITS),
+ vrshrn_n_s64(sum[1], DCT_CONST_BITS));
+ *add_hi = vcombine_s32(vrshrn_n_s64(sum[2], DCT_CONST_BITS),
+ vrshrn_n_s64(sum[3], DCT_CONST_BITS));
+
+ diff[0] = vmlsl_n_s32(ac[0], vget_low_s32(b_lo), constant);
+ diff[1] = vmlsl_n_s32(ac[1], vget_high_s32(b_lo), constant);
+ diff[2] = vmlsl_n_s32(ac[2], vget_low_s32(b_hi), constant);
+ diff[3] = vmlsl_n_s32(ac[3], vget_high_s32(b_hi), constant);
+ *sub_lo = vcombine_s32(vrshrn_n_s64(diff[0], DCT_CONST_BITS),
+ vrshrn_n_s64(diff[1], DCT_CONST_BITS));
+ *sub_hi = vcombine_s32(vrshrn_n_s64(diff[2], DCT_CONST_BITS),
+ vrshrn_n_s64(diff[3], DCT_CONST_BITS));
+}
+
// fdct_round_shift(a * c1 +/- b * c2)
// Variant that performs normal implementation on half vector
// more accurate does 64-bit processing, takes and returns 32-bit values
@@ -207,6 +246,44 @@ static INLINE void butterfly_two_coeff_s32_s64_narrow_half(
// fdct_round_shift(a * c1 +/- b * c2)
// Variant that performs normal implementation on full vector
+// more accurate: does 64-bit processing; takes 32-bit values and returns
+// 64-bit a * c1 + b * c2 and a * c2 - b * c1, neither rounded nor narrowed
+static INLINE void butterfly_two_coeff_s32_s64_noround(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant1,
+ const tran_coef_t constant2, int64x2_t *add_lo /*[2]*/,
+ int64x2_t *add_hi /*[2]*/, int64x2_t *sub_lo /*[2]*/,
+ int64x2_t *sub_hi /*[2]*/) {
+ // ac1/ac2 hold the 64-bit products of a with constant1 (c1)/constant2 (c2):
+ // ac1: vget_low_s32(a_lo) * c1, vget_high_s32(a_lo) * c1,
+ // vget_low_s32(a_hi) * c1, vget_high_s32(a_hi) * c1
+ // ac2: vget_low_s32(a_lo) * c2, vget_high_s32(a_lo) * c2,
+ // vget_low_s32(a_hi) * c2, vget_high_s32(a_hi) * c2
+ int64x2_t ac1[4];  // accumulator base for the add_* outputs
+ int64x2_t ac2[4];  // accumulator base for the sub_* outputs
+
+ ac1[0] = vmull_n_s32(vget_low_s32(a_lo), constant1);
+ ac1[1] = vmull_n_s32(vget_high_s32(a_lo), constant1);
+ ac1[2] = vmull_n_s32(vget_low_s32(a_hi), constant1);
+ ac1[3] = vmull_n_s32(vget_high_s32(a_hi), constant1);
+ ac2[0] = vmull_n_s32(vget_low_s32(a_lo), constant2);
+ ac2[1] = vmull_n_s32(vget_high_s32(a_lo), constant2);
+ ac2[2] = vmull_n_s32(vget_low_s32(a_hi), constant2);
+ ac2[3] = vmull_n_s32(vget_high_s32(a_hi), constant2);
+
+ add_lo[0] = vmlal_n_s32(ac1[0], vget_low_s32(b_lo), constant2);
+ add_lo[1] = vmlal_n_s32(ac1[1], vget_high_s32(b_lo), constant2);
+ add_hi[0] = vmlal_n_s32(ac1[2], vget_low_s32(b_hi), constant2);
+ add_hi[1] = vmlal_n_s32(ac1[3], vget_high_s32(b_hi), constant2);
+
+ sub_lo[0] = vmlsl_n_s32(ac2[0], vget_low_s32(b_lo), constant1);
+ sub_lo[1] = vmlsl_n_s32(ac2[1], vget_high_s32(b_lo), constant1);
+ sub_hi[0] = vmlsl_n_s32(ac2[2], vget_low_s32(b_hi), constant1);
+ sub_hi[1] = vmlsl_n_s32(ac2[3], vget_high_s32(b_hi), constant1);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs normal implementation on full vector
// more accurate does 64-bit processing, takes and returns 32-bit values
// returns narrowed results
static INLINE void butterfly_two_coeff_s32_s64_narrow(
@@ -420,4 +497,46 @@ static INLINE int32x4_t sub_round_shift_s32(const int32x4_t a) {
return vrshrq_n_s32(vsubq_s32(a, a_sign_s32), 2);
}
+static INLINE int32x4_t add_s64_round_narrow(const int64x2_t *a /*[2]*/,
+ const int64x2_t *b /*[2]*/) {  // a + b, round-shifted to s32
+ int64x2_t result[2];  // 64-bit sums: [0] -> lanes 0-1, [1] -> lanes 2-3
+ result[0] = vaddq_s64(a[0], b[0]);
+ result[1] = vaddq_s64(a[1], b[1]);
+ // Rounding right shift by DCT_CONST_BITS, narrowing each lane to 32 bits.
+ return vcombine_s32(vrshrn_n_s64(result[0], DCT_CONST_BITS),
+ vrshrn_n_s64(result[1], DCT_CONST_BITS));
+}
+
+static INLINE int32x4_t sub_s64_round_narrow(const int64x2_t *a /*[2]*/,
+ const int64x2_t *b /*[2]*/) {  // a - b, round-shifted to s32
+ int64x2_t result[2];  // 64-bit differences: [0] -> lanes 0-1, [1] -> 2-3
+ result[0] = vsubq_s64(a[0], b[0]);
+ result[1] = vsubq_s64(a[1], b[1]);
+ // Rounding right shift by DCT_CONST_BITS, narrowing each lane to 32 bits.
+ return vcombine_s32(vrshrn_n_s64(result[0], DCT_CONST_BITS),
+ vrshrn_n_s64(result[1], DCT_CONST_BITS));
+}
+
+static INLINE int32x4_t add_s32_s64_narrow(const int32x4_t a,
+ const int32x4_t b) {  // a + b computed in 64-bit lanes
+ int64x2_t a64[2], b64[2], result[2];  // [0] = low half, [1] = high half
+ a64[0] = vmovl_s32(vget_low_s32(a));  // sign-extend each lane to 64 bits
+ a64[1] = vmovl_s32(vget_high_s32(a));
+ b64[0] = vmovl_s32(vget_low_s32(b));
+ b64[1] = vmovl_s32(vget_high_s32(b));
+ result[0] = vaddq_s64(a64[0], b64[0]);
+ result[1] = vaddq_s64(a64[1], b64[1]);
+ // vmovn_s64 keeps the low 32 bits of each lane: no rounding or saturation.
+ return vcombine_s32(vmovn_s64(result[0]), vmovn_s64(result[1]));
+}
+
+static INLINE int32x4_t sub_s32_s64_narrow(const int32x4_t a,
+ const int32x4_t b) {  // a - b computed in 64-bit lanes
+ int64x2_t a64[2], b64[2], result[2];  // [0] = low half, [1] = high half
+ a64[0] = vmovl_s32(vget_low_s32(a));  // sign-extend each lane to 64 bits
+ a64[1] = vmovl_s32(vget_high_s32(a));
+ b64[0] = vmovl_s32(vget_low_s32(b));
+ b64[1] = vmovl_s32(vget_high_s32(b));
+ result[0] = vsubq_s64(a64[0], b64[0]);
+ result[1] = vsubq_s64(a64[1], b64[1]);
+ // vmovn_s64 keeps the low 32 bits of each lane: no rounding or saturation.
+ return vcombine_s32(vmovn_s64(result[0]), vmovn_s64(result[1]));
+}
+
#endif // VPX_VPX_DSP_ARM_FDCT_NEON_H_