Diffstat (limited to 'vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c')
-rw-r--r--  vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c  | 60
1 file changed, 17 insertions(+), 43 deletions(-)
diff --git a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c
index 33753f77b..b82b3f9db 100644
--- a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c
+++ b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c
@@ -30,30 +30,6 @@ static INLINE int_mv pack_int_mv(int16_t row, int16_t col) {
return result;
}
-static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) {
- // This is simplified from the C implementation to utilise that
- // x->nmvjointsadcost[1] == x->nmvjointsadcost[2] and
- // x->nmvjointsadcost[1] == x->nmvjointsadcost[3]
- return mv.as_int == 0 ? 0 : 1;
-}
-
-static INLINE int mv_cost(const int_mv mv, const int *joint_cost,
- int *const comp_cost[2]) {
- assert(mv.as_mv.row >= -MV_MAX && mv.as_mv.row < MV_MAX);
- assert(mv.as_mv.col >= -MV_MAX && mv.as_mv.col < MV_MAX);
- return joint_cost[get_mv_joint(mv)] + comp_cost[0][mv.as_mv.row] +
- comp_cost[1][mv.as_mv.col];
-}
-
-static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref,
- int sad_per_bit) {
- const int_mv diff =
- pack_int_mv(mv.as_mv.row - ref->row, mv.as_mv.col - ref->col);
- return ROUND_POWER_OF_TWO(
- (unsigned)mv_cost(diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit,
- VP9_PROB_COST_SHIFT);
-}
-
/*****************************************************************************
* This function utilizes 3 properties of the cost function lookup tables, *
* constructed using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in          *
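
Note: the three deleted helpers are the cost model the kernel previously applied to its own starting position. The reason `get_mv_joint` could collapse to a zero/nonzero test is exactly the table property asserted later in the kernel (`nmvjointsadcost[1] == nmvjointsadcost[2] == nmvjointsadcost[3]`). A minimal sketch of that simplification, assuming only those asserted properties:

    // Sketch, not library code: with cost[1] == cost[2] == cost[3], the
    // exact joint type (horizontal / vertical / diagonal) never changes the
    // lookup; only "both components zero" vs "anything else" does. Packing
    // row and col into one 32-bit word makes that a single compare.
    static inline int joint_sad_cost(int_mv mv, const int *joint_cost) {
      return joint_cost[mv.as_int == 0 ? 0 : 1];
    }
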
@@ -71,8 +47,9 @@ static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref,
*****************************************************************************/
int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
const search_site_config *cfg, MV *ref_mv,
- MV *best_mv, int search_param, int sad_per_bit,
- int *num00, const vp9_variance_fn_ptr_t *fn_ptr,
+ uint32_t start_mv_sad, MV *best_mv,
+ int search_param, int sad_per_bit, int *num00,
+ const vp9_sad_fn_ptr_t *sad_fn_ptr,
const MV *center_mv) {
static const uint32_t data[4] = { 0, 1, 2, 3 };
const uint32x4_t v_idx_d = vld1q_u32((const uint32_t *)data);
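
With the new signature, the starting-position cost is an input rather than something the kernel derives itself. A hypothetical caller sketch, reusing the names of the helpers removed above (the real call site in vp9_mcomp.c may differ):

    // Hypothetical: compute the SAD (+ MV cost) of the starting position
    // the same way the removed lines below used to, then pass it in.
    // 'what', 'in_what', 'bmv' and 'fcenter_mv' are illustrative names
    // taken from this file, not from the actual caller.
    const uint32_t start_mv_sad =
        sad_fn_ptr->sdf(what, what_stride, in_what, in_what_stride) +
        mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit);
    vp9_diamond_search_sad_neon(x, cfg, ref_mv, start_mv_sad, best_mv,
                                search_param, sad_per_bit, &num00,
                                sad_fn_ptr, center_mv);
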
@@ -101,8 +78,8 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
pack_int_mv(center_mv->row >> 3, center_mv->col >> 3);
const int16x8_t vfcmv = vreinterpretq_s16_s32(vdupq_n_s32(fcenter_mv.as_int));
- const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row);
- const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col);
+ const int ref_row = ref_mv->row;
+ const int ref_col = ref_mv->col;
int_mv bmv = pack_int_mv(ref_row, ref_col);
int_mv new_bmv = bmv;
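
Dropping the clamp shifts a precondition to the caller: ref_mv must already lie inside the search range, which is consistent with the caller pre-computing start_mv_sad for exactly this position. An assumed (not in the patch) way to make that precondition explicit, using the minmv/maxmv bounds this function already builds:

    // Assumed precondition after this change: the caller clamps first.
    assert(ref_mv->row >= minmv.as_mv.row && ref_mv->row <= maxmv.as_mv.row);
    assert(ref_mv->col >= minmv.as_mv.col && ref_mv->col <= maxmv.as_mv.col);
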
@@ -117,12 +94,13 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
// Work out the start point for the search
const uint8_t *best_address = in_what;
const uint8_t *new_best_address = best_address;
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
int64x2_t v_ba_q = vdupq_n_s64((intptr_t)best_address);
#else
int32x4_t v_ba_d = vdupq_n_s32((intptr_t)best_address);
#endif
- unsigned int best_sad = INT_MAX;
+ // Starting position
+ unsigned int best_sad = start_mv_sad;
int i, j, step;
// Check the prerequisite cost function properties that are easy to check
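
The guard swap, repeated throughout this patch, is not cosmetic: __aarch64__ is a compiler-provided define and needs defined(), whereas VPX_ARCH_AARCH64 is emitted by libvpx's configure step into vpx_config.h and is always defined, as 0 or 1, so a plain #if is the idiomatic test. Roughly:

    /* vpx_config.h (generated; illustrative values for an arm64 build) */
    #define VPX_ARCH_ARM 1
    #define VPX_ARCH_AARCH64 1   /* 0 on 32-bit ARM builds */
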
@@ -131,10 +109,6 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);
- // Check the starting position
- best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride);
- best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit);
-
*num00 = 0;
for (i = 0, step = 0; step < tot_steps; step++) {
@@ -143,7 +117,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
int8x16_t v_inside_d;
uint32x4_t v_outside_d;
int32x4_t v_cost_d, v_sad_d;
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
int64x2_t v_blocka[2];
#else
int32x4_t v_blocka[1];
@@ -164,7 +138,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
vreinterpretq_s32_s16(v_these_mv_w)));
// If none of them are inside, then move on
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
horiz_max = vmaxvq_u32(vreinterpretq_u32_s8(v_inside_d));
#else
horiz_max_0 = vmax_u32(vget_low_u32(vreinterpretq_u32_s8(v_inside_d)),
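
The across-lanes reduction shown here is the standard split between the two ISAs: AArch64 has single-instruction reductions (UMAXV, via vmaxvq_u32), while AArch32 NEON must fold pairwise. A self-contained sketch of the idiom (the file's own version continues in lines outside this hunk):

    #include <arm_neon.h>
    static inline uint32_t horiz_max_u32(uint32x4_t v) {
    #if VPX_ARCH_AARCH64
      return vmaxvq_u32(v);  // one across-vector UMAXV
    #else
      // Fold 4 lanes -> 2 with a vector max, then a pairwise max -> 1.
      uint32x2_t m = vmax_u32(vget_low_u32(v), vget_high_u32(v));
      m = vpmax_u32(m, m);
      return vget_lane_u32(m, 0);
    #endif
    }
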
@@ -193,7 +167,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
// Compute the SIMD pointer offsets.
{
-#if defined(__aarch64__) // sizeof(intptr_t) == 8
+#if VPX_ARCH_AARCH64 // sizeof(intptr_t) == 8
// Load the offsets
int64x2_t v_bo10_q = vld1q_s64((const int64_t *)&ss_os[i + 0]);
int64x2_t v_bo32_q = vld1q_s64((const int64_t *)&ss_os[i + 2]);
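
On AArch64 the four candidate block addresses are materialised in two 64-bit-lane vectors: the loads above fetch the search-site byte offsets, and in lines beyond this hunk they are masked by v_inside_d and added to the broadcast base pointer. An assumed sketch of that continuation:

    // Assumed continuation (outside the shown hunk): base + masked offsets.
    v_blocka[0] = vaddq_s64(v_ba_q, v_bo10_q);  // addresses for lanes 0, 1
    v_blocka[1] = vaddq_s64(v_ba_q, v_bo32_q);  // addresses for lanes 2, 3
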
@@ -214,13 +188,13 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
#endif
}
- fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0],
- in_what_stride, (uint32_t *)&v_sad_d);
+ sad_fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0],
+ in_what_stride, (uint32_t *)&v_sad_d);
// Look up the component cost of the residual motion vector
{
uint32_t cost[4];
- int16_t __attribute__((aligned(16))) rowcol[8];
+ DECLARE_ALIGNED(16, int16_t, rowcol[8]);
vst1q_s16(rowcol, v_diff_mv_w);
// Note: This is a use case for gather instruction
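
DECLARE_ALIGNED is libvpx's portable alignment wrapper from vpx_ports/mem.h; the direct __attribute__ it replaces is GCC/Clang-only and would break MSVC builds. Its definition is roughly:

    /* vpx_ports/mem.h (abridged) */
    #if (defined(__GNUC__) && __GNUC__) || defined(__SUNPRO_C)
    #define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
    #elif defined(_MSC_VER)
    #define DECLARE_ALIGNED(n, typ, val) __declspec(align(n)) typ val
    #else
    #define DECLARE_ALIGNED(n, typ, val) typ val
    #endif
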
@@ -260,7 +234,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
// Find the minimum value and index horizontally in v_sad_d
{
uint32_t local_best_sad;
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
local_best_sad = vminvq_u32(vreinterpretq_u32_s32(v_sad_d));
#else
uint32x2_t horiz_min_0 =
@@ -282,7 +256,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
uint32x4_t v_mask_d = vandq_u32(v_sel_d, v_idx_d);
v_mask_d = vbslq_u32(v_sel_d, v_mask_d, vdupq_n_u32(0xffffffff));
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
local_best_idx = vminvq_u32(v_mask_d);
#else
horiz_min_0 =
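
The mask dance above is a lane argmin: lanes equal to the best SAD keep their index from v_idx_d, every other lane becomes 0xffffffff, and the horizontal minimum then yields the first matching lane. A self-contained, slightly simplified sketch (the file masks with vandq_u32 before the select, which is equivalent for all-ones/all-zeros selectors):

    #include <arm_neon.h>
    static inline uint32_t argmin_lane_u32(uint32x4_t v_sad, uint32_t best) {
      static const uint32_t idx[4] = { 0, 1, 2, 3 };
      const uint32x4_t v_idx = vld1q_u32(idx);
      // All-ones in lanes that hit the best SAD, all-zeros elsewhere.
      const uint32x4_t v_sel = vceqq_u32(v_sad, vdupq_n_u32(best));
      // Selected lanes keep their index; the rest saturate to UINT32_MAX.
      const uint32x4_t v_mask =
          vbslq_u32(v_sel, v_idx, vdupq_n_u32(0xffffffff));
    #if VPX_ARCH_AARCH64
      return vminvq_u32(v_mask);
    #else
      uint32x2_t m = vmin_u32(vget_low_u32(v_mask), vget_high_u32(v_mask));
      m = vpmin_u32(m, m);
      return vget_lane_u32(m, 0);
    #endif
    }
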
@@ -306,7 +280,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
best_address = new_best_address;
v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int));
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
v_ba_q = vdupq_n_s64((intptr_t)best_address);
#else
v_ba_d = vdupq_n_s32((intptr_t)best_address);