diff options
Diffstat (limited to 'vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c')
-rw-r--r-- | vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c | 60 |
1 files changed, 17 insertions, 43 deletions
diff --git a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c index 33753f77b..b82b3f9db 100644 --- a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c +++ b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c @@ -30,30 +30,6 @@ static INLINE int_mv pack_int_mv(int16_t row, int16_t col) { return result; } -static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) { - // This is simplified from the C implementation to utilise that - // x->nmvjointsadcost[1] == x->nmvjointsadcost[2] and - // x->nmvjointsadcost[1] == x->nmvjointsadcost[3] - return mv.as_int == 0 ? 0 : 1; -} - -static INLINE int mv_cost(const int_mv mv, const int *joint_cost, - int *const comp_cost[2]) { - assert(mv.as_mv.row >= -MV_MAX && mv.as_mv.row < MV_MAX); - assert(mv.as_mv.col >= -MV_MAX && mv.as_mv.col < MV_MAX); - return joint_cost[get_mv_joint(mv)] + comp_cost[0][mv.as_mv.row] + - comp_cost[1][mv.as_mv.col]; -} - -static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref, - int sad_per_bit) { - const int_mv diff = - pack_int_mv(mv.as_mv.row - ref->row, mv.as_mv.col - ref->col); - return ROUND_POWER_OF_TWO( - (unsigned)mv_cost(diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, - VP9_PROB_COST_SHIFT); -} - /***************************************************************************** * This function utilizes 3 properties of the cost function lookup tables, * * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in * @@ -71,8 +47,9 @@ static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref, *****************************************************************************/ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, - MV *best_mv, int search_param, int sad_per_bit, - int *num00, const vp9_variance_fn_ptr_t *fn_ptr, + uint32_t start_mv_sad, MV *best_mv, + int search_param, int sad_per_bit, int *num00, + const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv) { static const uint32_t data[4] = { 0, 1, 2, 3 }; const uint32x4_t v_idx_d = vld1q_u32((const uint32_t *)data); @@ -101,8 +78,8 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); const int16x8_t vfcmv = vreinterpretq_s16_s32(vdupq_n_s32(fcenter_mv.as_int)); - const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); - const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); + const int ref_row = ref_mv->row; + const int ref_col = ref_mv->col; int_mv bmv = pack_int_mv(ref_row, ref_col); int_mv new_bmv = bmv; @@ -117,12 +94,13 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, // Work out the start point for the search const uint8_t *best_address = in_what; const uint8_t *new_best_address = best_address; -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 int64x2_t v_ba_q = vdupq_n_s64((intptr_t)best_address); #else int32x4_t v_ba_d = vdupq_n_s32((intptr_t)best_address); #endif - unsigned int best_sad = INT_MAX; + // Starting position + unsigned int best_sad = start_mv_sad; int i, j, step; // Check the prerequisite cost function properties that are easy to check @@ -131,10 +109,6 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]); assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]); - // Check the starting position - best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride); - best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit); - *num00 = 0; for (i = 0, step = 0; step < tot_steps; step++) { @@ -143,7 +117,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, int8x16_t v_inside_d; uint32x4_t v_outside_d; int32x4_t v_cost_d, v_sad_d; -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 int64x2_t v_blocka[2]; #else int32x4_t v_blocka[1]; @@ -164,7 +138,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, vreinterpretq_s32_s16(v_these_mv_w))); // If none of them are inside, then move on -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 horiz_max = vmaxvq_u32(vreinterpretq_u32_s8(v_inside_d)); #else horiz_max_0 = vmax_u32(vget_low_u32(vreinterpretq_u32_s8(v_inside_d)), @@ -193,7 +167,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, // Compute the SIMD pointer offsets. { -#if defined(__aarch64__) // sizeof(intptr_t) == 8 +#if VPX_ARCH_AARCH64 // sizeof(intptr_t) == 8 // Load the offsets int64x2_t v_bo10_q = vld1q_s64((const int64_t *)&ss_os[i + 0]); int64x2_t v_bo32_q = vld1q_s64((const int64_t *)&ss_os[i + 2]); @@ -214,13 +188,13 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, #endif } - fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], - in_what_stride, (uint32_t *)&v_sad_d); + sad_fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], + in_what_stride, (uint32_t *)&v_sad_d); // Look up the component cost of the residual motion vector { uint32_t cost[4]; - int16_t __attribute__((aligned(16))) rowcol[8]; + DECLARE_ALIGNED(16, int16_t, rowcol[8]); vst1q_s16(rowcol, v_diff_mv_w); // Note: This is a use case for gather instruction @@ -260,7 +234,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, // Find the minimum value and index horizontally in v_sad_d { uint32_t local_best_sad; -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 local_best_sad = vminvq_u32(vreinterpretq_u32_s32(v_sad_d)); #else uint32x2_t horiz_min_0 = @@ -282,7 +256,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, uint32x4_t v_mask_d = vandq_u32(v_sel_d, v_idx_d); v_mask_d = vbslq_u32(v_sel_d, v_mask_d, vdupq_n_u32(0xffffffff)); -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 local_best_idx = vminvq_u32(v_mask_d); #else horiz_min_0 = @@ -306,7 +280,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, best_address = new_best_address; v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int)); -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 v_ba_q = vdupq_n_s64((intptr_t)best_address); #else v_ba_d = vdupq_n_s32((intptr_t)best_address); |