/******************************************************************************
 *
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */

/**
 ******************************************************************************
 * @file hme_subpel.c
 *
 * @brief
 *    Subpel refinement modules for ME algo
 *
 * @author
 *    Ittiam
 *
 * List of Functions
 *    hme_qpel_interp_avg()
 *    hme_subpel_refine_ctblist_bck()
 *    hme_subpel_refine_ctblist_fwd()
 *    hme_refine_bidirect()
 *    hme_subpel_refinement()
 *    hme_subpel_refine_ctb_fwd()
 *    hme_subpel_refine_ctb_bck()
 *    hme_create_bck_inp()
 *    hme_subpel_refine_search_node()
 ******************************************************************************
 */

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

/* System include files */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include <stdarg.h>
#include <math.h>
#include <limits.h>

/* User include files */
#include "ihevc_typedefs.h"
#include "itt_video_api.h"
#include "ihevce_api.h"
#include "rc_cntrl_param.h"
#include "rc_frame_info_collector.h"
#include "rc_look_ahead_params.h"
#include "ihevc_defs.h"
#include "ihevc_structs.h"
#include "ihevc_platform_macros.h"
#include "ihevc_deblk.h"
#include "ihevc_itrans_recon.h"
#include "ihevc_chroma_itrans_recon.h"
#include "ihevc_chroma_intra_pred.h"
#include "ihevc_intra_pred.h"
#include "ihevc_inter_pred.h"
#include "ihevc_mem_fns.h"
#include "ihevc_padding.h"
#include "ihevc_weighted_pred.h"
#include "ihevc_sao.h"
#include "ihevc_resi_trans.h"
#include "ihevc_quant_iquant_ssd.h"
#include "ihevc_cabac_tables.h"
#include "ihevce_defs.h"
#include "ihevce_lap_enc_structs.h"
#include "ihevce_multi_thrd_structs.h"
#include "ihevce_multi_thrd_funcs.h"
#include "ihevce_me_common_defs.h"
#include "ihevce_had_satd.h"
#include "ihevce_error_codes.h"
#include "ihevce_bitstream.h"
#include "ihevce_cabac.h"
#include "ihevce_rdoq_macros.h"
#include "ihevce_function_selector.h"
#include "ihevce_enc_structs.h"
#include "ihevce_entropy_structs.h"
#include "ihevce_cmn_utils_instr_set_router.h"
#include "ihevce_enc_loop_structs.h"
#include "ihevce_bs_compute_ctb.h"
#include "ihevce_global_tables.h"
#include "ihevce_dep_mngr_interface.h"
#include "hme_datatype.h"
#include "hme_interface.h"
#include "hme_common_defs.h"
#include "hme_defs.h"
#include "ihevce_me_instr_set_router.h"
#include "hme_globals.h"
#include "hme_utils.h"
#include "hme_coarse.h"
#include "hme_fullpel.h"
#include "hme_subpel.h"
#include "hme_refine.h"
#include "hme_err_compute.h"
#include "hme_common_utils.h"
#include "hme_search_algo.h"
#include "ihevce_stasino_helpers.h"
#include "ihevce_common_utils.h"

/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/
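/*
 * Illustrative sketch, kept out of the build with #if 0: how a motion vector
 * in quarter-pel (QPEL) units is split by the functions below. The fractional
 * part (mv & 3) selects which of the four planes (fxfy = buf id 0, hxfy = 1,
 * fxhy = 2, hxhy = 3) supply the one or two source samples, via the lookup
 * table gas_qpel_inp_buf_cfg; the integer part (mv >> 2) becomes a pixel
 * offset applied identically in every plane. The helper name below is local
 * to this sketch and not part of the encoder.
 */
#if 0
static void example_qpel_mv_decompose(S32 i4_mv_x, S32 i4_mv_y, S32 i4_ref_stride)
{
    S32 i4_frac_x = i4_mv_x & 3; /* 0 => full-pel, 2 => half-pel, 1/3 => quarter-pel */
    S32 i4_frac_y = i4_mv_y & 3;
    S32 i4_offset = (i4_mv_x >> 2) + (i4_mv_y >> 2) * i4_ref_stride;

    /* E.g. mv = (5, 7): frac = (1, 3), offset = 1 + 1 * stride. The QPEL pt
       is then the average of the two plane samples that
       gas_qpel_inp_buf_cfg[3][1] points at, each read at i4_offset plus its
       own per-plane x/y offset. */
    (void)i4_frac_x;
    (void)i4_frac_y;
    (void)i4_offset;
}
#endif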
void hme_qpel_interp_avg(interp_prms_t *ps_prms, S32 i4_mv_x, S32 i4_mv_y, S32 i4_buf_id)
{
    U08 *pu1_src1, *pu1_src2, *pu1_dst;
    qpel_input_buf_cfg_t *ps_inp_cfg;
    S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;

    /*************************************************************************/
    /* For a given QPEL pt, we need to determine the 2 source pts that are   */
    /* needed to do the QPEL averaging. The logic to do this is as follows   */
    /* i4_mv_x and i4_mv_y are the motion vectors in QPEL units that are     */
    /* pointing to the pt of interest. Obviously, they are w.r.t. the 0,0    */
    /* pt of the reference blk that is colocated to the inp blk.             */
    /*     A j E k B                                                         */
    /*     l m n o p                                                         */
    /*     F q G r H                                                         */
    /*     s t u v w                                                         */
    /*     C x I y D                                                         */
    /* In above diagram, A, B, C, D are full pts at offsets (0,0),(1,0),(0,1)*/
    /* and (1,1) respectively in the fpel buffer (id = 0)                    */
    /* E and I are hxfy pts at offsets (0,0),(0,1) respectively in hxfy buf  */
    /* F and H are fxhy pts at offsets (0,0),(1,0) respectively in fxhy buf  */
    /* G is hxhy pt at offset 0,0 in hxhy buf                                */
    /* All above offsets are computed w.r.t. motion displaced pt in          */
    /* respective bufs. This means that A corresponds to (i4_mv_x >> 2) and  */
    /* (i4_mv_y >> 2) in fxfy buf. Ditto with E, F and G                     */
    /* fxfy buf is buf id 0, hxfy is buf id 1, fxhy is buf id 2, hxhy is 3   */
    /* If we consider pt v to be derived: v has a fractional comp of 3, 3    */
    /* v is avg of H and I. So the table look up of v should give following  */
    /* buf 1 (H) : offset = (1, 0) buf id = 2.                               */
    /* buf 2 (I) : offset = (0, 1) buf id = 1.                               */
    /* NOTE: For pts that are fxfy/hxfy/fxhy/hxhy, bufid 1 will be -1.       */
    /*************************************************************************/
    i4_mv_x_frac = i4_mv_x & 3;
    i4_mv_y_frac = i4_mv_y & 3;

    i4_offset = (i4_mv_x >> 2) + (i4_mv_y >> 2) * ps_prms->i4_ref_stride;

    /* Derive the descriptor that has all offset and size info */
    ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];

    if(ps_inp_cfg->i1_buf_id1 == ps_inp_cfg->i1_buf_id2)
    {
        /* This is case for fxfy/hxfy/fxhy/hxhy */
        ps_prms->pu1_final_out = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
        ps_prms->pu1_final_out += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
        ps_prms->pu1_final_out += (ps_inp_cfg->i1_buf_yoff1 * ps_prms->i4_ref_stride);
        ps_prms->i4_final_out_stride = ps_prms->i4_ref_stride;

        return;
    }

    pu1_src1 = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
    pu1_src1 += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
    pu1_src1 += (ps_inp_cfg->i1_buf_yoff1 * ps_prms->i4_ref_stride);

    pu1_src2 = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id2];
    pu1_src2 += ps_inp_cfg->i1_buf_xoff2 + i4_offset;
    pu1_src2 += (ps_inp_cfg->i1_buf_yoff2 * ps_prms->i4_ref_stride);

    pu1_dst = ps_prms->apu1_interp_out[i4_buf_id];

    hevc_avg_2d(
        pu1_src1,
        pu1_src2,
        ps_prms->i4_ref_stride,
        ps_prms->i4_ref_stride,
        ps_prms->i4_blk_wd,
        ps_prms->i4_blk_ht,
        pu1_dst,
        ps_prms->i4_out_stride);

    ps_prms->pu1_final_out = pu1_dst;
    ps_prms->i4_final_out_stride = ps_prms->i4_out_stride;
}

static __inline void hme_qpel_interp_avg_2pt_vert_no_reuse(
    interp_prms_t *ps_prms,
    S32 i4_mv_x,
    S32 i4_mv_y,
    U08 **ppu1_final,
    S32 *pi4_final_stride,
    FT_QPEL_INTERP_AVG_1PT *pf_qpel_interp_avg_1pt)
{
    pf_qpel_interp_avg_1pt(ps_prms, i4_mv_x, i4_mv_y + 1, 3, ppu1_final, pi4_final_stride);

    pf_qpel_interp_avg_1pt(ps_prms, i4_mv_x, i4_mv_y - 1, 1, ppu1_final, pi4_final_stride);
}

static __inline void hme_qpel_interp_avg_2pt_horz_no_reuse(
    interp_prms_t *ps_prms,
    S32 i4_mv_x,
    S32 i4_mv_y,
    U08 **ppu1_final,
    S32
*pi4_final_stride, FT_QPEL_INTERP_AVG_1PT *pf_qpel_interp_avg_1pt) { pf_qpel_interp_avg_1pt(ps_prms, i4_mv_x + 1, i4_mv_y, 2, ppu1_final, pi4_final_stride); pf_qpel_interp_avg_1pt(ps_prms, i4_mv_x - 1, i4_mv_y, 0, ppu1_final, pi4_final_stride); } /******************************************************************************** * @fn hme_qpel_interp_comprehensive * * @brief Interpolates 2 qpel points by hpel averaging * * @param[in,out] ps_prms: Both input buffer ptrs and location of output * * @param[in] i4_mv_x : x component of motion vector in QPEL units * * @param[in] i4_mv_y : y component of motion vector in QPEL units * * @param[in] i4_grid_mask : mask which determines qpels to be computed * * @param[out] ppu1_final : storage for final buffer pointers * * @param[out] pi4_final_stride : storage for final buffer strides * * @return None ******************************************************************************** */ static __inline void hme_qpel_interp_comprehensive( interp_prms_t *ps_prms, U08 **ppu1_final, S32 *pi4_final_stride, S32 i4_mv_x, S32 i4_mv_y, S32 i4_grid_mask, ihevce_me_optimised_function_list_t *ps_me_optimised_function_list) { S32 pt_select_for_TB, pt_select_for_LR; S32 dx, dy, dydx; S32 vert_func_selector, horz_func_selector; S32 i4_ref_stride = ps_prms->i4_ref_stride; pt_select_for_TB = ((i4_grid_mask & (1 << PT_B)) >> PT_B) + ((i4_grid_mask & (1 << PT_T)) >> (PT_T - 1)); pt_select_for_LR = ((i4_grid_mask & (1 << PT_R)) >> PT_R) + ((i4_grid_mask & (1 << PT_L)) >> (PT_L - 1)); dx = (i4_mv_x & 3); dy = (i4_mv_y & 3); dydx = (dx + (dy << 2)); vert_func_selector = gai4_select_qpel_function_vert[pt_select_for_TB][dydx]; horz_func_selector = gai4_select_qpel_function_horz[pt_select_for_LR][dydx]; /* case descriptions */ /* Let T = (gridmask & T) & B = (gridmask & B) */ /* & hp = pt is an hpel or an fpel */ /* & r = reuse possible */ /* 0 => T || B = 0 */ /* 1 => (!T) && (B) && hp */ /* 2 => (T) && (!B) && hp */ /* 3 => (!T) && (B) && !hp */ /* 4 => (T) && (!B) && !hp */ /* 5 => (T) && (B) && !hp && r */ /* 6 => (T) && (B) && !hp && !r */ /* 7 => (T) && (B) && hp */ switch(vert_func_selector) { case 0: { break; } case 1: { S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset; qpel_input_buf_cfg_t *ps_inp_cfg; S32 i4_mvyp1 = (i4_mv_y + 1); i4_mv_x_frac = dx; i4_mv_y_frac = i4_mvyp1 & 3; i4_offset = (i4_mv_x >> 2) + (i4_mvyp1 >> 2) * i4_ref_stride; /* Derive the descriptor that has all offset and size info */ ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac]; ppu1_final[3] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1]; ppu1_final[3] += ps_inp_cfg->i1_buf_xoff1 + i4_offset; ppu1_final[3] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride); pi4_final_stride[3] = i4_ref_stride; break; } case 2: { S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset; qpel_input_buf_cfg_t *ps_inp_cfg; S32 i4_mvym1 = (i4_mv_y - 1); i4_mv_x_frac = dx; i4_mv_y_frac = i4_mvym1 & 3; i4_offset = (i4_mv_x >> 2) + (i4_mvym1 >> 2) * i4_ref_stride; /* Derive the descriptor that has all offset and size info */ ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac]; ppu1_final[1] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1]; ppu1_final[1] += ps_inp_cfg->i1_buf_xoff1 + i4_offset; ppu1_final[1] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride); pi4_final_stride[1] = i4_ref_stride; break; } case 3: { ps_me_optimised_function_list->pf_qpel_interp_avg_1pt( ps_prms, i4_mv_x, i4_mv_y + 1, 3, ppu1_final, pi4_final_stride); break; } case 4: { ps_me_optimised_function_list->pf_qpel_interp_avg_1pt( ps_prms, i4_mv_x, i4_mv_y - 1, 1, 
ppu1_final, pi4_final_stride); break; } case 5: { ps_me_optimised_function_list->pf_qpel_interp_avg_2pt_vert_with_reuse( ps_prms, i4_mv_x, i4_mv_y, ppu1_final, pi4_final_stride); break; } case 6: { hme_qpel_interp_avg_2pt_vert_no_reuse( ps_prms, i4_mv_x, i4_mv_y, ppu1_final, pi4_final_stride, ps_me_optimised_function_list->pf_qpel_interp_avg_1pt); break; } case 7: { S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset; qpel_input_buf_cfg_t *ps_inp_cfg; S32 i4_mvyp1 = (i4_mv_y + 1); S32 i4_mvym1 = (i4_mv_y - 1); i4_mv_x_frac = dx; i4_mv_y_frac = i4_mvyp1 & 3; i4_offset = (i4_mv_x >> 2) + (i4_mvyp1 >> 2) * i4_ref_stride; /* Derive the descriptor that has all offset and size info */ ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac]; ppu1_final[3] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1]; ppu1_final[3] += ps_inp_cfg->i1_buf_xoff1 + i4_offset; ppu1_final[3] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride); pi4_final_stride[3] = i4_ref_stride; i4_mv_y_frac = i4_mvym1 & 3; i4_offset = (i4_mv_x >> 2) + (i4_mvym1 >> 2) * i4_ref_stride; /* Derive the descriptor that has all offset and size info */ ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac]; ppu1_final[1] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1]; ppu1_final[1] += ps_inp_cfg->i1_buf_xoff1 + i4_offset; ppu1_final[1] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride); pi4_final_stride[1] = i4_ref_stride; break; } } /* case descriptions */ /* Let L = (gridmask & L) & R = (gridmask & R) */ /* & hp = pt is an hpel or an fpel */ /* & r = reuse possible */ /* 0 => L || R = 0 */ /* 1 => (!L) && (R) && hp */ /* 2 => (L) && (!R) && hp */ /* 3 => (!L) && (R) && !hp */ /* 4 => (L) && (!R) && !hp */ /* 5 => (L) && (R) && !hp && r */ /* 6 => (L) && (R) && !hp && !r */ /* 7 => (L) && (R) && hp */ switch(horz_func_selector) { case 0: { break; } case 1: { S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset; qpel_input_buf_cfg_t *ps_inp_cfg; S32 i4_mvxp1 = (i4_mv_x + 1); i4_mv_x_frac = i4_mvxp1 & 3; i4_mv_y_frac = dy; i4_offset = (i4_mvxp1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride; /* Derive the descriptor that has all offset and size info */ ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac]; ppu1_final[2] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1]; ppu1_final[2] += ps_inp_cfg->i1_buf_xoff1 + i4_offset; ppu1_final[2] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride); pi4_final_stride[2] = i4_ref_stride; break; } case 2: { S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset; qpel_input_buf_cfg_t *ps_inp_cfg; S32 i4_mvxm1 = (i4_mv_x - 1); i4_mv_x_frac = i4_mvxm1 & 3; i4_mv_y_frac = dy; i4_offset = (i4_mvxm1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride; /* Derive the descriptor that has all offset and size info */ ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac]; ppu1_final[0] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1]; ppu1_final[0] += ps_inp_cfg->i1_buf_xoff1 + i4_offset; ppu1_final[0] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride); pi4_final_stride[0] = i4_ref_stride; break; } case 3: { ps_me_optimised_function_list->pf_qpel_interp_avg_1pt( ps_prms, i4_mv_x + 1, i4_mv_y, 2, ppu1_final, pi4_final_stride); break; } case 4: { ps_me_optimised_function_list->pf_qpel_interp_avg_1pt( ps_prms, i4_mv_x - 1, i4_mv_y, 0, ppu1_final, pi4_final_stride); break; } case 5: { ps_me_optimised_function_list->pf_qpel_interp_avg_2pt_horz_with_reuse( ps_prms, i4_mv_x, i4_mv_y, ppu1_final, pi4_final_stride); break; } case 6: { hme_qpel_interp_avg_2pt_horz_no_reuse( ps_prms, i4_mv_x, i4_mv_y, ppu1_final, pi4_final_stride, ps_me_optimised_function_list->pf_qpel_interp_avg_1pt); 
break;
    }
    case 7:
    {
        S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
        qpel_input_buf_cfg_t *ps_inp_cfg;
        S32 i4_mvxp1 = (i4_mv_x + 1);
        S32 i4_mvxm1 = (i4_mv_x - 1);

        i4_mv_x_frac = i4_mvxp1 & 3;
        i4_mv_y_frac = dy;

        i4_offset = (i4_mvxp1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;

        /* Derive the descriptor that has all offset and size info */
        ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];

        ppu1_final[2] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
        ppu1_final[2] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
        ppu1_final[2] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
        pi4_final_stride[2] = i4_ref_stride;

        i4_mv_x_frac = i4_mvxm1 & 3;

        i4_offset = (i4_mvxm1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;

        /* Derive the descriptor that has all offset and size info */
        ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];

        ppu1_final[0] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
        ppu1_final[0] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
        ppu1_final[0] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
        pi4_final_stride[0] = i4_ref_stride;

        break;
    }
    }
}

/**
********************************************************************************
*  @fn     S32 hme_compute_pred_and_evaluate_bi(hme_subpel_prms_t *ps_prms,
*                                   search_results_t *ps_search_results,
*                                   layer_ctxt_t *ps_curr_layer,
*                                   U08 **ppu1_pred)
*
*  @brief  Evaluates the best bipred cost as avg(P0, P1) where P0 and P1 are
*          best L0 and L1 bufs respectively for the entire CU
*
*  @param[in] ps_prms: subpel prms input to this function
*
*  @param[in] ps_curr_layer: points to the current layer ctxt
*
*  @return The best BI cost or best uni cost, whichever is lower
********************************************************************************
*/
void hme_compute_pred_and_evaluate_bi(
    inter_cu_results_t *ps_cu_results,
    inter_pu_results_t *ps_pu_results,
    inter_ctb_prms_t *ps_inter_ctb_prms,
    part_type_results_t *ps_part_type_result,
    ULWORD64 *pu8_winning_pred_sigmaXSquare,
    ULWORD64 *pu8_winning_pred_sigmaX,
    ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list,
    ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
{
    /* Idx0 - Uni winner */
    /* Idx1 - Uni runner-up */
    /* Idx2 - Bi winner */
    hme_pred_buf_info_t as_pred_buf_data[3][NUM_INTER_PU_PARTS];
    err_prms_t s_err_prms;
    interp_prms_t s_interp_prms;
    PF_SAD_FXN_T pf_err_compute;

    S32 i, j;
    S32 x_off, y_off, x_pic, y_pic;
    S32 i4_sad_grid;
    U08 e_cu_size;
    S32 i4_part_type;
    U08 u1_cu_size;
    S32 shift;
    S32 x_part, y_part, num_parts;
    S32 inp_stride, ref_stride;
    U08 au1_pred_buf_array_indixes[3];
    S32 cur_iter_best_cost;
    S32 uni_cost, bi_cost, best_cost, tot_cost;

    /* Idx0 - Uni winner */
    /* Idx1 - Bi winner */
    ULWORD64 au8_sigmaX[2][NUM_INTER_PU_PARTS];
    ULWORD64 au8_sigmaXSquared[2][NUM_INTER_PU_PARTS];

#if USE_NOISE_TERM_DURING_BICAND_SEARCH
    S32 i4_noise_term;
#endif
    interp_prms_t *ps_interp_prms = &s_interp_prms;

    S32 best_cand_in_opp_dir_idx = 0;
    S32 is_best_cand_an_intra = 0;

    U08 u1_is_cu_noisy = ps_inter_ctb_prms->u1_is_cu_noisy;

#if USE_NOISE_TERM_DURING_BICAND_SEARCH
    const S32 i4_default_src_wt = ((1 << 15) + (WGHT_DEFAULT >> 1)) / WGHT_DEFAULT;
#endif

    tot_cost = 0;

    /* Start of the CU w.r.t. CTB */
    x_off = ps_cu_results->u1_x_off;
    y_off = ps_cu_results->u1_y_off;

    inp_stride = ps_inter_ctb_prms->i4_inp_stride;
    ref_stride = ps_inter_ctb_prms->i4_rec_stride;

    ps_interp_prms->i4_ref_stride = ref_stride;

    /* Start of the CU w.r.t.
Pic 0,0 */ x_pic = x_off + ps_inter_ctb_prms->i4_ctb_x_off; y_pic = y_off + ps_inter_ctb_prms->i4_ctb_y_off; u1_cu_size = ps_cu_results->u1_cu_size; e_cu_size = u1_cu_size; shift = (S32)e_cu_size; i4_part_type = ps_part_type_result->u1_part_type; num_parts = gau1_num_parts_in_part_type[i4_part_type]; for(i = 0; i < 3; i++) { hme_init_pred_buf_info( &as_pred_buf_data[i], &ps_inter_ctb_prms->s_pred_buf_mngr, (ps_part_type_result->as_pu_results->pu.b4_wd + 1) << 2, (ps_part_type_result->as_pu_results->pu.b4_ht + 1) << 2, (PART_TYPE_T)i4_part_type); au1_pred_buf_array_indixes[i] = as_pred_buf_data[i][0].u1_pred_buf_array_id; } for(j = 0; j < num_parts; j++) { UWORD8 *apu1_hpel_ref[2][4]; PART_ID_T e_part_id; BLK_SIZE_T e_blk_size; WORD8 i1_ref_idx; UWORD8 pred_dir; WORD32 ref_offset, inp_offset, wd, ht; pu_result_t *ps_pu_node1, *ps_pu_node2, *ps_pu_result; mv_t *aps_mv[2]; UWORD8 num_active_ref_opp; UWORD8 num_results_per_part; WORD32 luma_weight_ref1, luma_offset_ref1; WORD32 luma_weight_ref2, luma_offset_ref2; WORD32 pu_node2_found = 0; e_part_id = ge_part_type_to_part_id[i4_part_type][j]; e_blk_size = ge_part_id_to_blk_size[e_cu_size][e_part_id]; x_part = gas_part_attr_in_cu[e_part_id].u1_x_start << shift; y_part = gas_part_attr_in_cu[e_part_id].u1_y_start << shift; ref_offset = (x_part + x_pic) + (y_pic + y_part) * ref_stride; inp_offset = (x_part + y_part * inp_stride) + ps_cu_results->i4_inp_offset; pred_dir = ps_part_type_result->as_pu_results[j].pu.b2_pred_mode; ps_pu_node1 = &(ps_part_type_result->as_pu_results[j]); if(PRED_L0 == pred_dir) { i1_ref_idx = ps_pu_node1->pu.mv.i1_l0_ref_idx; aps_mv[0] = &(ps_pu_node1->pu.mv.s_l0_mv); num_active_ref_opp = ps_inter_ctb_prms->u1_num_active_ref_l1 * (ps_inter_ctb_prms->i4_bidir_enabled); num_results_per_part = ps_pu_results->u1_num_results_per_part_l0[e_part_id]; ps_pu_result = ps_pu_results->aps_pu_results[PRED_L0][e_part_id]; ASSERT(i1_ref_idx >= 0); apu1_hpel_ref[0][0] = (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->s_yuv_buf_desc.pv_y_buf) + ref_offset; apu1_hpel_ref[0][1] = ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[0] + ref_offset; apu1_hpel_ref[0][2] = ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[1] + ref_offset; apu1_hpel_ref[0][3] = ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[2] + ref_offset; luma_weight_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx] ->s_weight_offset.i2_luma_weight; luma_offset_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx] ->s_weight_offset.i2_luma_offset; } else { i1_ref_idx = ps_pu_node1->pu.mv.i1_l1_ref_idx; aps_mv[0] = &(ps_pu_node1->pu.mv.s_l1_mv); ASSERT(i1_ref_idx >= 0); num_active_ref_opp = ps_inter_ctb_prms->u1_num_active_ref_l0 * (ps_inter_ctb_prms->i4_bidir_enabled); num_results_per_part = ps_pu_results->u1_num_results_per_part_l1[e_part_id]; ps_pu_result = ps_pu_results->aps_pu_results[PRED_L1][e_part_id]; apu1_hpel_ref[0][0] = (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->s_yuv_buf_desc.pv_y_buf) + ref_offset; apu1_hpel_ref[0][1] = ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[0] + ref_offset; apu1_hpel_ref[0][2] = ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[1] + ref_offset; apu1_hpel_ref[0][3] = ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[2] + ref_offset; luma_weight_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx] ->s_weight_offset.i2_luma_weight; luma_offset_ref1 = 
(WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx] ->s_weight_offset.i2_luma_offset; } if(aps_mv[0]->i2_mvx == INTRA_MV) { uni_cost = ps_pu_node1->i4_tot_cost; cur_iter_best_cost = ps_pu_node1->i4_tot_cost; best_cost = MIN(uni_cost, cur_iter_best_cost); tot_cost += best_cost; continue; } ps_interp_prms->i4_blk_wd = wd = gau1_blk_size_to_wd[e_blk_size]; ps_interp_prms->i4_blk_ht = ht = gau1_blk_size_to_ht[e_blk_size]; ps_interp_prms->i4_out_stride = MAX_CU_SIZE; if(num_active_ref_opp) { if(PRED_L0 == pred_dir) { if(ps_pu_results->u1_num_results_per_part_l1[e_part_id]) { ps_pu_node2 = ps_pu_results->aps_pu_results[1][e_part_id]; pu_node2_found = 1; } } else { if(ps_pu_results->u1_num_results_per_part_l0[e_part_id]) { ps_pu_node2 = ps_pu_results->aps_pu_results[0][e_part_id]; pu_node2_found = 1; } } } if(!pu_node2_found) { bi_cost = INT_MAX >> 1; s_interp_prms.apu1_interp_out[0] = as_pred_buf_data[0][j].pu1_pred; ps_interp_prms->ppu1_ref = &apu1_hpel_ref[0][0]; ps_me_optimised_function_list->pf_qpel_interp_avg_generic( ps_interp_prms, aps_mv[0]->i2_mvx, aps_mv[0]->i2_mvy, 0); if(ps_interp_prms->pu1_final_out != s_interp_prms.apu1_interp_out[0]) { as_pred_buf_data[0][j].u1_pred_buf_array_id = UCHAR_MAX; as_pred_buf_data[0][j].pu1_pred = ps_interp_prms->pu1_final_out; as_pred_buf_data[0][j].i4_pred_stride = ps_interp_prms->i4_final_out_stride; } if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier) { hme_compute_sigmaX_and_sigmaXSquared( as_pred_buf_data[0][j].pu1_pred, as_pred_buf_data[0][j].i4_pred_stride, &au8_sigmaX[0][j], &au8_sigmaXSquared[0][j], ps_interp_prms->i4_blk_wd, ps_interp_prms->i4_blk_ht, ps_interp_prms->i4_blk_wd, ps_interp_prms->i4_blk_ht, 0, 1); } } else { i = 0; bi_cost = MAX_32BIT_VAL; is_best_cand_an_intra = 0; best_cand_in_opp_dir_idx = 0; pred_dir = ps_pu_node2[i].pu.b2_pred_mode; if(PRED_L0 == pred_dir) { i1_ref_idx = ps_pu_node2[i].pu.mv.i1_l0_ref_idx; aps_mv[1] = &(ps_pu_node2[i].pu.mv.s_l0_mv); ASSERT(i1_ref_idx >= 0); apu1_hpel_ref[1][0] = (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx] ->s_yuv_buf_desc.pv_y_buf) + ref_offset; //>ppu1_list_rec_fxfy[0][i1_ref_idx] + ref_offset; apu1_hpel_ref[1][1] = ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[0] + ref_offset; apu1_hpel_ref[1][2] = ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[1] + ref_offset; apu1_hpel_ref[1][3] = ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[2] + ref_offset; luma_weight_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx] ->s_weight_offset.i2_luma_weight; luma_offset_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx] ->s_weight_offset.i2_luma_offset; } else { i1_ref_idx = ps_pu_node2[i].pu.mv.i1_l1_ref_idx; aps_mv[1] = &(ps_pu_node2[i].pu.mv.s_l1_mv); ASSERT(i1_ref_idx >= 0); apu1_hpel_ref[1][0] = (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx] ->s_yuv_buf_desc.pv_y_buf) + ref_offset; //>ppu1_list_rec_fxfy[0][i1_ref_idx] + ref_offset; apu1_hpel_ref[1][1] = ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[0] + ref_offset; apu1_hpel_ref[1][2] = ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[1] + ref_offset; apu1_hpel_ref[1][3] = ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[2] + ref_offset; luma_weight_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx] ->s_weight_offset.i2_luma_weight; luma_offset_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx] ->s_weight_offset.i2_luma_offset; } 
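/*
 * Candidate 1 (the best result from the opposite prediction list) is now set
 * up alongside candidate 0 (the unidirectional winner). The code below
 * rejects an intra candidate, interpolates both unidirectional predictions,
 * weighted-averages them into the bi-pred buffer, and scores that buffer
 * with SAD/SATD (plus an optional noise term for noisy CUs) to obtain
 * bi_cost for comparison against the unidirectional cost.
 */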
if(aps_mv[1]->i2_mvx == INTRA_MV) { uni_cost = ps_pu_node1->i4_tot_cost; cur_iter_best_cost = ps_pu_node2[i].i4_tot_cost; if(cur_iter_best_cost < bi_cost) { bi_cost = cur_iter_best_cost; best_cand_in_opp_dir_idx = i; is_best_cand_an_intra = 1; } best_cost = MIN(uni_cost, bi_cost); tot_cost += best_cost; continue; } s_interp_prms.apu1_interp_out[0] = as_pred_buf_data[0][j].pu1_pred; ps_interp_prms->ppu1_ref = &apu1_hpel_ref[0][0]; ps_me_optimised_function_list->pf_qpel_interp_avg_generic( ps_interp_prms, aps_mv[0]->i2_mvx, aps_mv[0]->i2_mvy, 0); if(ps_interp_prms->pu1_final_out != s_interp_prms.apu1_interp_out[0]) { as_pred_buf_data[0][j].u1_pred_buf_array_id = UCHAR_MAX; as_pred_buf_data[0][j].pu1_pred = ps_interp_prms->pu1_final_out; as_pred_buf_data[0][j].i4_pred_stride = ps_interp_prms->i4_final_out_stride; } if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier) { hme_compute_sigmaX_and_sigmaXSquared( as_pred_buf_data[0][j].pu1_pred, as_pred_buf_data[0][j].i4_pred_stride, &au8_sigmaX[0][j], &au8_sigmaXSquared[0][j], ps_interp_prms->i4_blk_wd, ps_interp_prms->i4_blk_ht, ps_interp_prms->i4_blk_wd, ps_interp_prms->i4_blk_ht, 0, 1); } s_interp_prms.apu1_interp_out[0] = as_pred_buf_data[1][j].pu1_pred; ps_interp_prms->ppu1_ref = &apu1_hpel_ref[1][0]; ps_me_optimised_function_list->pf_qpel_interp_avg_generic( ps_interp_prms, aps_mv[1]->i2_mvx, aps_mv[1]->i2_mvy, 0); if(ps_interp_prms->pu1_final_out != s_interp_prms.apu1_interp_out[0]) { as_pred_buf_data[1][j].u1_pred_buf_array_id = UCHAR_MAX; as_pred_buf_data[1][j].pu1_pred = ps_interp_prms->pu1_final_out; as_pred_buf_data[1][j].i4_pred_stride = ps_interp_prms->i4_final_out_stride; } ps_cmn_utils_optimised_function_list->pf_wt_avg_2d( as_pred_buf_data[0][j].pu1_pred, as_pred_buf_data[1][j].pu1_pred, as_pred_buf_data[0][j].i4_pred_stride, as_pred_buf_data[1][j].i4_pred_stride, wd, ht, as_pred_buf_data[2][j].pu1_pred, as_pred_buf_data[2][j].i4_pred_stride, luma_weight_ref1, luma_weight_ref2, luma_offset_ref1, luma_offset_ref2, ps_inter_ctb_prms->wpred_log_wdc); if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier) { hme_compute_sigmaX_and_sigmaXSquared( as_pred_buf_data[2][j].pu1_pred, as_pred_buf_data[2][j].i4_pred_stride, &au8_sigmaX[1][j], &au8_sigmaXSquared[1][j], ps_interp_prms->i4_blk_wd, ps_interp_prms->i4_blk_ht, ps_interp_prms->i4_blk_wd, ps_interp_prms->i4_blk_ht, 0, 1); } s_err_prms.pu1_inp = (U08 *)ps_inter_ctb_prms->pu1_non_wt_inp + inp_offset; s_err_prms.i4_inp_stride = inp_stride; s_err_prms.i4_ref_stride = as_pred_buf_data[2][j].i4_pred_stride; s_err_prms.i4_part_mask = (ENABLE_2Nx2N); s_err_prms.i4_grid_mask = 1; s_err_prms.pi4_sad_grid = &i4_sad_grid; s_err_prms.i4_blk_wd = wd; s_err_prms.i4_blk_ht = ht; s_err_prms.pu1_ref = as_pred_buf_data[2][j].pu1_pred; s_err_prms.ps_cmn_utils_optimised_function_list = ps_cmn_utils_optimised_function_list; if(ps_inter_ctb_prms->u1_use_satd) { pf_err_compute = compute_satd_8bit; } else { pf_err_compute = ps_me_optimised_function_list->pf_evalsad_pt_npu_mxn_8bit; } pf_err_compute(&s_err_prms); #if USE_NOISE_TERM_DURING_BICAND_SEARCH if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier) { unsigned long u4_shift_val; ULWORD64 u8_src_variance, u8_pred_variance, u8_pred_sigmaSquareX; ULWORD64 u8_temp_var, u8_temp_var1; S32 i4_bits_req; S32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT; u8_pred_sigmaSquareX = (au8_sigmaX[1][j] * au8_sigmaX[1][j]); u8_pred_variance = au8_sigmaXSquared[1][j] - u8_pred_sigmaSquareX; if(e_cu_size == CU_8x8) { PART_ID_T e_part_id = 
(PART_ID_T)((PART_ID_NxN_TL) + (x_off & 1) + ((y_off & 1) << 1)); u4_shift_val = ihevce_calc_stim_injected_variance( ps_inter_ctb_prms->pu8_part_src_sigmaX, ps_inter_ctb_prms->pu8_part_src_sigmaXSquared, &u8_src_variance, i4_default_src_wt, 0, ps_inter_ctb_prms->wpred_log_wdc, e_part_id); } else { u4_shift_val = ihevce_calc_stim_injected_variance( ps_inter_ctb_prms->pu8_part_src_sigmaX, ps_inter_ctb_prms->pu8_part_src_sigmaXSquared, &u8_src_variance, i4_default_src_wt, 0, ps_inter_ctb_prms->wpred_log_wdc, e_part_id); } u8_pred_variance = u8_pred_variance >> u4_shift_val; GETRANGE64(i4_bits_req, u8_pred_variance); if(i4_bits_req > 27) { u8_pred_variance = u8_pred_variance >> (i4_bits_req - 27); u8_src_variance = u8_src_variance >> (i4_bits_req - 27); } if(u8_src_variance == u8_pred_variance) { u8_temp_var = (1 << STIM_Q_FORMAT); } else { u8_temp_var = (2 * u8_src_variance * u8_pred_variance); u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT)); u8_temp_var1 = (u8_src_variance * u8_src_variance) + (u8_pred_variance * u8_pred_variance); u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2)); u8_temp_var = (u8_temp_var / u8_temp_var1); } i4_noise_term = (UWORD32)u8_temp_var; i4_noise_term *= ps_inter_ctb_prms->i4_alpha_stim_multiplier; ASSERT(i4_noise_term >= 0); u8_temp_var = i4_sad_grid; u8_temp_var *= ((1 << (i4_q_level)) - (i4_noise_term)); u8_temp_var += (1 << ((i4_q_level)-1)); i4_sad_grid = (UWORD32)(u8_temp_var >> (i4_q_level)); } #endif cur_iter_best_cost = i4_sad_grid; cur_iter_best_cost += ps_pu_node1->i4_mv_cost; cur_iter_best_cost += ps_pu_node2[i].i4_mv_cost; if(cur_iter_best_cost < bi_cost) { bi_cost = cur_iter_best_cost; best_cand_in_opp_dir_idx = i; is_best_cand_an_intra = 0; } } uni_cost = ps_pu_node1->i4_tot_cost; #if USE_NOISE_TERM_DURING_BICAND_SEARCH if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier) { unsigned long u4_shift_val; ULWORD64 u8_src_variance, u8_pred_variance, u8_pred_sigmaSquareX; ULWORD64 u8_temp_var, u8_temp_var1; S32 i4_bits_req; S32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT; S08 i1_ref_idx = (PRED_L0 == ps_pu_node1->pu.b2_pred_mode) ? 
ps_inter_ctb_prms->pi1_past_list[ps_pu_node1->pu.mv.i1_l0_ref_idx] : ps_inter_ctb_prms->pi1_future_list[ps_pu_node1->pu.mv.i1_l1_ref_idx]; S32 i4_sad = ps_pu_node1->i4_tot_cost - ps_pu_node1->i4_mv_cost; u8_pred_sigmaSquareX = (au8_sigmaX[0][j] * au8_sigmaX[0][j]); u8_pred_variance = au8_sigmaXSquared[0][j] - u8_pred_sigmaSquareX; if(e_cu_size == CU_8x8) { PART_ID_T e_part_id = (PART_ID_T)((PART_ID_NxN_TL) + (x_off & 1) + ((y_off & 1) << 1)); u4_shift_val = ihevce_calc_stim_injected_variance( ps_inter_ctb_prms->pu8_part_src_sigmaX, ps_inter_ctb_prms->pu8_part_src_sigmaXSquared, &u8_src_variance, ps_inter_ctb_prms->pi4_inv_wt[i1_ref_idx], ps_inter_ctb_prms->pi4_inv_wt_shift_val[i1_ref_idx], ps_inter_ctb_prms->wpred_log_wdc, e_part_id); } else { u4_shift_val = ihevce_calc_stim_injected_variance( ps_inter_ctb_prms->pu8_part_src_sigmaX, ps_inter_ctb_prms->pu8_part_src_sigmaXSquared, &u8_src_variance, ps_inter_ctb_prms->pi4_inv_wt[i1_ref_idx], ps_inter_ctb_prms->pi4_inv_wt_shift_val[i1_ref_idx], ps_inter_ctb_prms->wpred_log_wdc, e_part_id); } u8_pred_variance = u8_pred_variance >> (u4_shift_val); GETRANGE64(i4_bits_req, u8_pred_variance); if(i4_bits_req > 27) { u8_pred_variance = u8_pred_variance >> (i4_bits_req - 27); u8_src_variance = u8_src_variance >> (i4_bits_req - 27); } if(u8_src_variance == u8_pred_variance) { u8_temp_var = (1 << STIM_Q_FORMAT); } else { u8_temp_var = (2 * u8_src_variance * u8_pred_variance); u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT)); u8_temp_var1 = (u8_src_variance * u8_src_variance) + (u8_pred_variance * u8_pred_variance); u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2)); u8_temp_var = (u8_temp_var / u8_temp_var1); } i4_noise_term = (UWORD32)u8_temp_var; i4_noise_term *= ps_inter_ctb_prms->i4_alpha_stim_multiplier; ASSERT(i4_noise_term >= 0); u8_temp_var = i4_sad; u8_temp_var *= ((1 << (i4_q_level)) - (i4_noise_term)); u8_temp_var += (1 << ((i4_q_level)-1)); i4_sad = (UWORD32)(u8_temp_var >> (i4_q_level)); uni_cost = i4_sad + ps_pu_node1->i4_mv_cost; pu8_winning_pred_sigmaX[j] = au8_sigmaX[0][j]; pu8_winning_pred_sigmaXSquare[j] = au8_sigmaXSquared[0][j]; } #endif if((bi_cost < uni_cost) && (!is_best_cand_an_intra)) { if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier) { pu8_winning_pred_sigmaX[j] = au8_sigmaX[1][j]; pu8_winning_pred_sigmaXSquare[j] = au8_sigmaXSquared[1][j]; } if(PRED_L0 == ps_pu_node1->pu.b2_pred_mode) { ps_pu_node1->pu.b2_pred_mode = PRED_BI; if(PRED_L0 == ps_pu_node2[best_cand_in_opp_dir_idx].pu.b2_pred_mode) { ps_pu_node1->pu.mv.i1_l1_ref_idx = ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l0_ref_idx; ps_pu_node1->pu.mv.s_l1_mv.i2_mvx = ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvx; ps_pu_node1->pu.mv.s_l1_mv.i2_mvy = ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvy; } else { ps_pu_node1->pu.mv.i1_l1_ref_idx = ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l1_ref_idx; ps_pu_node1->pu.mv.s_l1_mv.i2_mvx = ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvx; ps_pu_node1->pu.mv.s_l1_mv.i2_mvy = ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvy; } } else { ps_pu_node1->pu.b2_pred_mode = PRED_BI; if(PRED_L0 == ps_pu_node2[best_cand_in_opp_dir_idx].pu.b2_pred_mode) { ps_pu_node1->pu.mv.i1_l0_ref_idx = ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l0_ref_idx; ps_pu_node1->pu.mv.s_l0_mv.i2_mvx = ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvx; ps_pu_node1->pu.mv.s_l0_mv.i2_mvy = ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvy; } else { ps_pu_node1->pu.mv.i1_l0_ref_idx = 
ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l1_ref_idx;
                    ps_pu_node1->pu.mv.s_l0_mv.i2_mvx =
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvx;
                    ps_pu_node1->pu.mv.s_l0_mv.i2_mvy =
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvy;
                }
            }

            ps_part_type_result->as_pu_results[j].i4_tot_cost = bi_cost;
        }

        best_cost = MIN(uni_cost, bi_cost);
        tot_cost += best_cost;
    }

    hme_debrief_bipred_eval(
        ps_part_type_result,
        as_pred_buf_data,
        &ps_inter_ctb_prms->s_pred_buf_mngr,
        au1_pred_buf_array_indixes,
        ps_cmn_utils_optimised_function_list);

    ps_part_type_result->i4_tot_cost = tot_cost;
}

WORD32 hme_evalsatd_pt_pu_8x8_tu_rec(
    err_prms_t *ps_prms,
    WORD32 lambda,
    WORD32 lambda_q_shift,
    WORD32 i4_frm_qstep,
    me_func_selector_t *ps_func_selector)
{
    S32 ai4_satd_4x4[4]; /* num 4x4s in a 8x8 */
    S32 i4_satd_8x8;
    S16 *pi2_had_out;
    S32 i4_tu_split_flag = 0;
    S32 i4_tu_early_cbf = 0;
    S32 i4_early_cbf = 1;
    // S32 i4_i, i4_k;
    S32 i4_total_satd_cost = 0;
    S32 best_cost_tu_split;

    /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
    S32 *api4_satd_pu[HAD_32x32 + 1];
    S32 *api4_tu_split[HAD_32x32 + 1];
    S32 *api4_tu_early_cbf[HAD_32x32 + 1];

    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
    S32 *pi4_tu_split = ps_prms->pi4_tu_split_flags;
    S32 *pi4_early_cbf = ps_prms->pi4_tu_early_cbf;

    U08 *pu1_inp = ps_prms->pu1_inp;
    U08 *pu1_ref = ps_prms->pu1_ref;

    S32 inp_stride = ps_prms->i4_inp_stride;
    S32 ref_stride = ps_prms->i4_ref_stride;

    /* Initialize tu_split_cost to "0" */
    ps_prms->i4_tu_split_cost = 0;

    pi2_had_out = (S16 *)ps_prms->pu1_wkg_mem;

    api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
    api4_satd_pu[HAD_8x8] = &i4_satd_8x8;
    api4_satd_pu[HAD_16x16] = NULL;
    api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */

    api4_tu_split[HAD_4x4] = NULL;
    api4_tu_split[HAD_8x8] = &i4_tu_split_flag;
    api4_tu_split[HAD_16x16] = NULL;
    api4_tu_split[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */

    api4_tu_early_cbf[HAD_4x4] = NULL;
    api4_tu_early_cbf[HAD_8x8] = &i4_tu_early_cbf;
    api4_tu_early_cbf[HAD_16x16] = NULL;
    api4_tu_early_cbf[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */

    /* Call recursive 8x8 HAD module; updates satds for 4x4 and 8x8 */
    /* Return value is a merge of both best_satd_cost and tu_split_flags */
    best_cost_tu_split = ps_func_selector->pf_had_8x8_using_4_4x4_r(
        pu1_inp,
        inp_stride,
        pu1_ref,
        ref_stride,
        pi2_had_out,
        8,
        api4_satd_pu,
        api4_tu_split,
        api4_tu_early_cbf,
        0,
        2,
        0,
        0,
        i4_frm_qstep,
        0,
        ps_prms->u1_max_tr_depth,
        ps_prms->u1_max_tr_size,
        &(ps_prms->i4_tu_split_cost),
        NULL);

    /* For SATD computation the following TU sizes are assumed for a 8x8 CU */
    /* 8 for 2Nx2N, 4 for Nx2N,2NxN */
    i4_total_satd_cost = best_cost_tu_split >> 2;

    /* Second last bit has the tu split flag */
    i4_tu_split_flag = (best_cost_tu_split & 0x3) >> 1;

    /* Last bit corresponds to the Early CBF flag */
    i4_early_cbf = (best_cost_tu_split & 0x1);

    /* Update 8x8 SATDs */
    pi4_sad_grid[PART_ID_2Nx2N] = i4_satd_8x8;
    pi4_tu_split[PART_ID_2Nx2N] = i4_tu_split_flag;
    pi4_early_cbf[PART_ID_2Nx2N] = i4_early_cbf;

    return i4_total_satd_cost;
}
//#endif

/**
********************************************************************************
*  @fn     S32 hme_evalsatd_update_2_best_results_pt_pu_16x16
*
*  @brief  Evaluates the SATD with partial updates for all the best partitions
*          of a 16x16 CU based on recursive Hadamard 16x16, 8x8 and 4x4 satds
*
*  @param[inout] ps_prms: error prms containing current and ref ptr, strides,
*                pointers to the sad grid of each partition
*
*  @return None
******************************************************************************** */ void hme_evalsatd_update_2_best_results_pt_pu_16x16( err_prms_t *ps_prms, result_upd_prms_t *ps_result_prms) { S32 ai4_satd_4x4[16]; /* num 4x4s in a 16x16 */ S32 ai4_satd_8x8[4]; /* num 8x8s in a 16x16 */ S32 i4_satd_16x16; /* 16x16 satd cost */ S32 i; S16 ai2_8x8_had[256]; S16 *pi2_y0; U08 *pu1_src, *pu1_pred; S32 pos_x_y_4x4_0, pos_x_y_4x4 = 0; S32 *ppi4_hsad; /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */ S32 *api4_satd_pu[HAD_32x32 + 1]; S32 *pi4_sad_grid = ps_prms->pi4_sad_grid; U08 *pu1_inp = ps_prms->pu1_inp; U08 *pu1_ref = ps_prms->pu1_ref; S32 inp_stride = ps_prms->i4_inp_stride; S32 ref_stride = ps_prms->i4_ref_stride; api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0]; api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0]; api4_satd_pu[HAD_16x16] = &i4_satd_16x16; api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */ ppi4_hsad = api4_satd_pu[HAD_16x16]; /* Call recursive 16x16 HAD module; updates satds for 4x4, 8x8 and 16x16 */ for(i = 0; i < 4; i++) { pu1_src = pu1_inp + (i & 0x01) * 8 + (i >> 1) * inp_stride * 8; pu1_pred = pu1_ref + (i & 0x01) * 8 + (i >> 1) * ref_stride * 8; pi2_y0 = ai2_8x8_had + (i & 0x01) * 8 + (i >> 1) * 16 * 8; pos_x_y_4x4_0 = pos_x_y_4x4 + (i & 0x01) * 2 + (i >> 1) * (2 << 16); ihevce_had_8x8_using_4_4x4( pu1_src, inp_stride, pu1_pred, ref_stride, pi2_y0, 16, api4_satd_pu, pos_x_y_4x4_0, 4); } /* For SATD computation following TU size are assumed for a 16x16 CU */ /* 16 for 2Nx2N, 8 for NxN/Nx2N,2NxN and mix of 4 and 8 for AMPs */ /* Update 8x8 SATDs */ /* Modified to cost calculation using only 4x4 SATD */ // ai4_satd_8x8[0] = ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[4] + ai4_satd_4x4[5]; // ai4_satd_8x8[1] = ai4_satd_4x4[2] + ai4_satd_4x4[3] + ai4_satd_4x4[6] + ai4_satd_4x4[7]; // ai4_satd_8x8[2] = ai4_satd_4x4[8] + ai4_satd_4x4[9] + ai4_satd_4x4[12] + ai4_satd_4x4[13]; // ai4_satd_8x8[3] = ai4_satd_4x4[10] + ai4_satd_4x4[11] + ai4_satd_4x4[14] + ai4_satd_4x4[15]; /* Update 16x16 SATDs */ pi4_sad_grid[PART_ID_2Nx2N] = ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3]; pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_8x8[0]; pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_8x8[1]; pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_8x8[2]; pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_8x8[3]; /* Update 8x16 / 16x8 SATDs */ pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_8x8[0] + ai4_satd_8x8[2]; pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[3]; pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_8x8[0] + ai4_satd_8x8[1]; pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_8x8[2] + ai4_satd_8x8[3]; /* Update AMP SATDs 16x12,16x4, 12x16,4x16 */ pi4_sad_grid[PART_ID_nLx2N_L] = ai4_satd_4x4[0] + ai4_satd_4x4[4] + ai4_satd_4x4[8] + ai4_satd_4x4[12]; pi4_sad_grid[PART_ID_nLx2N_R] = ai4_satd_4x4[1] + ai4_satd_4x4[5] + ai4_satd_4x4[9] + ai4_satd_4x4[13] + pi4_sad_grid[PART_ID_Nx2N_R]; pi4_sad_grid[PART_ID_nRx2N_L] = ai4_satd_4x4[2] + ai4_satd_4x4[6] + ai4_satd_4x4[10] + ai4_satd_4x4[14] + pi4_sad_grid[PART_ID_Nx2N_L]; pi4_sad_grid[PART_ID_nRx2N_R] = ai4_satd_4x4[3] + ai4_satd_4x4[7] + ai4_satd_4x4[11] + ai4_satd_4x4[15]; pi4_sad_grid[PART_ID_2NxnU_T] = ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[2] + ai4_satd_4x4[3]; pi4_sad_grid[PART_ID_2NxnU_B] = ai4_satd_4x4[4] + ai4_satd_4x4[5] + ai4_satd_4x4[6] + ai4_satd_4x4[7] + pi4_sad_grid[PART_ID_2NxN_B]; pi4_sad_grid[PART_ID_2NxnD_T] = ai4_satd_4x4[8] + ai4_satd_4x4[9] + ai4_satd_4x4[10] + ai4_satd_4x4[11] + 
pi4_sad_grid[PART_ID_2NxN_T];
    pi4_sad_grid[PART_ID_2NxnD_B] =
        ai4_satd_4x4[12] + ai4_satd_4x4[13] + ai4_satd_4x4[14] + ai4_satd_4x4[15];

    /* Call the update results function */
    {
        S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
        mv_refine_ctxt_t *ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
        S32 *pi4_valid_part_ids = &ps_subpel_refine_ctxt->ai4_part_id[0];
        S32 best_node_cost;
        S32 second_best_node_cost;

        /* For each valid partition, update the refine_prm structure to reflect
           the best and second best candidates for that partition */
        for(i4_count = 0; i4_count < ps_subpel_refine_ctxt->i4_num_valid_parts; i4_count++)
        {
            S32 update_required = 0;

            S32 part_id = pi4_valid_part_ids[i4_count];
            S32 index = (ps_subpel_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;

            /* Use a pre-computed cost instead of freshly evaluating subpel cost */
            i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];

            /* Calculate total cost */
            i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
            i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);

            /*****************************************************************/
            /* We do not labor through the results if the total cost is      */
            /* worse than the last of the results.                           */
            /*****************************************************************/
            best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[0][index]);
            second_best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[1][index]);

            if(i4_tot_cost < second_best_node_cost)
            {
                update_required = 2;

                /*************************************************************/
                /* Identify where the current result is to be placed.        */
                /* Basically find the node whose cost is just higher than    */
                /* the node under test.                                      */
                /*************************************************************/
                if(i4_tot_cost < best_node_cost)
                {
                    update_required = 1;
                }
                else if(i4_tot_cost == ps_subpel_refine_ctxt->i2_tot_cost[0][index])
                {
                    update_required = 0;
                }
                if(update_required == 2)
                {
                    ps_subpel_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
                    ps_subpel_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
                    ps_subpel_refine_ctxt->i2_mv_x[1][index] = ps_result_prms->i2_mv_x;
                    ps_subpel_refine_ctxt->i2_mv_y[1][index] = ps_result_prms->i2_mv_y;
                    ps_subpel_refine_ctxt->i2_ref_idx[1][index] = ps_result_prms->i1_ref_idx;
                }
                else if(update_required == 1)
                {
                    ps_subpel_refine_ctxt->i2_tot_cost[1][index] =
                        ps_subpel_refine_ctxt->i2_tot_cost[0][index];
                    ps_subpel_refine_ctxt->i2_mv_cost[1][index] =
                        ps_subpel_refine_ctxt->i2_mv_cost[0][index];
                    ps_subpel_refine_ctxt->i2_mv_x[1][index] =
                        ps_subpel_refine_ctxt->i2_mv_x[0][index];
                    ps_subpel_refine_ctxt->i2_mv_y[1][index] =
                        ps_subpel_refine_ctxt->i2_mv_y[0][index];
                    ps_subpel_refine_ctxt->i2_ref_idx[1][index] =
                        ps_subpel_refine_ctxt->i2_ref_idx[0][index];

                    ps_subpel_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
                    ps_subpel_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
                    ps_subpel_refine_ctxt->i2_mv_x[0][index] = ps_result_prms->i2_mv_x;
                    ps_subpel_refine_ctxt->i2_mv_y[0][index] = ps_result_prms->i2_mv_y;
                    ps_subpel_refine_ctxt->i2_ref_idx[0][index] = ps_result_prms->i1_ref_idx;
                }
            }
        }
    }
}

//#if COMPUTE_16x16_R == C
void hme_evalsatd_update_1_best_result_pt_pu_16x16(
    err_prms_t *ps_prms, result_upd_prms_t *ps_result_prms)
{
    S32 ai4_satd_4x4[16]; /* num 4x4s in a 16x16 */
    S32 ai4_satd_8x8[4]; /* num 8x8s in a 16x16 */
    S32 i4_satd_16x16; /* 16x16 satd cost */
    S32 i;

    S16 ai2_8x8_had[256];
    S16 *pi2_y0;
    U08 *pu1_src, *pu1_pred;
    S32 pos_x_y_4x4_0, pos_x_y_4x4 = 0;
    S32 *ppi4_hsad;

    /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
    S32
*api4_satd_pu[HAD_32x32 + 1]; S32 *pi4_sad_grid = ps_prms->pi4_sad_grid; U08 *pu1_inp = ps_prms->pu1_inp; U08 *pu1_ref = ps_prms->pu1_ref; S32 inp_stride = ps_prms->i4_inp_stride; S32 ref_stride = ps_prms->i4_ref_stride; api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0]; api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0]; api4_satd_pu[HAD_16x16] = &i4_satd_16x16; api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */ ppi4_hsad = api4_satd_pu[HAD_16x16]; /* Call recursive 16x16 HAD module; updates satds for 4x4, 8x8 and 16x16 */ for(i = 0; i < 4; i++) { pu1_src = pu1_inp + (i & 0x01) * 8 + (i >> 1) * inp_stride * 8; pu1_pred = pu1_ref + (i & 0x01) * 8 + (i >> 1) * ref_stride * 8; pi2_y0 = ai2_8x8_had + (i & 0x01) * 8 + (i >> 1) * 16 * 8; pos_x_y_4x4_0 = pos_x_y_4x4 + (i & 0x01) * 2 + (i >> 1) * (2 << 16); ihevce_had_8x8_using_4_4x4( pu1_src, inp_stride, pu1_pred, ref_stride, pi2_y0, 16, api4_satd_pu, pos_x_y_4x4_0, 4); } /* For SATD computation following TU size are assumed for a 16x16 CU */ /* 16 for 2Nx2N, 8 for NxN/Nx2N,2NxN and mix of 4 and 8 for AMPs */ /* Update 8x8 SATDs */ /* Modified to cost calculation using only 4x4 SATD */ // ai4_satd_8x8[0] = ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[4] + ai4_satd_4x4[5]; // ai4_satd_8x8[1] = ai4_satd_4x4[2] + ai4_satd_4x4[3] + ai4_satd_4x4[6] + ai4_satd_4x4[7]; // ai4_satd_8x8[2] = ai4_satd_4x4[8] + ai4_satd_4x4[9] + ai4_satd_4x4[12] + ai4_satd_4x4[13]; // ai4_satd_8x8[3] = ai4_satd_4x4[10] + ai4_satd_4x4[11] + ai4_satd_4x4[14] + ai4_satd_4x4[15]; /* Update 16x16 SATDs */ pi4_sad_grid[PART_ID_2Nx2N] = ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3]; pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_8x8[0]; pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_8x8[1]; pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_8x8[2]; pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_8x8[3]; /* Update 8x16 / 16x8 SATDs */ pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_8x8[0] + ai4_satd_8x8[2]; pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[3]; pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_8x8[0] + ai4_satd_8x8[1]; pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_8x8[2] + ai4_satd_8x8[3]; /* Update AMP SATDs 16x12,16x4, 12x16,4x16 */ pi4_sad_grid[PART_ID_nLx2N_L] = ai4_satd_4x4[0] + ai4_satd_4x4[2] + ai4_satd_4x4[8] + ai4_satd_4x4[10]; pi4_sad_grid[PART_ID_nRx2N_R] = ai4_satd_4x4[5] + ai4_satd_4x4[7] + ai4_satd_4x4[13] + ai4_satd_4x4[15]; pi4_sad_grid[PART_ID_2NxnU_T] = ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[4] + ai4_satd_4x4[5]; pi4_sad_grid[PART_ID_2NxnD_B] = ai4_satd_4x4[10] + ai4_satd_4x4[11] + ai4_satd_4x4[14] + ai4_satd_4x4[15]; pi4_sad_grid[PART_ID_nLx2N_R] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L]; pi4_sad_grid[PART_ID_nRx2N_L] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R]; pi4_sad_grid[PART_ID_2NxnU_B] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T]; pi4_sad_grid[PART_ID_2NxnD_T] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B]; /* Call the update results function */ { S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost; mv_refine_ctxt_t *ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt; S32 *pi4_valid_part_ids = &ps_subpel_refine_ctxt->ai4_part_id[0]; S32 best_node_cost; S32 second_best_node_cost; /*For each valid partition, update the refine_prm structure to reflect the best and second best candidates for that partition*/ for(i4_count = 0; i4_count < ps_subpel_refine_ctxt->i4_num_valid_parts; i4_count++) { S32 update_required = 0; S32 part_id = pi4_valid_part_ids[i4_count]; S32 
index = (ps_subpel_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;

            /* Use a pre-computed cost instead of freshly evaluating subpel cost */
            i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];

            /* Calculate total cost */
            i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
            i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);

            /*****************************************************************/
            /* We do not labor through the results if the total cost is      */
            /* worse than the last of the results.                           */
            /*****************************************************************/
            best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[0][index]);
            second_best_node_cost = SHRT_MAX;

            if(i4_tot_cost < second_best_node_cost)
            {
                update_required = 0;

                /*************************************************************/
                /* Identify where the current result is to be placed.        */
                /* Basically find the node whose cost is just higher than    */
                /* the node under test.                                      */
                /*************************************************************/
                if(i4_tot_cost < best_node_cost)
                {
                    update_required = 1;
                }
                else if(i4_tot_cost == ps_subpel_refine_ctxt->i2_tot_cost[0][index])
                {
                    update_required = 0;
                }
                if(update_required == 2)
                {
                    ps_subpel_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
                    ps_subpel_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
                    ps_subpel_refine_ctxt->i2_mv_x[1][index] = ps_result_prms->i2_mv_x;
                    ps_subpel_refine_ctxt->i2_mv_y[1][index] = ps_result_prms->i2_mv_y;
                    ps_subpel_refine_ctxt->i2_ref_idx[1][index] = ps_result_prms->i1_ref_idx;
                }
                else if(update_required == 1)
                {
                    ps_subpel_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
                    ps_subpel_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
                    ps_subpel_refine_ctxt->i2_mv_x[0][index] = ps_result_prms->i2_mv_x;
                    ps_subpel_refine_ctxt->i2_mv_y[0][index] = ps_result_prms->i2_mv_y;
                    ps_subpel_refine_ctxt->i2_ref_idx[0][index] = ps_result_prms->i1_ref_idx;
                }
            }
        }
    }
}

WORD32 hme_evalsatd_pt_pu_16x16_tu_rec(
    err_prms_t *ps_prms,
    WORD32 lambda,
    WORD32 lambda_q_shift,
    WORD32 i4_frm_qstep,
    me_func_selector_t *ps_func_selector)
{
    S32 ai4_satd_4x4[16]; /* num 4x4s in a 16x16 */
    S32 ai4_satd_8x8[4]; /* num 8x8s in a 16x16 */
    S32 ai4_tu_split_8x8[16];
    S32 i4_satd_16x16; /* 16x16 satd cost */
    S32 ai4_tu_early_cbf_8x8[16];

    //S16 ai2_had_out[256];
    S16 *pi2_had_out;
    S32 tu_split_flag = 0;
    S32 early_cbf_flag = 0;
    S32 total_satd_cost = 0;

    /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
    S32 *api4_satd_pu[HAD_32x32 + 1];
    S32 *api4_tu_split[HAD_32x32 + 1];
    S32 *api4_tu_early_cbf[HAD_32x32 + 1];

    U08 *pu1_inp = ps_prms->pu1_inp;
    U08 *pu1_ref = ps_prms->pu1_ref;

    S32 inp_stride = ps_prms->i4_inp_stride;
    S32 ref_stride = ps_prms->i4_ref_stride;

    /* Initialize tu_split_cost to "0" */
    ps_prms->i4_tu_split_cost = 0;

    pi2_had_out = (S16 *)ps_prms->pu1_wkg_mem;

    api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
    api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
    api4_satd_pu[HAD_16x16] = &i4_satd_16x16;
    api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */

    api4_tu_split[HAD_4x4] = NULL;
    api4_tu_split[HAD_8x8] = &ai4_tu_split_8x8[0];
    api4_tu_split[HAD_16x16] = &tu_split_flag;
    api4_tu_split[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */

    api4_tu_early_cbf[HAD_4x4] = NULL;
    api4_tu_early_cbf[HAD_8x8] = &ai4_tu_early_cbf_8x8[0];
    api4_tu_early_cbf[HAD_16x16] = &early_cbf_flag;
    api4_tu_early_cbf[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */

    /* Call recursive 16x16 HAD module; updates satds for 4x4, 8x8 and 16x16 */
    ps_func_selector->pf_had_16x16_r(
        pu1_inp,
        inp_stride,
        pu1_ref,
        ref_stride,
pi2_had_out,
        16,
        api4_satd_pu,
        api4_tu_split,
        api4_tu_early_cbf,
        0,
        4,
        lambda,
        lambda_q_shift,
        i4_frm_qstep,
        0,
        ps_prms->u1_max_tr_depth,
        ps_prms->u1_max_tr_size,
        &(ps_prms->i4_tu_split_cost),
        NULL);

    total_satd_cost = i4_satd_16x16;

    ps_prms->pi4_tu_split_flags[0] = tu_split_flag;
    ps_prms->pi4_tu_early_cbf[0] = early_cbf_flag;

    return total_satd_cost;
}

/**
********************************************************************************
*  @fn     S32 hme_evalsatd_pt_pu_32x32
*
*  @brief  Evaluates the SATD with partial updates for all the best partitions
*          of a 32x32 CU based on recursive Hadamard 16x16, 8x8 and 4x4 satds
*
*  @param[inout] ps_prms: error prms containing current and ref ptr, strides,
*                pointers to the sad grid of each partition
*
*  @return None
********************************************************************************
*/
void hme_evalsatd_pt_pu_32x32(err_prms_t *ps_prms)
{
    //S32 ai4_satd_4x4[64]; /* num 4x4s in a 32x32 */
    S32 ai4_satd_8x8[16]; /* num 8x8s in a 32x32 */
    S32 ai4_satd_16x16[4]; /* num 16x16 in a 32x32 */
    S32 i4_satd_32x32;

    // S16 ai2_had_out[32*32];
    U08 *pu1_src;
    U08 *pu1_pred;
    S32 i;

    /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
    S32 *api4_satd_pu[HAD_32x32 + 1];

    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;

    U08 *pu1_inp = ps_prms->pu1_inp;
    U08 *pu1_ref = ps_prms->pu1_ref;

    S32 inp_stride = ps_prms->i4_inp_stride;
    S32 ref_stride = ps_prms->i4_ref_stride;

    //api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
    api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
    api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[0];
    api4_satd_pu[HAD_32x32] = &i4_satd_32x32;

    /* 32x32 SATD is calculated as the sum of the 16 8x8 SATDs in the block */
    for(i = 0; i < 16; i++)
    {
        pu1_src = pu1_inp + ((i & 0x3) << 3) + ((i >> 2) * inp_stride * 8);

        pu1_pred = pu1_ref + ((i & 0x3) << 3) + ((i >> 2) * ref_stride * 8);

        ai4_satd_8x8[i] = ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
            pu1_src, inp_stride, pu1_pred, ref_stride, NULL, 1);
    }

    /* Modified to cost calculation using only 8x8 SATD for 32x32 */
    ai4_satd_16x16[0] = ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[4] + ai4_satd_8x8[5];
    ai4_satd_16x16[1] = ai4_satd_8x8[2] + ai4_satd_8x8[3] + ai4_satd_8x8[6] + ai4_satd_8x8[7];
    ai4_satd_16x16[2] = ai4_satd_8x8[8] + ai4_satd_8x8[9] + ai4_satd_8x8[12] + ai4_satd_8x8[13];
    ai4_satd_16x16[3] = ai4_satd_8x8[10] + ai4_satd_8x8[11] + ai4_satd_8x8[14] + ai4_satd_8x8[15];

    /* Update 32x32 SATD */
    pi4_sad_grid[PART_ID_2Nx2N] =
        ai4_satd_16x16[0] + ai4_satd_16x16[1] + ai4_satd_16x16[2] + ai4_satd_16x16[3];

    /* Update 16x16 SATDs */
    pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_16x16[0];
    pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_16x16[1];
    pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_16x16[2];
    pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_16x16[3];

    /* Update 16x32 / 32x16 SATDs */
    pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_16x16[0] + ai4_satd_16x16[2];
    pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_16x16[1] + ai4_satd_16x16[3];
    pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_16x16[0] + ai4_satd_16x16[1];
    pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_16x16[2] + ai4_satd_16x16[3];

    /* Update AMP SATDs 32x24,32x8, 24x32,8x32 */
    pi4_sad_grid[PART_ID_nLx2N_L] =
        ai4_satd_8x8[0] + ai4_satd_8x8[4] + ai4_satd_8x8[8] + ai4_satd_8x8[12];
    pi4_sad_grid[PART_ID_nLx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[5] + ai4_satd_8x8[9] +
                                    ai4_satd_8x8[13] + pi4_sad_grid[PART_ID_Nx2N_R];
    pi4_sad_grid[PART_ID_nRx2N_L] = ai4_satd_8x8[2] + ai4_satd_8x8[6] + ai4_satd_8x8[10] +
                                    ai4_satd_8x8[14] + pi4_sad_grid[PART_ID_Nx2N_L];
    pi4_sad_grid[PART_ID_nRx2N_R] =
        ai4_satd_8x8[3] + ai4_satd_8x8[7] + ai4_satd_8x8[11] + ai4_satd_8x8[15];
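    /*
     * The asymmetric partitions are assembled from sums already available:
     * nLx2N_L above is the left 8-wide column of 8x8s (0, 4, 8, 12), nLx2N_R
     * adds the second column to the right half (Nx2N_R), and the 2NxnU/2NxnD
     * partitions below reuse the 2NxN halves in the same way.
     */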
pi4_sad_grid[PART_ID_2NxnU_T] =
        ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3];
    pi4_sad_grid[PART_ID_2NxnU_B] = ai4_satd_8x8[4] + ai4_satd_8x8[5] + ai4_satd_8x8[6] +
                                    ai4_satd_8x8[7] + pi4_sad_grid[PART_ID_2NxN_B];
    pi4_sad_grid[PART_ID_2NxnD_T] = ai4_satd_8x8[8] + ai4_satd_8x8[9] + ai4_satd_8x8[10] +
                                    ai4_satd_8x8[11] + pi4_sad_grid[PART_ID_2NxN_T];
    pi4_sad_grid[PART_ID_2NxnD_B] =
        ai4_satd_8x8[12] + ai4_satd_8x8[13] + ai4_satd_8x8[14] + ai4_satd_8x8[15];
}

WORD32 hme_evalsatd_pt_pu_32x32_tu_rec(
    err_prms_t *ps_prms,
    WORD32 lambda,
    WORD32 lambda_q_shift,
    WORD32 i4_frm_qstep,
    me_func_selector_t *ps_func_selector)
{
    S32 ai4_satd_4x4[64]; /* num 4x4s in a 32x32 */
    S32 ai4_satd_8x8[16]; /* num 8x8s in a 32x32 */
    S32 ai4_tu_split_8x8[16];
    S32 ai4_satd_16x16[4]; /* num 16x16 in a 32x32 */
    S32 ai4_tu_split_16x16[4];
    S32 i4_satd_32x32;

    S32 ai4_tu_early_cbf_8x8[16];
    S32 ai4_tu_early_cbf_16x16[4];
    S32 early_cbf_flag;

    S16 *pi2_had_out;

    /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
    S32 *api4_satd_pu[HAD_32x32 + 1];
    S32 *api4_tu_split[HAD_32x32 + 1];
    S32 *api4_tu_early_cbf[HAD_32x32 + 1];

    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
    S32 *pi4_tu_split_flag = ps_prms->pi4_tu_split_flags;
    S32 *pi4_tu_early_cbf = ps_prms->pi4_tu_early_cbf;

    S32 tu_split_flag = 0;
    S32 total_satd_cost = 0;

    U08 *pu1_inp = ps_prms->pu1_inp;
    U08 *pu1_ref = ps_prms->pu1_ref;

    S32 inp_stride = ps_prms->i4_inp_stride;
    S32 ref_stride = ps_prms->i4_ref_stride;

    /* Initialize tu_split_cost to "0" */
    ps_prms->i4_tu_split_cost = 0;

    pi2_had_out = (S16 *)ps_prms->pu1_wkg_mem;

    api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
    api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
    api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[0];
    api4_satd_pu[HAD_32x32] = &i4_satd_32x32;

    api4_tu_split[HAD_4x4] = NULL;
    api4_tu_split[HAD_8x8] = &ai4_tu_split_8x8[0];
    api4_tu_split[HAD_16x16] = &ai4_tu_split_16x16[0];
    api4_tu_split[HAD_32x32] = &tu_split_flag;

    api4_tu_early_cbf[HAD_4x4] = NULL;
    api4_tu_early_cbf[HAD_8x8] = &ai4_tu_early_cbf_8x8[0];
    api4_tu_early_cbf[HAD_16x16] = &ai4_tu_early_cbf_16x16[0];
    api4_tu_early_cbf[HAD_32x32] = &early_cbf_flag;

    /* Call recursive 32x32 HAD module; updates satds for 4x4, 8x8, 16x16 and 32x32 */
    ihevce_had_32x32_r(
        pu1_inp,
        inp_stride,
        pu1_ref,
        ref_stride,
        pi2_had_out,
        32,
        api4_satd_pu,
        api4_tu_split,
        api4_tu_early_cbf,
        0,
        8,
        lambda,
        lambda_q_shift,
        i4_frm_qstep,
        0,
        ps_prms->u1_max_tr_depth,
        ps_prms->u1_max_tr_size,
        &(ps_prms->i4_tu_split_cost),
        ps_func_selector);

    total_satd_cost = i4_satd_32x32;

    /* The structure of the TU_SPLIT flag for the current 32x32 is as follows:
       TL_16x16 - 5 bits (4 for children and LSBit for the 16x16 split)
       TR_16x16 - 5 bits (4 for children and LSBit for the 16x16 split)
       BL_16x16 - 5 bits (4 for children and LSBit for the 16x16 split)
       BR_16x16 - 5 bits (4 for children and LSBit for the 16x16 split)
       32x32_split - 1 bit (LSBit)

       TU_SPLIT : (TL_16x16)_(TR_16x16)_(BL_16x16)_(BR_16x16)_32x32_split (21 bits) */
    pi4_sad_grid[PART_ID_2Nx2N] = total_satd_cost;
    pi4_tu_split_flag[PART_ID_2Nx2N] = tu_split_flag;
    pi4_tu_early_cbf[PART_ID_2Nx2N] = early_cbf_flag;

    return total_satd_cost;
}
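/*
 * Illustrative sketch, kept out of the build with #if 0: unpacking the 21-bit
 * TU-split mask produced above, assuming exactly the packing described in the
 * layout comment (bit 0 = 32x32 split; then, from low to high bits, one 5-bit
 * group per 16x16 child in BR, BL, TR, TL order, each group holding its own
 * split bit in the LSBit plus four 8x8 child bits). The helper name is local
 * to this sketch and not part of the encoder.
 */
#if 0
static void example_unpack_tu_split_mask_32x32(S32 i4_tu_split_flag)
{
    S32 i4_split_32x32 = i4_tu_split_flag & 1; /* LSBit: split 32x32 into 16x16s? */
    S32 i;

    for(i = 0; i < 4; i++)
    {
        /* i = 0 => BR_16x16, 1 => BL_16x16, 2 => TR_16x16, 3 => TL_16x16 */
        S32 i4_group = (i4_tu_split_flag >> (1 + 5 * i)) & 0x1F;
        S32 i4_split_16x16 = i4_group & 1; /* split this 16x16 into 8x8s? */
        S32 i4_split_8x8_bits = (i4_group >> 1) & 0xF; /* one bit per 8x8 child */

        (void)i4_split_16x16;
        (void)i4_split_8x8_bits;
    }
    (void)i4_split_32x32;
}
#endif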
16 16x16 SATDs, as
*          a TU size of 64 is not supported in HEVC
*
*  @param[inout] ps_prms: error prms containing current and ref ptrs, strides,
*                and a pointer to the sad grid of each partition
*
*  @return None
********************************************************************************
*/
void hme_evalsatd_pt_pu_64x64(err_prms_t *ps_prms)
{
    //S32 ai4_satd_4x4[4][64]; /* num 4x4s in a 32x32 * num 32x32 in 64x64 */
    S32 ai4_satd_8x8[4][16]; /* num 8x8s in a 32x32 * num 32x32 in 64x64 */
    S32 ai4_satd_16x16[4][4]; /* num 16x16s in a 32x32 * num 32x32 in 64x64 */
    S32 ai4_satd_32x32[4]; /* num 32x32 in 64x64 */

    // S16 ai2_had_out[32*32];
    S32 i, j;

    // S32 ai4_tu_split_8x8[4][16];
    // S32 ai4_tu_split_16x16[4][4];
    // S32 ai4_tu_split_32x32[4];

    /* Initialize array of ptrs to hold partial SATDs at all levels up to 32x32 */
    S32 *api4_satd_pu[HAD_32x32 + 1];
    // S32 *api4_tu_split[HAD_32x32 + 1];

    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;

    U08 *pu1_inp = ps_prms->pu1_inp;
    U08 *pu1_ref = ps_prms->pu1_ref;

    U08 *pu1_src;
    U08 *pu1_pred;

    S32 inp_stride = ps_prms->i4_inp_stride;
    S32 ref_stride = ps_prms->i4_ref_stride;

    for(i = 0; i < 4; i++)
    {
        S32 blkx = (i & 0x1);
        S32 blky = (i >> 1);
        U08 *pu1_pi0, *pu1_pi1;

        //api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[i][0];
        api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[i][0];
        api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[i][0];
        api4_satd_pu[HAD_32x32] = &ai4_satd_32x32[i];

        pu1_pi0 = pu1_inp + (blkx * 32) + (blky * 32 * inp_stride);
        pu1_pi1 = pu1_ref + (blkx * 32) + (blky * 32 * ref_stride);

        /* Each 32x32 quadrant SATD is computed as the sum of its 16 8x8 SATDs */
        for(j = 0; j < 16; j++)
        {
            pu1_src = pu1_pi0 + ((j & 0x3) << 3) + ((j >> 2) * inp_stride * 8);
            pu1_pred = pu1_pi1 + ((j & 0x3) << 3) + ((j >> 2) * ref_stride * 8);

            ai4_satd_8x8[i][j] = ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
                pu1_src, inp_stride, pu1_pred, ref_stride, NULL, 1);
        }

        /* Cost calculation modified to use only the 8x8 SATDs for the 32x32 */
        ai4_satd_16x16[i][0] =
            ai4_satd_8x8[i][0] + ai4_satd_8x8[i][1] + ai4_satd_8x8[i][4] + ai4_satd_8x8[i][5];
        ai4_satd_16x16[i][1] =
            ai4_satd_8x8[i][2] + ai4_satd_8x8[i][3] + ai4_satd_8x8[i][6] + ai4_satd_8x8[i][7];
        ai4_satd_16x16[i][2] =
            ai4_satd_8x8[i][8] + ai4_satd_8x8[i][9] + ai4_satd_8x8[i][12] + ai4_satd_8x8[i][13];
        ai4_satd_16x16[i][3] =
            ai4_satd_8x8[i][10] + ai4_satd_8x8[i][11] + ai4_satd_8x8[i][14] + ai4_satd_8x8[i][15];
    }

    /* Cost calculation modified to use only the 8x8 SATDs for the 32x32 */
    ai4_satd_32x32[0] =
        ai4_satd_16x16[0][0] + ai4_satd_16x16[0][1] + ai4_satd_16x16[0][2] + ai4_satd_16x16[0][3];
    ai4_satd_32x32[1] =
        ai4_satd_16x16[1][0] + ai4_satd_16x16[1][1] + ai4_satd_16x16[1][2] + ai4_satd_16x16[1][3];
    ai4_satd_32x32[2] =
        ai4_satd_16x16[2][0] + ai4_satd_16x16[2][1] + ai4_satd_16x16[2][2] + ai4_satd_16x16[2][3];
    ai4_satd_32x32[3] =
        ai4_satd_16x16[3][0] + ai4_satd_16x16[3][1] + ai4_satd_16x16[3][2] + ai4_satd_16x16[3][3];

    /* Update 64x64 SATDs */
    pi4_sad_grid[PART_ID_2Nx2N] =
        ai4_satd_32x32[0] + ai4_satd_32x32[1] + ai4_satd_32x32[2] + ai4_satd_32x32[3];

    /* Update 32x32 SATDs */
    pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_32x32[0];
    pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_32x32[1];
    pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_32x32[2];
    pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_32x32[3];

    /* Update 32x64 / 64x32 SATDs */
    pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_32x32[0] + ai4_satd_32x32[2];
    pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_32x32[1] + ai4_satd_32x32[3];
    pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_32x32[0] + ai4_satd_32x32[1];
    pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_32x32[2] + ai4_satd_32x32[3];

    /*
Update AMP SATDs 64x48,64x16, 48x64,16x64 */ pi4_sad_grid[PART_ID_nLx2N_L] = ai4_satd_16x16[0][0] + ai4_satd_16x16[0][2] + ai4_satd_16x16[2][0] + ai4_satd_16x16[2][2]; pi4_sad_grid[PART_ID_nLx2N_R] = ai4_satd_16x16[0][1] + ai4_satd_16x16[0][3] + ai4_satd_16x16[2][1] + ai4_satd_16x16[2][3] + pi4_sad_grid[PART_ID_Nx2N_R]; pi4_sad_grid[PART_ID_nRx2N_L] = ai4_satd_16x16[1][0] + ai4_satd_16x16[1][2] + ai4_satd_16x16[3][0] + ai4_satd_16x16[3][2] + pi4_sad_grid[PART_ID_Nx2N_L]; pi4_sad_grid[PART_ID_nRx2N_R] = ai4_satd_16x16[1][1] + ai4_satd_16x16[1][3] + ai4_satd_16x16[3][1] + ai4_satd_16x16[3][3]; pi4_sad_grid[PART_ID_2NxnU_T] = ai4_satd_16x16[0][0] + ai4_satd_16x16[0][1] + ai4_satd_16x16[1][0] + ai4_satd_16x16[1][1]; pi4_sad_grid[PART_ID_2NxnU_B] = ai4_satd_16x16[0][2] + ai4_satd_16x16[0][3] + ai4_satd_16x16[1][2] + ai4_satd_16x16[1][3] + pi4_sad_grid[PART_ID_2NxN_B]; pi4_sad_grid[PART_ID_2NxnD_T] = ai4_satd_16x16[2][0] + ai4_satd_16x16[2][1] + ai4_satd_16x16[3][0] + ai4_satd_16x16[3][1] + pi4_sad_grid[PART_ID_2NxN_T]; pi4_sad_grid[PART_ID_2NxnD_B] = ai4_satd_16x16[2][2] + ai4_satd_16x16[2][3] + ai4_satd_16x16[3][2] + ai4_satd_16x16[3][3]; } WORD32 hme_evalsatd_pt_pu_64x64_tu_rec( err_prms_t *ps_prms, WORD32 lambda, WORD32 lambda_q_shift, WORD32 i4_frm_qstep, me_func_selector_t *ps_func_selector) { S32 ai4_satd_4x4[64]; /* num 4x4s in a 32x32 * num 32x32 in 64x64 */ S32 ai4_satd_8x8[16]; /* num 8x8s in a 32x32 * num 32x32 in 64x64 */ S32 ai4_satd_16x16[4]; /* num 16x16 in a 32x32* num 32x32 in 64x64 */ S32 ai4_satd_32x32[4]; /* num 32x32 in 64x64 */ S32 ai4_tu_split_8x8[16]; S32 ai4_tu_split_16x16[4]; S32 ai4_tu_early_cbf_8x8[16]; S32 ai4_tu_early_cbf_16x16[4]; S16 *pi2_had_out; S32 i; /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */ S32 *api4_satd_pu[HAD_32x32 + 1]; S32 *api4_tu_split[HAD_32x32 + 1]; S32 *api4_tu_early_cbf[HAD_32x32 + 1]; S32 *pi4_sad_grid = ps_prms->pi4_sad_grid; S32 tu_split_flag = 0; S32 total_satd_cost = 0; U08 *pu1_inp = ps_prms->pu1_inp; U08 *pu1_ref = ps_prms->pu1_ref; S32 inp_stride = ps_prms->i4_inp_stride; S32 ref_stride = ps_prms->i4_ref_stride; /* Initialize tu_split_cost to "0" */ ps_prms->i4_tu_split_cost = 0; pi2_had_out = (S16 *)ps_prms->pu1_wkg_mem; for(i = 0; i < 4; i++) { S32 blkx = (i & 0x1); S32 blky = (i >> 1); U08 *pu1_pi0, *pu1_pi1; tu_split_flag = 0; api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0]; api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0]; api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[0]; api4_satd_pu[HAD_32x32] = &ai4_satd_32x32[i]; api4_tu_split[HAD_4x4] = NULL; api4_tu_split[HAD_8x8] = &ai4_tu_split_8x8[0]; api4_tu_split[HAD_16x16] = &ai4_tu_split_16x16[0]; api4_tu_split[HAD_32x32] = &ps_prms->pi4_tu_split_flags[i]; api4_tu_early_cbf[HAD_4x4] = NULL; api4_tu_early_cbf[HAD_8x8] = &ai4_tu_early_cbf_8x8[0]; api4_tu_early_cbf[HAD_16x16] = &ai4_tu_early_cbf_16x16[0]; api4_tu_early_cbf[HAD_32x32] = &ps_prms->pi4_tu_early_cbf[i]; pu1_pi0 = pu1_inp + (blkx * 32) + (blky * 32 * inp_stride); pu1_pi1 = pu1_ref + (blkx * 32) + (blky * 32 * ref_stride); /* Call recursive 32x32 HAD module; updates satds for 4x4, 8x8, 16x16 and 32x32 */ ihevce_had_32x32_r( pu1_pi0, inp_stride, pu1_pi1, ref_stride, pi2_had_out, 32, api4_satd_pu, api4_tu_split, api4_tu_early_cbf, 0, 8, lambda, lambda_q_shift, i4_frm_qstep, 1, ps_prms->u1_max_tr_depth, ps_prms->u1_max_tr_size, &(ps_prms->i4_tu_split_cost), ps_func_selector); } total_satd_cost = ai4_satd_32x32[0] + ai4_satd_32x32[1] + ai4_satd_32x32[2] + ai4_satd_32x32[3]; /* Update 64x64 SATDs */ 
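    /*************************************************************************/
    /* Side note: each ihevce_had_32x32_r() call above also wrote a 21-bit   */
    /* TU-split word into ps_prms->pi4_tu_split_flags[i], packed as          */
    /* documented in hme_evalsatd_pt_pu_32x32_tu_rec(). The block below is   */
    /* a minimal sketch (compiled out, not encoder code) of how such a word  */
    /* could be unpacked under that packing; the local names are             */
    /* hypothetical and for illustration only.                               */
    /*************************************************************************/
#if 0
    {
        S32 i4_word = ps_prms->pi4_tu_split_flags[0];
        S32 i4_split_32x32 = i4_word & 1; /* LSBit: 32x32 split decision */
        S32 i4_grp;

        /* Groups walk BR, BL, TR, TL from LSB upwards; within each 5-bit */
        /* group the LSBit is the 16x16 split and the remaining 4 bits    */
        /* are the per-8x8 child split decisions.                         */
        for(i4_grp = 0; i4_grp < 4; i4_grp++)
        {
            S32 i4_bits = (i4_word >> (1 + (5 * i4_grp))) & 0x1F;
            S32 i4_split_16x16 = i4_bits & 1;
            S32 i4_child_splits = i4_bits >> 1;

            (void)i4_split_16x16;
            (void)i4_child_splits;
        }
        (void)i4_split_32x32;
    }
#endif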
pi4_sad_grid[PART_ID_2Nx2N] = ai4_satd_32x32[0] + ai4_satd_32x32[1] + ai4_satd_32x32[2] + ai4_satd_32x32[3]; return total_satd_cost; } /** ******************************************************************************** * @fn void hme_subpel_refine_search_node(search_node_t *ps_search_node, * hme_subpel_prms_t *ps_prms, * layer_ctxt_t *ps_curr_layer, * BLK_SIZE_T e_blk_size, * S32 x_off, * S32 y_off) * * @brief Refines a given partition within a CU * * @param[in,out] ps_search_node: supplies starting mv and also ref id. * updated with the accurate subpel mv * * @param[in] ps_prms: subpel prms input to this function * * @param[in] ps_curr_layer : layer context * * @param[in] e_blk_size : Block size enumeration * * @param[in] x_off : x offset of the partition w.r.t. pic start * * @param[in] y_off : y offset of the partition w.r.t. pic start * * @return None ******************************************************************************** */ static __inline PF_SAD_RESULT_FXN_T hme_get_calc_sad_and_result_subpel_fxn( me_func_selector_t *ps_func_selector, ihevce_me_optimised_function_list_t *ps_me_optimised_function_list, S32 i4_part_mask, U08 u1_use_satd, U08 u1_num_parts, U08 u1_num_results) { PF_SAD_RESULT_FXN_T pf_err_compute; ASSERT((1 == u1_num_results) || (2 == u1_num_results)); if(1 == u1_num_results) { if(u1_use_satd) { if(u1_num_parts == 1) { pf_err_compute = ps_func_selector->pf_evalsatd_update_1_best_result_pt_pu_16x16_num_part_eq_1; } else if((u1_num_parts > 1) && (u1_num_parts <= 8)) { pf_err_compute = ps_func_selector->pf_evalsatd_update_1_best_result_pt_pu_16x16_num_part_lt_9; } else { pf_err_compute = ps_func_selector->pf_evalsatd_update_1_best_result_pt_pu_16x16_num_part_lt_17; } } else { if(u1_num_parts == 1) { pf_err_compute = ps_me_optimised_function_list ->pf_calc_sad_and_1_best_result_subpel_num_part_eq_1; } else if(((i4_part_mask & ENABLE_SQUARE_PARTS) != 0) && (u1_num_parts == 5)) { pf_err_compute = ps_me_optimised_function_list->pf_calc_sad_and_1_best_result_subpel_square_parts; } else if((u1_num_parts > 1) && (u1_num_parts <= 8)) { pf_err_compute = ps_me_optimised_function_list ->pf_calc_sad_and_1_best_result_subpel_num_part_lt_9; } else { pf_err_compute = ps_me_optimised_function_list ->pf_calc_sad_and_1_best_result_subpel_num_part_lt_17; } } } else { if(u1_use_satd) { if(u1_num_parts == 1) { pf_err_compute = ps_func_selector->pf_evalsatd_update_2_best_results_pt_pu_16x16_num_part_eq_1; } else if((u1_num_parts > 1) && (u1_num_parts <= 8)) { pf_err_compute = ps_func_selector->pf_evalsatd_update_2_best_results_pt_pu_16x16_num_part_lt_9; } else { pf_err_compute = ps_func_selector->pf_evalsatd_update_2_best_results_pt_pu_16x16_num_part_lt_17; } } else { if(u1_num_parts == 1) { pf_err_compute = ps_me_optimised_function_list ->pf_calc_sad_and_2_best_results_subpel_num_part_eq_1; } else if(((i4_part_mask & ENABLE_SQUARE_PARTS) != 0) && (u1_num_parts == 5)) { pf_err_compute = ps_me_optimised_function_list ->pf_calc_sad_and_2_best_results_subpel_square_parts; } else if((u1_num_parts > 1) && (u1_num_parts <= 8)) { pf_err_compute = ps_me_optimised_function_list ->pf_calc_sad_and_2_best_results_subpel_num_part_lt_9; } else { pf_err_compute = ps_me_optimised_function_list ->pf_calc_sad_and_2_best_results_subpel_num_part_lt_17; } } } return pf_err_compute; } #if DIAMOND_GRID == 1 S32 hme_subpel_refine_search_node_high_speed( search_node_t *ps_search_node, hme_subpel_prms_t *ps_prms, layer_ctxt_t *ps_curr_layer, BLK_SIZE_T e_blk_size, S32 x_off, S32 y_off, search_results_t 
    *ps_search_results,
    S32 pred_lx,
    S32 i4_part_mask,
    S32 *pi4_valid_part_ids,
    S32 search_idx,
    subpel_dedup_enabler_t *ps_dedup_enabler,
    me_func_selector_t *ps_func_selector,
    ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
{
    S32 i4_num_hpel_refine, i4_num_qpel_refine;
    S32 i4_offset, i4_grid_mask;
    S08 i1_ref_idx;
    S32 i4_blk_wd, i4_blk_ht;
    S32 i4_ref_stride, i4_i;
    pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
    result_upd_prms_t s_result_prms;
    search_node_t s_temp_search_node;

    /*************************************************************************/
    /* Tracks the current MV with its fractional component.                  */
    /*************************************************************************/
    S32 i4_mv_x, i4_mv_y;
    S32 i4_frac_x, i4_frac_y;

    /*************************************************************************/
    /* Function pointer for SAD/SATD, array and prms structure to pass to    */
    /* this function                                                         */
    /*************************************************************************/
    PF_SAD_RESULT_FXN_T pf_err_compute;
    S32 ai4_sad_grid[17], i4_tot_cost;
    err_prms_t s_err_prms;

    /*************************************************************************/
    /* Allowed MV RANGE                                                      */
    /*************************************************************************/
    range_prms_t *ps_range_prms;

    /*************************************************************************/
    /* Stores the min id in the grid with the associated min cost.           */
    /*************************************************************************/
    S32 i4_min_cost, i4_min_sad;
    GRID_PT_T e_min_id;

    PF_INTERP_FXN_T pf_qpel_interp;

    /*************************************************************************/
    /* For hpel and qpel we move in diamonds and hence each point in the     */
    /* diamond will belong to a completely different plane. To simplify the  */
    /* look up of the ref ptr, we declare a 2x2 array of ref ptrs for the    */
    /* hpel planes which are interpolated during recon.                      */
    /*************************************************************************/
    U08 *apu1_hpel_ref[4], *pu1_ref;

    interp_prms_t s_interp_prms;

    /*************************************************************************/
    /* Maintains the minimum id of interpolated buffers, and the pointer that*/
    /* points to the corresponding predicted buf with its stride.            */
    /* Note that the pointer cannot be derived just from the id, since the   */
    /* pointer may also point to the hpel buffer (in case we request interp  */
    /* of a hpel pt, which already exists in the recon hpel planes)          */
    /*************************************************************************/
    U08 *pu1_final_out;
    S32 i4_final_out_stride;
    S32 part_id;
    S32 check_for_duplicate = 0;

    subpel_refine_ctxt_t *ps_subpel_refine_ctxt = ps_prms->ps_subpel_refine_ctxt;

    S32 mvx_qpel;
    S32 mvy_qpel;

    pf_err_compute = hme_get_calc_sad_and_result_subpel_fxn(
        ps_func_selector,
        ps_me_optimised_function_list,
        i4_part_mask,
        ps_prms->i4_use_satd,
        ps_subpel_refine_ctxt->i4_num_valid_parts,
        ps_search_results->u1_num_results_per_part);

    i4_num_hpel_refine = ps_prms->i4_num_steps_hpel_refine;
    i4_num_qpel_refine = ps_prms->i4_num_steps_qpel_refine;

    /* Prediction context should now deal with qpel units */
    HME_SET_MVPRED_RES(ps_pred_ctxt, MV_RES_QPEL);

    /* Buffer allocation for subpel */
    /* Current design is that there may be many partitions and different mvs */
    /* that attempt subpel refinement. While there is a possibility of       */
    /* overlap, the hashing to detect and avoid overlap may be very complex. */
    /* So, currently, the only thing done is to store the eventual predicted */
    /* buffer with every ctb node that holds the result of the best subpel   */
    /* search.                                                               */

    /* Compute the base pointers for the input and interpolated buffers.     */
    /* The base pointers point as follows:                                   */
    /* fxfy: 0, 0 :: fxhy: 0, 0.5 :: hxfy: 0.5, 0 :: hxhy: 0.5, 0.5          */
    /* To these, we need to add the offset of the current node               */
    i4_ref_stride = ps_curr_layer->i4_rec_stride;
    i4_offset = x_off + (y_off * i4_ref_stride);
    i1_ref_idx = ps_search_node->i1_ref_idx;

    apu1_hpel_ref[0] = ps_curr_layer->ppu1_list_rec_fxfy[i1_ref_idx] + i4_offset;
    apu1_hpel_ref[1] = ps_curr_layer->ppu1_list_rec_hxfy[i1_ref_idx] + i4_offset;
    apu1_hpel_ref[2] = ps_curr_layer->ppu1_list_rec_fxhy[i1_ref_idx] + i4_offset;
    apu1_hpel_ref[3] = ps_curr_layer->ppu1_list_rec_hxhy[i1_ref_idx] + i4_offset;

    /* Initialize result params used for partition update */
    s_result_prms.pf_mv_cost_compute = NULL;
    s_result_prms.ps_search_results = ps_search_results;
    s_result_prms.pi4_valid_part_ids = pi4_valid_part_ids;
    s_result_prms.i1_ref_idx = ps_search_node->i1_ref_idx;
    s_result_prms.u1_pred_lx = search_idx;
    s_result_prms.i4_part_mask = i4_part_mask;
    s_result_prms.ps_search_node_base = ps_search_node;
    s_result_prms.pi4_sad_grid = &ai4_sad_grid[0];
    s_result_prms.i4_grid_mask = 1;
    s_result_prms.ps_search_node = &s_temp_search_node;

    s_temp_search_node.i1_ref_idx = ps_search_node->i1_ref_idx;

    /* convert to hpel units */
    i4_mv_x = ps_search_node->s_mv.i2_mvx >> 1;
    i4_mv_y = ps_search_node->s_mv.i2_mvy >> 1;

    /* for the first pt, we compute at all locations in the grid, 4 + 1 centre */
    ps_range_prms = ps_prms->aps_mv_range_qpel[i1_ref_idx];

    i4_grid_mask = (GRID_DIAMOND_ENABLE_ALL);
    i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);

    i4_min_cost = MAX_32BIT_VAL;
    i4_min_sad = MAX_32BIT_VAL;

    /*************************************************************************/
    /* Prepare the input params to the SAD/SATD function. Note that the input*/
    /* is passed from the calling function since it may be I (normal subpel  */
    /* refinement) or 2I - P0 in case of bidirect subpel refinement.         */
    /* Both cases are handled here.
*/ /*************************************************************************/ s_err_prms.pu1_inp = (U08 *)ps_prms->pv_inp; s_err_prms.i4_inp_stride = ps_prms->i4_inp_stride; s_err_prms.i4_ref_stride = i4_ref_stride; s_err_prms.i4_part_mask = (ENABLE_2Nx2N); s_err_prms.i4_grid_mask = 1; s_err_prms.pi4_sad_grid = &ai4_sad_grid[0]; s_err_prms.i4_blk_wd = i4_blk_wd = gau1_blk_size_to_wd[e_blk_size]; s_err_prms.i4_blk_ht = i4_blk_ht = gau1_blk_size_to_ht[e_blk_size]; s_result_prms.ps_subpel_refine_ctxt = ps_subpel_refine_ctxt; part_id = ps_search_node->u1_part_id; for(i4_i = 0; i4_i < i4_num_hpel_refine; i4_i++) { e_min_id = PT_C; mvx_qpel = i4_mv_x << 1; mvy_qpel = i4_mv_y << 1; /* Central pt */ if(i4_grid_mask & BIT_EN(PT_C)) { //ps_search_node->i2_mv_x = (S16)i4_mv_x; //ps_search_node->i2_mv_x = (S16)i4_mv_y; /* central pt is i4_mv_x, i4_mv_y */ CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( ps_dedup_enabler, 1, mvx_qpel, mvy_qpel, check_for_duplicate); i4_frac_x = i4_mv_x & 1; i4_frac_y = i4_mv_y & 1; pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x]; s_err_prms.pu1_ref = pu1_ref + (i4_mv_x >> 1) + ((i4_mv_y >> 1) * i4_ref_stride); /* Update the mv's with the current candt motion vectors */ s_result_prms.i2_mv_x = mvx_qpel; s_result_prms.i2_mv_y = mvy_qpel; s_temp_search_node.s_mv.i2_mvx = mvx_qpel; s_temp_search_node.s_mv.i2_mvy = mvy_qpel; pf_err_compute(&s_err_prms, &s_result_prms); i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; if(i4_tot_cost < i4_min_cost) { i4_min_cost = i4_tot_cost; i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; e_min_id = PT_C; pu1_final_out = s_err_prms.pu1_ref; } } /* left pt */ if(i4_grid_mask & BIT_EN(PT_L)) { CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel, check_for_duplicate); if(!check_for_duplicate) { /* search node mv is stored in qpel units */ ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x - 1) << 1); ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1); /* central pt is i4_mv_x - 1, i4_mv_y */ i4_frac_x = (i4_mv_x - 1) & 1; // same as (x-1)&1 i4_frac_y = i4_mv_y & 1; pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x]; s_err_prms.pu1_ref = pu1_ref + ((i4_mv_x - 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride); /* Update the mv's with the current candt motion vectors */ s_result_prms.i2_mv_x = mvx_qpel - 2; s_result_prms.i2_mv_y = mvy_qpel; s_temp_search_node.s_mv.i2_mvx = mvx_qpel - 2; s_temp_search_node.s_mv.i2_mvy = mvy_qpel; pf_err_compute(&s_err_prms, &s_result_prms); //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; if(i4_tot_cost < i4_min_cost) { i4_min_cost = i4_tot_cost; i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; e_min_id = PT_L; pu1_final_out = s_err_prms.pu1_ref; } } } /* top pt */ if(i4_grid_mask & BIT_EN(PT_T)) { CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( ps_dedup_enabler, 1, mvx_qpel, mvy_qpel - 2, check_for_duplicate); if(!check_for_duplicate) { /* search node mv is stored in qpel units */ ps_search_node->s_mv.i2_mvx = (S16)(i4_mv_x << 1); ps_search_node->s_mv.i2_mvy = (S16)((i4_mv_y - 1) << 1); /* top pt is i4_mv_x, i4_mv_y - 1 */ i4_frac_x = i4_mv_x & 1; i4_frac_y = (i4_mv_y - 1) & 1; pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x]; s_err_prms.pu1_ref = pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y - 1) >> 1) * i4_ref_stride); /* Update the mv's with the current candt motion vectors */ s_result_prms.i2_mv_x = mvx_qpel; s_result_prms.i2_mv_y = mvy_qpel - 2; s_temp_search_node.s_mv.i2_mvx = mvx_qpel; s_temp_search_node.s_mv.i2_mvy = mvy_qpel - 2; 
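            /* Worked example for the plane-select arithmetic above         */
            /* (illustrative values): for a hpel-unit mv of (-3, 2),        */
            /* i4_frac_x = (-3) & 1 = 1 and i4_frac_y = 2 & 1 = 0, so the   */
            /* hxfy plane (index 0 * 2 + 1) is used, while (-3) >> 1 = -2   */
            /* and 2 >> 1 = 1 give the fpel offset within that plane.       */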
pf_err_compute(&s_err_prms, &s_result_prms); //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; if(i4_tot_cost < i4_min_cost) { i4_min_cost = i4_tot_cost; i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; e_min_id = PT_T; pu1_final_out = s_err_prms.pu1_ref; } } } /* right pt */ if(i4_grid_mask & BIT_EN(PT_R)) { CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( ps_dedup_enabler, num_unique_nodes, mvx_qpel + 2, mvy_qpel, check_for_duplicate); if(!check_for_duplicate) { /* search node mv is stored in qpel units */ ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x + 1) << 1); ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1); /* right pt is i4_mv_x + 1, i4_mv_y */ i4_frac_x = (i4_mv_x + 1) & 1; i4_frac_y = i4_mv_y & 1; pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x]; s_err_prms.pu1_ref = pu1_ref + ((i4_mv_x + 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride); /* Update the mv's with the current candt motion vectors */ s_result_prms.i2_mv_x = mvx_qpel + 2; s_result_prms.i2_mv_y = mvy_qpel; s_temp_search_node.s_mv.i2_mvx = mvx_qpel + 2; s_temp_search_node.s_mv.i2_mvy = mvy_qpel; pf_err_compute(&s_err_prms, &s_result_prms); //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; if(i4_tot_cost < i4_min_cost) { i4_min_cost = i4_tot_cost; i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; e_min_id = PT_R; pu1_final_out = s_err_prms.pu1_ref; } } } /* bottom pt */ if(i4_grid_mask & BIT_EN(PT_B)) { CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( ps_dedup_enabler, num_unique_nodes, mvx_qpel, mvy_qpel + 2, check_for_duplicate); if(!check_for_duplicate) { /* search node mv is stored in qpel units */ ps_search_node->s_mv.i2_mvx = ((S16)i4_mv_x << 1); ps_search_node->s_mv.i2_mvy = ((S16)(i4_mv_y + 1) << 1); i4_frac_x = i4_mv_x & 1; i4_frac_y = (i4_mv_y + 1) & 1; pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x]; s_err_prms.pu1_ref = pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y + 1) >> 1) * i4_ref_stride); /* Update the mv's with the current candt motion vectors */ s_result_prms.i2_mv_x = mvx_qpel; s_result_prms.i2_mv_y = mvy_qpel + 2; s_temp_search_node.s_mv.i2_mvx = mvx_qpel; s_temp_search_node.s_mv.i2_mvy = mvy_qpel + 2; pf_err_compute(&s_err_prms, &s_result_prms); //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; if(i4_tot_cost < i4_min_cost) { i4_min_cost = i4_tot_cost; i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; e_min_id = PT_B; pu1_final_out = s_err_prms.pu1_ref; } } } /* Early exit in case of central point */ if(e_min_id == PT_C) break; /*********************************************************************/ /* Depending on the best result location, we may be able to skip */ /* atleast two pts, centre pt and one more pt. E.g. 
if right pt is */ /* the best result, the next iteration need not do centre, left pts */ /*********************************************************************/ i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id]; i4_mv_x += gai1_grid_id_to_x[e_min_id]; i4_mv_y += gai1_grid_id_to_y[e_min_id]; ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x; ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y; i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms); } /* Convert to QPEL units */ i4_mv_x <<= 1; i4_mv_y <<= 1; ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x; ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y; /* Exact interpolation or averaging chosen here */ pf_qpel_interp = ps_prms->pf_qpel_interp; /* Next QPEL ME */ /* In this case, we have option of doing exact QPEL interpolation or avg */ /*************************************************************************/ /* x */ /* A b C d */ /* e f g h */ /* I j K l */ /* m n o p */ /* Q r S t */ /* */ /* Approximate QPEL logic */ /* b = avg(A,C) f = avg(I,C), g= avg(C,K) j=avg(I,K) */ /* for any given pt, we can get all the information required about */ /* the surrounding 4 pts. For example, given point C (0.5, 0) */ /* surrounding pts info: */ /* b : qpel offset: 1, 0, generated by averaging. buffer1: fpel buf */ /* buffer 2: hxfy, offsets for both are 0, 0 */ /* similarly for other pts the info can be gotten */ /*************************************************************************/ i4_grid_mask = GRID_DIAMOND_ENABLE_ALL ^ (BIT_EN(PT_C)); i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms); /*************************************************************************/ /* One time preparation of non changing interpolation params. These */ /* include a set of ping pong result buf ptrs, input buf ptrs and some */ /* working memory (not used though in case of averaging). */ /*************************************************************************/ s_interp_prms.ppu1_ref = &apu1_hpel_ref[0]; s_interp_prms.i4_ref_stride = i4_ref_stride; s_interp_prms.i4_blk_wd = i4_blk_wd; s_interp_prms.i4_blk_ht = i4_blk_ht; i4_final_out_stride = i4_ref_stride; { U08 *pu1_mem; /*********************************************************************/ /* Allocation of working memory for interpolated buffers. We maintain*/ /* an intermediate working buffer, and 2 ping pong interpolated out */ /* buffers, purpose of ping pong explained later below */ /*********************************************************************/ pu1_mem = ps_prms->pu1_wkg_mem; s_interp_prms.pu1_wkg_mem = pu1_mem; //pu1_mem += (INTERP_INTERMED_BUF_SIZE); s_interp_prms.apu1_interp_out[0] = pu1_mem; pu1_mem += (INTERP_OUT_BUF_SIZE); s_interp_prms.apu1_interp_out[1] = pu1_mem; pu1_mem += (INTERP_OUT_BUF_SIZE); s_interp_prms.apu1_interp_out[2] = pu1_mem; pu1_mem += (INTERP_OUT_BUF_SIZE); s_interp_prms.apu1_interp_out[3] = pu1_mem; pu1_mem += (INTERP_OUT_BUF_SIZE); s_interp_prms.apu1_interp_out[4] = pu1_mem; /*********************************************************************/ /* Stride of interpolated output is just a function of blk width of */ /* this partition and hence remains constant for this partition */ /*********************************************************************/ s_interp_prms.i4_out_stride = (i4_blk_wd); } { UWORD8 *apu1_final[4]; WORD32 ai4_ref_stride[4]; /*************************************************************************/ /* Ping pong design for interpolated buffers. 
We use a min id, which */ /* tracks the id of the ppu1_interp_out that stores the best result. */ /* When new interp to be done, it uses 1 - bes result id to do the interp*/ /* min id is toggled when any new result becomes the best result. */ /*************************************************************************/ for(i4_i = 0; i4_i < i4_num_qpel_refine; i4_i++) { e_min_id = PT_C; mvx_qpel = i4_mv_x; mvy_qpel = i4_mv_y; hme_qpel_interp_comprehensive( &s_interp_prms, apu1_final, ai4_ref_stride, i4_mv_x, i4_mv_y, i4_grid_mask, ps_me_optimised_function_list); if(i4_grid_mask & BIT_EN(PT_L)) { CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( ps_dedup_enabler, num_unique_nodes, mvx_qpel - 1, mvy_qpel - 0, check_for_duplicate); if(!check_for_duplicate) { ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1; ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y; s_err_prms.pu1_ref = apu1_final[0]; s_err_prms.i4_ref_stride = ai4_ref_stride[0]; /* Update the mv's with the current candt motion vectors */ s_result_prms.i2_mv_x = mvx_qpel - 1; s_result_prms.i2_mv_y = mvy_qpel; s_temp_search_node.s_mv.i2_mvx = mvx_qpel - 1; s_temp_search_node.s_mv.i2_mvy = mvy_qpel; pf_err_compute(&s_err_prms, &s_result_prms); //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; if(i4_tot_cost < i4_min_cost) { e_min_id = PT_L; i4_min_cost = i4_tot_cost; i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; } } } if(i4_grid_mask & BIT_EN(PT_T)) { CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( ps_dedup_enabler, num_unique_nodes, mvx_qpel - 0, mvy_qpel - 1, check_for_duplicate); if(!check_for_duplicate) { ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x; ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1; s_err_prms.pu1_ref = apu1_final[1]; s_err_prms.i4_ref_stride = ai4_ref_stride[1]; /* Update the mv's with the current candt motion vectors */ s_result_prms.i2_mv_x = mvx_qpel; s_result_prms.i2_mv_y = mvy_qpel - 1; s_temp_search_node.s_mv.i2_mvx = mvx_qpel; s_temp_search_node.s_mv.i2_mvy = mvy_qpel - 1; pf_err_compute(&s_err_prms, &s_result_prms); //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; if(i4_tot_cost < i4_min_cost) { e_min_id = PT_T; i4_min_cost = i4_tot_cost; i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; } } } if(i4_grid_mask & BIT_EN(PT_R)) { CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( ps_dedup_enabler, num_unique_nodes, mvx_qpel + 1, mvy_qpel, check_for_duplicate); if(!check_for_duplicate) { ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1; ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y; s_err_prms.pu1_ref = apu1_final[2]; s_err_prms.i4_ref_stride = ai4_ref_stride[2]; /* Update the mv's with the current candt motion vectors */ s_result_prms.i2_mv_x = mvx_qpel + 1; s_result_prms.i2_mv_y = mvy_qpel; s_temp_search_node.s_mv.i2_mvx = mvx_qpel + 1; s_temp_search_node.s_mv.i2_mvy = mvy_qpel; pf_err_compute(&s_err_prms, &s_result_prms); //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; if(i4_tot_cost < i4_min_cost) { e_min_id = PT_R; i4_min_cost = i4_tot_cost; i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; } } } /* i4_mv_x and i4_mv_y will always be the centre pt */ /* for qpel we start with least hpel, and hence compute of center pt never reqd */ if(i4_grid_mask & BIT_EN(PT_B)) { CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( ps_dedup_enabler, num_unique_nodes, mvx_qpel, mvy_qpel + 1, check_for_duplicate); if(!check_for_duplicate) { ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x; ps_search_node->s_mv.i2_mvy 
= (S16)i4_mv_y + 1; s_err_prms.pu1_ref = apu1_final[3]; s_err_prms.i4_ref_stride = ai4_ref_stride[3]; /* Update the mv's with the current candt motion vectors */ s_result_prms.i2_mv_x = mvx_qpel; s_result_prms.i2_mv_y = mvy_qpel + 1; s_temp_search_node.s_mv.i2_mvx = mvx_qpel; s_temp_search_node.s_mv.i2_mvy = mvy_qpel + 1; pf_err_compute(&s_err_prms, &s_result_prms); //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; if(i4_tot_cost < i4_min_cost) { e_min_id = PT_B; i4_min_cost = i4_tot_cost; i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; } } } /* New QPEL mv x and y */ if(e_min_id == PT_C) break; i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id]; i4_mv_x += gai1_grid_id_to_x[e_min_id]; i4_mv_y += gai1_grid_id_to_y[e_min_id]; ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x; ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y; i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms); } } /* update modified motion vectors and cost at end of subpel */ ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x; ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y; ps_search_node->i4_tot_cost = i4_min_cost; ps_search_node->i4_sad = i4_min_sad; /********************************************************************************/ /* TODO: Restoring back Sad lambda from Hadamard lambda */ /* Need to pass the had/satd lambda in more cleaner way for subpel cost compute */ /********************************************************************************/ //ps_pred_ctxt->lambda >>= 1; return (i4_min_cost); } #elif DIAMOND_GRID == 0 S32 hme_subpel_refine_search_node_high_speed( search_node_t *ps_search_node, hme_subpel_prms_t *ps_prms, layer_ctxt_t *ps_curr_layer, BLK_SIZE_T e_blk_size, S32 x_off, S32 y_off, search_results_t *ps_search_results, S32 pred_lx, S32 i4_part_mask, S32 *pi4_valid_part_ids, S32 search_idx, subpel_dedup_enabler_t *ps_dedup_enabler, me_func_selector_t *ps_func_selector) { S32 i4_num_hpel_refine, i4_num_qpel_refine; S32 i4_offset, i4_grid_mask; S08 i1_ref_idx; S32 i4_blk_wd, i4_blk_ht; S32 i4_ref_stride, i4_i; pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx]; result_upd_prms_t s_result_prms; /*************************************************************************/ /* Tracks current MV with the fractional component. */ /*************************************************************************/ S32 i4_mv_x, i4_mv_y; S32 i4_frac_x, i4_frac_y; /*************************************************************************/ /* Function pointer for SAD/SATD, array and prms structure to pass to */ /* This function */ /*************************************************************************/ PF_SAD_FXN_T pf_err_compute; S32 ai4_sad_grid[9][17], i4_tot_cost; err_prms_t s_err_prms; /*************************************************************************/ /* Allowed MV RANGE */ /*************************************************************************/ range_prms_t *ps_range_prms; /*************************************************************************/ /* stores min id in grid with associated min cost. */ /*************************************************************************/ S32 i4_min_cost, i4_min_sad; GRID_PT_T e_min_id; PF_INTERP_FXN_T pf_qpel_interp; /*************************************************************************/ /* For hpel and qpel we move in diamonds and hence each point in the */ /* diamond will belong to a completely different plane. 
    To simplify the                                                          */
    /* look up of the ref ptr, we declare a 2x2 array of ref ptrs for the    */
    /* hpel planes which are interpolated during recon.                      */
    /*************************************************************************/
    U08 *apu1_hpel_ref[4], *pu1_ref;

    interp_prms_t s_interp_prms;

    /*************************************************************************/
    /* Maintains the minimum id of interpolated buffers, and the pointer that*/
    /* points to the corresponding predicted buf with its stride.            */
    /* Note that the pointer cannot be derived just from the id, since the   */
    /* pointer may also point to the hpel buffer (in case we request interp  */
    /* of a hpel pt, which already exists in the recon hpel planes)          */
    /*************************************************************************/
    U08 *pu1_final_out;
    S32 i4_final_out_stride;
    S32 part_id;
    S32 check_for_duplicate = 0;

    S32 mvx_qpel;
    S32 mvy_qpel;

    /*************************************************************************/
    /* Appropriate err compute fxn; depends on SAD/SATD and blk size, and    */
    /* remains fixed through this subpel refinement for this partition.      */
    /* Note, we do not enable grid sads since each pt is in a different      */
    /* buffer. Hence, the part mask is also nearly don't care and we use     */
    /* 2Nx2N enabled.                                                        */
    /*************************************************************************/
    if(ps_prms->i4_use_satd)
    {
        pf_err_compute = hme_evalsatd_update_1_best_result_pt_pu_16x16;
    }
    else
    {
        pf_err_compute = hme_evalsad_grid_pu_16x16; /* hme_evalsad_pt_pu_16x16; */
    }

    i4_num_hpel_refine = ps_prms->i4_num_steps_hpel_refine;
    i4_num_qpel_refine = ps_prms->i4_num_steps_qpel_refine;

    /* Prediction context should now deal with qpel units */
    HME_SET_MVPRED_RES(ps_pred_ctxt, MV_RES_QPEL);

    /* Buffer allocation for subpel */
    /* Current design is that there may be many partitions and different mvs */
    /* that attempt subpel refinement. While there is a possibility of       */
    /* overlap, the hashing to detect and avoid overlap may be very complex. */
    /* So, currently, the only thing done is to store the eventual predicted */
    /* buffer with every ctb node that holds the result of the best subpel   */
    /* search.                                                               */

    /* Compute the base pointers for the input and interpolated buffers.     */
    /* The base pointers point as follows:                                   */
    /* fxfy: 0, 0 :: fxhy: 0, 0.5 :: hxfy: 0.5, 0 :: hxhy: 0.5, 0.5          */
    /* To these, we need to add the offset of the current node               */
    i4_ref_stride = ps_curr_layer->i4_rec_stride;
    i4_offset = x_off + (y_off * i4_ref_stride);
    i1_ref_idx = ps_search_node->i1_ref_idx;

    apu1_hpel_ref[0] = ps_curr_layer->ppu1_list_rec_fxfy[i1_ref_idx] + i4_offset;
    apu1_hpel_ref[1] = ps_curr_layer->ppu1_list_rec_hxfy[i1_ref_idx] + i4_offset;
    apu1_hpel_ref[2] = ps_curr_layer->ppu1_list_rec_fxhy[i1_ref_idx] + i4_offset;
    apu1_hpel_ref[3] = ps_curr_layer->ppu1_list_rec_hxhy[i1_ref_idx] + i4_offset;

    /* Initialize result params used for partition update */
    s_result_prms.pf_mv_cost_compute = NULL;
    s_result_prms.ps_search_results = ps_search_results;
    s_result_prms.pi4_valid_part_ids = pi4_valid_part_ids;
    s_result_prms.i1_ref_idx = search_idx;
    s_result_prms.i4_part_mask = i4_part_mask;
    s_result_prms.ps_search_node_base = ps_search_node;
    s_result_prms.pi4_sad_grid = &ai4_sad_grid[0][0];
    s_result_prms.i4_grid_mask = 1;

    /* convert to hpel units */
    i4_mv_x = ps_search_node->s_mv.i2_mvx >> 1;
    i4_mv_y = ps_search_node->s_mv.i2_mvy >> 1;

    /* for the first pt, we compute at all locations in the grid, 4 + 1 centre */
    ps_range_prms = ps_prms->ps_mv_range_qpel;

    i4_grid_mask = (GRID_ALL_PTS_VALID);
    i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);

    i4_min_cost = MAX_32BIT_VAL;
    i4_min_sad = MAX_32BIT_VAL;

    /*************************************************************************/
    /* Prepare the input params to the SAD/SATD function. Note that the input*/
    /* is passed from the calling function since it may be I (normal subpel  */
    /* refinement) or 2I - P0 in case of bidirect subpel refinement.         */
    /* Both cases are handled here.
*/ /*************************************************************************/ s_err_prms.pu1_inp = (U08 *)ps_prms->pv_inp; s_err_prms.i4_inp_stride = ps_prms->i4_inp_stride; s_err_prms.i4_ref_stride = i4_ref_stride; s_err_prms.i4_part_mask = (ENABLE_2Nx2N); s_err_prms.i4_grid_mask = 1; s_err_prms.pi4_sad_grid = &ai4_sad_grid[0][0]; s_err_prms.i4_blk_wd = i4_blk_wd = gau1_blk_size_to_wd[e_blk_size]; s_err_prms.i4_blk_ht = i4_blk_ht = gau1_blk_size_to_ht[e_blk_size]; /* TODO: Currently doubling lambda for Hadamard Sad instead of 1.9*sadlambda */ //ps_pred_ctxt->lambda <<= 1; part_id = ps_search_node->u1_part_id; for(i4_i = 0; i4_i < i4_num_hpel_refine; i4_i++) { e_min_id = PT_C; mvx_qpel = i4_mv_x << 1; mvy_qpel = i4_mv_y << 1; /* Central pt */ if(i4_grid_mask & BIT_EN(PT_C)) { //ps_search_node->i2_mv_x = (S16)i4_mv_x; //ps_search_node->i2_mv_x = (S16)i4_mv_y; /* central pt is i4_mv_x, i4_mv_y */ CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( ps_dedup_enabler, 1, mvx_qpel, mvy_qpel, check_for_duplicate); i4_frac_x = i4_mv_x & 1; i4_frac_y = i4_mv_y & 1; pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x]; s_err_prms.pu1_ref = pu1_ref + (i4_mv_x >> 1) + ((i4_mv_y >> 1) * i4_ref_stride); pf_err_compute(&s_err_prms); /* Update the mv's with the current candt motion vectors */ s_result_prms.i2_mv_x = mvx_qpel; s_result_prms.i2_mv_y = mvy_qpel; hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; if(i4_tot_cost < i4_min_cost) { i4_min_cost = i4_tot_cost; i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; e_min_id = PT_C; pu1_final_out = s_err_prms.pu1_ref; } } /* left pt */ if(i4_grid_mask & BIT_EN(PT_L)) { CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel, check_for_duplicate); if(!check_for_duplicate) { /* search node mv is stored in qpel units */ ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x - 1) << 1); ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1); /* central pt is i4_mv_x - 1, i4_mv_y */ i4_frac_x = (i4_mv_x - 1) & 1; // same as (x-1)&1 i4_frac_y = i4_mv_y & 1; pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x]; s_err_prms.pu1_ref = pu1_ref + ((i4_mv_x - 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride); pf_err_compute(&s_err_prms); /* Update the mv's with the current candt motion vectors */ s_result_prms.i2_mv_x = mvx_qpel; s_result_prms.i2_mv_y = mvy_qpel; hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; if(i4_tot_cost < i4_min_cost) { i4_min_cost = i4_tot_cost; i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; e_min_id = PT_L; pu1_final_out = s_err_prms.pu1_ref; } } } /* top pt */ if(i4_grid_mask & BIT_EN(PT_T)) { CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( ps_dedup_enabler, 1, mvx_qpel, mvy_qpel - 2, check_for_duplicate); if(!check_for_duplicate) { /* search node mv is stored in qpel units */ ps_search_node->s_mv.i2_mvx = (S16)(i4_mv_x << 1); ps_search_node->s_mv.i2_mvy = (S16)((i4_mv_y - 1) << 1); /* top pt is i4_mv_x, i4_mv_y - 1 */ i4_frac_x = i4_mv_x & 1; i4_frac_y = (i4_mv_y - 1) & 1; pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x]; s_err_prms.pu1_ref = pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y - 1) >> 1) * i4_ref_stride); pf_err_compute(&s_err_prms); /* Update the mv's with the current candt motion vectors */ s_result_prms.i2_mv_x = mvx_qpel; s_result_prms.i2_mv_y = mvy_qpel - 2; hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; if(i4_tot_cost < i4_min_cost) { i4_min_cost = i4_tot_cost; i4_min_sad = 
s_err_prms.pi4_sad_grid[part_id]; e_min_id = PT_T; pu1_final_out = s_err_prms.pu1_ref; } } } /* right pt */ if(i4_grid_mask & BIT_EN(PT_R)) { CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( ps_dedup_enabler, 1, mvx_qpel + 2, mvy_qpel, check_for_duplicate); if(!check_for_duplicate) { /* search node mv is stored in qpel units */ ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x + 1) << 1); ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1); /* right pt is i4_mv_x + 1, i4_mv_y */ i4_frac_x = (i4_mv_x + 1) & 1; i4_frac_y = i4_mv_y & 1; pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x]; s_err_prms.pu1_ref = pu1_ref + ((i4_mv_x + 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride); pf_err_compute(&s_err_prms); /* Update the mv's with the current candt motion vectors */ s_result_prms.i2_mv_x = mvx_qpel + 2; s_result_prms.i2_mv_y = mvy_qpel; hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; if(i4_tot_cost < i4_min_cost) { i4_min_cost = i4_tot_cost; i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; e_min_id = PT_R; pu1_final_out = s_err_prms.pu1_ref; } } } /* bottom pt */ if(i4_grid_mask & BIT_EN(PT_B)) { CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( ps_dedup_enabler, 1, mvx_qpel, mvy_qpel + 2, check_for_duplicate); if(!check_for_duplicate) { /* search node mv is stored in qpel units */ ps_search_node->s_mv.i2_mvx = ((S16)i4_mv_x << 1); ps_search_node->s_mv.i2_mvy = ((S16)(i4_mv_y + 1) << 1); i4_frac_x = i4_mv_x & 1; i4_frac_y = (i4_mv_y + 1) & 1; pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x]; s_err_prms.pu1_ref = pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y + 1) >> 1) * i4_ref_stride); pf_err_compute(&s_err_prms); /* Update the mv's with the current candt motion vectors */ s_result_prms.i2_mv_x = mvx_qpel; s_result_prms.i2_mv_y = mvy_qpel + 2; hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; if(i4_tot_cost < i4_min_cost) { i4_min_cost = i4_tot_cost; i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; e_min_id = PT_B; pu1_final_out = s_err_prms.pu1_ref; } } } if(e_min_id == PT_C) { if(!i4_i) { /* TL pt */ if(i4_grid_mask & BIT_EN(PT_TL)) { S32 mvx_minus_1 = (i4_mv_x - 1); S32 mvy_minus_1 = (i4_mv_y - 1); CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel - 2, check_for_duplicate); if(!check_for_duplicate) { /* search node mv is stored in qpel units */ ps_search_node->s_mv.i2_mvx = ((S16)mvx_minus_1 << 1); ps_search_node->s_mv.i2_mvy = ((S16)mvy_minus_1 << 1); i4_frac_x = mvx_minus_1 & 1; i4_frac_y = mvy_minus_1 & 1; pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x]; s_err_prms.pu1_ref = pu1_ref + (mvx_minus_1 >> 1) + ((mvy_minus_1 >> 1) * i4_ref_stride); pf_err_compute(&s_err_prms); /* Update the mv's with the current candt motion vectors */ s_result_prms.i2_mv_x = mvx_qpel - 2; s_result_prms.i2_mv_y = mvy_qpel - 2; hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; if(i4_tot_cost < i4_min_cost) { i4_min_cost = i4_tot_cost; i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; e_min_id = PT_TL; pu1_final_out = s_err_prms.pu1_ref; } } } /* TR pt */ if(i4_grid_mask & BIT_EN(PT_TR)) { S32 mvx_plus_1 = (i4_mv_x + 1); S32 mvy_minus_1 = (i4_mv_y - 1); CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( ps_dedup_enabler, 1, mvx_qpel + 2, mvy_qpel - 2, check_for_duplicate); if(!check_for_duplicate) { /* search node mv is stored in qpel units */ ps_search_node->s_mv.i2_mvx = ((S16)mvx_plus_1 << 1); ps_search_node->s_mv.i2_mvy = ((S16)mvy_minus_1 << 1); 
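                    /* Unit bookkeeping note: the search node always stores  */
                    /* mvs in qpel units while this loop walks in hpel       */
                    /* units, hence the "<< 1" on writeback above and the    */
                    /* "& 1" / ">> 1" plane math below. A tiny sketch        */
                    /* (compiled out, values illustrative):                  */
#if 0
                    {
                        S16 i2_mv_qpel = -7; /* some qpel-unit mv component  */
                        S32 i4_mv_hpel = i2_mv_qpel >> 1; /* -4: hpel (floor)*/
                        S32 i4_frac = i4_mv_hpel & 1; /* fractional hpel part*/
                        S32 i4_qpel_back = i4_mv_hpel << 1; /* -8: qpel again*/
                                                            /* frac discarded*/
                        (void)i4_frac;
                        (void)i4_qpel_back;
                    }
#endif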
i4_frac_x = mvx_plus_1 & 1; i4_frac_y = mvy_minus_1 & 1; pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x]; s_err_prms.pu1_ref = pu1_ref + (mvx_plus_1 >> 1) + ((mvy_minus_1 >> 1) * i4_ref_stride); pf_err_compute(&s_err_prms); /* Update the mv's with the current candt motion vectors */ s_result_prms.i2_mv_x = mvx_qpel + 2; s_result_prms.i2_mv_y = mvy_qpel - 2; hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; if(i4_tot_cost < i4_min_cost) { i4_min_cost = i4_tot_cost; i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; e_min_id = PT_TR; pu1_final_out = s_err_prms.pu1_ref; } } } /* BL pt */ if(i4_grid_mask & BIT_EN(PT_BL)) { S32 mvx_minus_1 = (i4_mv_x - 1); S32 mvy_plus_1 = (i4_mv_y + 1); CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel + 2, check_for_duplicate); if(!check_for_duplicate) { /* search node mv is stored in qpel units */ ps_search_node->s_mv.i2_mvx = ((S16)mvx_minus_1 << 1); ps_search_node->s_mv.i2_mvy = ((S16)mvy_plus_1 << 1); i4_frac_x = mvx_minus_1 & 1; i4_frac_y = mvy_plus_1 & 1; pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x]; s_err_prms.pu1_ref = pu1_ref + (mvx_minus_1 >> 1) + ((mvy_plus_1 >> 1) * i4_ref_stride); pf_err_compute(&s_err_prms); /* Update the mv's with the current candt motion vectors */ s_result_prms.i2_mv_x = mvx_qpel - 2; s_result_prms.i2_mv_y = mvy_qpel + 2; hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; if(i4_tot_cost < i4_min_cost) { i4_min_cost = i4_tot_cost; i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; e_min_id = PT_BL; pu1_final_out = s_err_prms.pu1_ref; } } } /* BR pt */ if(i4_grid_mask & BIT_EN(PT_BR)) { S32 mvx_plus_1 = (i4_mv_x + 1); S32 mvy_plus_1 = (i4_mv_y + 1); CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( ps_dedup_enabler, 1, mvx_qpel + 2, mvy_qpel + 2, check_for_duplicate); if(!check_for_duplicate) { /* search node mv is stored in qpel units */ ps_search_node->s_mv.i2_mvx = ((S16)mvx_plus_1 << 1); ps_search_node->s_mv.i2_mvy = ((S16)mvy_plus_1 << 1); i4_frac_x = mvx_plus_1 & 1; i4_frac_y = mvy_plus_1 & 1; pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x]; s_err_prms.pu1_ref = pu1_ref + (mvx_plus_1 >> 1) + ((mvy_plus_1 >> 1) * i4_ref_stride); pf_err_compute(&s_err_prms); /* Update the mv's with the current candt motion vectors */ s_result_prms.i2_mv_x = mvx_qpel + 2; s_result_prms.i2_mv_y = mvy_qpel + 2; hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; if(i4_tot_cost < i4_min_cost) { i4_min_cost = i4_tot_cost; i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; e_min_id = PT_BR; pu1_final_out = s_err_prms.pu1_ref; } } } if(e_min_id == PT_C) { break; } } else { break; } } /*********************************************************************/ /* Depending on the best result location, we may be able to skip */ /* atleast two pts, centre pt and one more pt. E.g. 
if right pt is */ /* the best result, the next iteration need not do centre, left pts */ /*********************************************************************/ if(i4_i) { i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id]; } else { i4_grid_mask = gai4_opt_grid_mask_conventional[e_min_id]; } i4_mv_x += gai1_grid_id_to_x[e_min_id]; i4_mv_y += gai1_grid_id_to_y[e_min_id]; ps_search_node->s_mv.i2_mvx = (S16)(i4_mv_x << 1); ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1); i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms); } /* Convert to QPEL units */ i4_mv_x <<= 1; i4_mv_y <<= 1; ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x; ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y; /* Early exit if this partition is visiting same hpel mv again */ /* Assumption : Checkin for early exit in best result of partition */ if((ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_x == ps_search_node->s_mv.i2_mvx) && (ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_y == ps_search_node->s_mv.i2_mvy)) { return (ps_search_results->aps_part_results[search_idx][part_id][0].i4_tot_cost); } else { /* Store the best hpel mv for future early exit checks */ ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_x = (S16)i4_mv_x; ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_y = (S16)i4_mv_y; } /* Early exit if this partition is visiting same hpel mv again */ /* Assumption : Checkin for early exit in second best result of partition */ if((ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_x == ps_search_node->s_mv.i2_mvx) && (ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_y == ps_search_node->s_mv.i2_mvy)) { return (ps_search_results->aps_part_results[search_idx][part_id][1].i4_tot_cost); } else { /* Store the best hpel mv for future early exit checks */ ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_x = (S16)i4_mv_x; ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_y = (S16)i4_mv_y; } /* Exact interpolation or averaging chosen here */ pf_qpel_interp = ps_prms->pf_qpel_interp; /* Next QPEL ME */ /* In this case, we have option of doing exact QPEL interpolation or avg */ /*************************************************************************/ /* x */ /* A b C d */ /* e f g h */ /* I j K l */ /* m n o p */ /* Q r S t */ /* */ /* Approximate QPEL logic */ /* b = avg(A,C) f = avg(I,C), g= avg(C,K) j=avg(I,K) */ /* for any given pt, we can get all the information required about */ /* the surrounding 4 pts. For example, given point C (0.5, 0) */ /* surrounding pts info: */ /* b : qpel offset: 1, 0, generated by averaging. buffer1: fpel buf */ /* buffer 2: hxfy, offsets for both are 0, 0 */ /* similarly for other pts the info can be gotten */ /*************************************************************************/ i4_grid_mask = GRID_ALL_PTS_VALID ^ (BIT_EN(PT_C)); i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms); /*************************************************************************/ /* One time preparation of non changing interpolation params. These */ /* include a set of ping pong result buf ptrs, input buf ptrs and some */ /* working memory (not used though in case of averaging). 
*/ /*************************************************************************/ s_interp_prms.ppu1_ref = &apu1_hpel_ref[0]; s_interp_prms.i4_ref_stride = i4_ref_stride; s_interp_prms.i4_blk_wd = i4_blk_wd; s_interp_prms.i4_blk_ht = i4_blk_ht; i4_final_out_stride = i4_ref_stride; { U08 *pu1_mem; /*********************************************************************/ /* Allocation of working memory for interpolated buffers. We maintain*/ /* an intermediate working buffer, and 2 ping pong interpolated out */ /* buffers, purpose of ping pong explained later below */ /*********************************************************************/ pu1_mem = ps_prms->pu1_wkg_mem; s_interp_prms.pu1_wkg_mem = pu1_mem; //pu1_mem += (INTERP_INTERMED_BUF_SIZE); s_interp_prms.apu1_interp_out[0] = pu1_mem; pu1_mem += (INTERP_OUT_BUF_SIZE); s_interp_prms.apu1_interp_out[1] = pu1_mem; pu1_mem += (INTERP_OUT_BUF_SIZE); s_interp_prms.apu1_interp_out[2] = pu1_mem; pu1_mem += (INTERP_OUT_BUF_SIZE); s_interp_prms.apu1_interp_out[3] = pu1_mem; pu1_mem += (INTERP_OUT_BUF_SIZE); s_interp_prms.apu1_interp_out[4] = pu1_mem; /*********************************************************************/ /* Stride of interpolated output is just a function of blk width of */ /* this partition and hence remains constant for this partition */ /*********************************************************************/ s_interp_prms.i4_out_stride = (i4_blk_wd); } { UWORD8 *apu1_final[4]; WORD32 ai4_ref_stride[4]; /*************************************************************************/ /* Ping pong design for interpolated buffers. We use a min id, which */ /* tracks the id of the ppu1_interp_out that stores the best result. */ /* When new interp to be done, it uses 1 - bes result id to do the interp*/ /* min id is toggled when any new result becomes the best result. 
*/ /*************************************************************************/ for(i4_i = 0; i4_i < i4_num_qpel_refine; i4_i++) { e_min_id = PT_C; hme_qpel_interp_comprehensive( &s_interp_prms, apu1_final, ai4_ref_stride, i4_mv_x, i4_mv_y, i4_grid_mask); mvx_qpel = i4_mv_x; mvy_qpel = i4_mv_y; if(i4_grid_mask & BIT_EN(PT_L)) { CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( ps_dedup_enabler, 1, mvx_qpel - 1, mvy_qpel - 0, check_for_duplicate); if(!check_for_duplicate) { ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1; ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y; s_err_prms.pu1_ref = apu1_final[0]; s_err_prms.i4_ref_stride = ai4_ref_stride[0]; pf_err_compute(&s_err_prms); /* Update the mv's with the current candt motion vectors */ s_result_prms.i2_mv_x = mvx_qpel - 1; s_result_prms.i2_mv_y = mvy_qpel; hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; if(i4_tot_cost < i4_min_cost) { e_min_id = PT_L; i4_min_cost = i4_tot_cost; i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; } } } if(i4_grid_mask & BIT_EN(PT_T)) { CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( ps_dedup_enabler, 1, mvx_qpel - 0, mvy_qpel - 1, check_for_duplicate); if(!check_for_duplicate) { ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x; ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1; s_err_prms.pu1_ref = apu1_final[1]; s_err_prms.i4_ref_stride = ai4_ref_stride[1]; pf_err_compute(&s_err_prms); /* Update the mv's with the current candt motion vectors */ s_result_prms.i2_mv_x = mvx_qpel; s_result_prms.i2_mv_y = mvy_qpel - 1; hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; if(i4_tot_cost < i4_min_cost) { e_min_id = PT_T; i4_min_cost = i4_tot_cost; i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; } } } if(i4_grid_mask & BIT_EN(PT_R)) { CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( ps_dedup_enabler, 1, mvx_qpel + 1, mvy_qpel, check_for_duplicate); if(!check_for_duplicate) { ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1; ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y; s_err_prms.pu1_ref = apu1_final[2]; s_err_prms.i4_ref_stride = ai4_ref_stride[2]; pf_err_compute(&s_err_prms); /* Update the mv's with the current candt motion vectors */ s_result_prms.i2_mv_x = mvx_qpel + 1; s_result_prms.i2_mv_y = mvy_qpel; hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; if(i4_tot_cost < i4_min_cost) { e_min_id = PT_R; i4_min_cost = i4_tot_cost; i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; } } } /* i4_mv_x and i4_mv_y will always be the centre pt */ /* for qpel we start with least hpel, and hence compute of center pt never reqd */ if(i4_grid_mask & BIT_EN(PT_B)) { CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( ps_dedup_enabler, 1, mvx_qpel, mvy_qpel + 1, check_for_duplicate); if(!check_for_duplicate) { ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x; ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1; s_err_prms.pu1_ref = apu1_final[3]; s_err_prms.i4_ref_stride = ai4_ref_stride[3]; pf_err_compute(&s_err_prms); /* Update the mv's with the current candt motion vectors */ s_result_prms.i2_mv_x = mvx_qpel; s_result_prms.i2_mv_y = mvy_qpel + 1; hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; if(i4_tot_cost < i4_min_cost) { e_min_id = PT_B; i4_min_cost = i4_tot_cost; i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; } } } if(e_min_id == PT_C) { if(!i4_i) { S32 i4_interp_buf_id = 0; if(i4_grid_mask & BIT_EN(PT_TL)) { CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( ps_dedup_enabler, 1, 
mvx_qpel - 1, mvy_qpel - 1, check_for_duplicate); if(!check_for_duplicate) { ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1; ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1; /* Carry out the interpolation */ pf_qpel_interp( &s_interp_prms, i4_mv_x - 1, i4_mv_y - 1, i4_interp_buf_id); s_err_prms.pu1_ref = s_interp_prms.pu1_final_out; s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride; pf_err_compute(&s_err_prms); /* Update the mv's with the current candt motion vectors */ s_result_prms.i2_mv_x = mvx_qpel - 1; s_result_prms.i2_mv_y = mvy_qpel - 1; hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; if(i4_tot_cost < i4_min_cost) { e_min_id = PT_TL; i4_min_cost = i4_tot_cost; i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; } } } if(i4_grid_mask & BIT_EN(PT_TR)) { CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( ps_dedup_enabler, 1, mvx_qpel + 1, mvy_qpel - 1, check_for_duplicate); if(!check_for_duplicate) { ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1; ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1; /* Carry out the interpolation */ pf_qpel_interp( &s_interp_prms, i4_mv_x + 1, i4_mv_y - 1, i4_interp_buf_id); s_err_prms.pu1_ref = s_interp_prms.pu1_final_out; s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride; pf_err_compute(&s_err_prms); /* Update the mv's with the current candt motion vectors */ s_result_prms.i2_mv_x = mvx_qpel + 1; s_result_prms.i2_mv_y = mvy_qpel - 1; hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; if(i4_tot_cost < i4_min_cost) { e_min_id = PT_TR; i4_min_cost = i4_tot_cost; i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; } } } if(i4_grid_mask & BIT_EN(PT_BL)) { CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( ps_dedup_enabler, 1, mvx_qpel - 1, mvy_qpel + 1, check_for_duplicate); if(!check_for_duplicate) { ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1; ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1; /* Carry out the interpolation */ pf_qpel_interp( &s_interp_prms, i4_mv_x - 1, i4_mv_y + 1, i4_interp_buf_id); s_err_prms.pu1_ref = s_interp_prms.pu1_final_out; s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride; pf_err_compute(&s_err_prms); /* Update the mv's with the current candt motion vectors */ s_result_prms.i2_mv_x = mvx_qpel - 1; s_result_prms.i2_mv_y = mvy_qpel + 1; hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; if(i4_tot_cost < i4_min_cost) { e_min_id = PT_BL; i4_min_cost = i4_tot_cost; i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; } } } /* i4_mv_x and i4_mv_y will always be the centre pt */ /* for qpel we start with least hpel, and hence compute of center pt never reqd */ if(i4_grid_mask & BIT_EN(PT_BR)) { CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( ps_dedup_enabler, 1, mvx_qpel + 1, mvy_qpel + 1, check_for_duplicate); if(!check_for_duplicate) { ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1; ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1; /* Carry out the interpolation */ pf_qpel_interp( &s_interp_prms, i4_mv_x + 1, i4_mv_y + 1, i4_interp_buf_id); s_err_prms.pu1_ref = s_interp_prms.pu1_final_out; s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride; pf_err_compute(&s_err_prms); /* Update the mv's with the current candt motion vectors */ s_result_prms.i2_mv_x = mvx_qpel + 1; s_result_prms.i2_mv_y = mvy_qpel + 1; hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; if(i4_tot_cost < i4_min_cost) { e_min_id = PT_BR; i4_min_cost = 
            if(i4_i)
            {
                i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
            }
            else
            {
                i4_grid_mask = gai4_opt_grid_mask_conventional[e_min_id];
            }

            i4_mv_x += gai1_grid_id_to_x[e_min_id];
            i4_mv_y += gai1_grid_id_to_y[e_min_id];

            ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
            ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;

            i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);
        }
    }

    /* update the modified motion vectors and costs at the end of subpel refinement */
    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
    ps_search_node->i4_tot_cost = i4_min_cost;
    ps_search_node->i4_sad = i4_min_sad;

    /********************************************************************************/
    /* TODO: Restore the SAD lambda from the Hadamard lambda                        */
    /* Need to pass the had/satd lambda in a cleaner way for subpel cost compute    */
    /********************************************************************************/
    //ps_pred_ctxt->lambda >>= 1;

    return (i4_min_cost);
}
#endif

static void hme_subpel_refine_struct_to_search_results_struct_converter(
    subpel_refine_ctxt_t *ps_subpel_refine_ctxt,
    search_results_t *ps_search_results,
    U08 u1_pred_dir,
    ME_QUALITY_PRESETS_T e_quality_preset)
{
    U08 i;
    U08 u1_num_results_per_part = ps_search_results->u1_num_results_per_part;

    for(i = 0; i < ps_subpel_refine_ctxt->i4_num_valid_parts; i++)
    {
        S32 index;
        S32 i4_sad;
        S32 part_id = ps_subpel_refine_ctxt->ai4_part_id[i];
        search_node_t *ps_best_node = ps_search_results->aps_part_results[u1_pred_dir][part_id];

        /* When all partitions are valid the refinement arrays are indexed by */
        /* part id; otherwise they are packed and indexed by loop position    */
        if(ps_subpel_refine_ctxt->i4_num_valid_parts > 8)
        {
            index = part_id;
        }
        else
        {
            index = i;
        }
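        /* Two situations follow: if this node was never refined, the subpel   */
        /* results are copied in unconditionally; otherwise the new result is  */
        /* merged into the sorted list of at most two best nodes per partition */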
        if(!ps_best_node->u1_subpel_done)
        {
            i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
                     ps_subpel_refine_ctxt->i2_mv_cost[0][index];

            ps_best_node[0].i4_sdi = 0;
            ASSERT((e_quality_preset == ME_PRISTINE_QUALITY) ? (ps_best_node[0].i4_sdi >= 0) : 1);
            ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];

            if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
            {
                i4_sad = MAX_SIGNED_16BIT_VAL;
            }

            ps_best_node[0].i4_sad = i4_sad;
            ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
            ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
            ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
            ps_best_node[0].i1_ref_idx = (WORD8)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
            ps_best_node->u1_subpel_done = 1;

            if(2 == u1_num_results_per_part)
            {
                i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[1][index] -
                         ps_subpel_refine_ctxt->i2_mv_cost[1][index];

                ps_best_node[1].i4_sdi = 0;
                ps_best_node[1].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[1][index];

                if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] == MAX_SIGNED_16BIT_VAL)
                {
                    i4_sad = MAX_SIGNED_16BIT_VAL;
                }

                ps_best_node[1].i4_sad = i4_sad;
                ps_best_node[1].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[1][index];
                ps_best_node[1].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[1][index];
                ps_best_node[1].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[1][index];
                ps_best_node[1].i1_ref_idx = (WORD8)ps_subpel_refine_ctxt->i2_ref_idx[1][index];
                ps_best_node[1].u1_subpel_done = 1;
            }
        }
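        /* Node already refined: rank the new candidates against the stored    */
        /* best pair and insert them into the two-deep sorted list accordingly */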
        else if(
            (2 == u1_num_results_per_part) &&
            (ps_subpel_refine_ctxt->i2_tot_cost[0][index] < ps_best_node[1].i4_tot_cost))
        {
            if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] < ps_best_node[0].i4_tot_cost)
            {
                /* Both new results beat the stored best: they take both slots */
                i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
                         ps_subpel_refine_ctxt->i2_mv_cost[0][index];

                ps_best_node[0].i4_sdi = 0;
                ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];

                if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
                {
                    i4_sad = MAX_SIGNED_16BIT_VAL;
                }

                ps_best_node[0].i4_sad = i4_sad;
                ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
                ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
                ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
                ps_best_node[0].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];

                i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[1][index] -
                         ps_subpel_refine_ctxt->i2_mv_cost[1][index];

                ps_best_node[1].i4_sdi = 0;
                ps_best_node[1].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[1][index];

                if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] == MAX_SIGNED_16BIT_VAL)
                {
                    i4_sad = MAX_SIGNED_16BIT_VAL;
                }

                ps_best_node[1].i4_sad = i4_sad;
                ps_best_node[1].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[1][index];
                ps_best_node[1].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[1][index];
                ps_best_node[1].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[1][index];
                ps_best_node[1].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[1][index];
            }
            else if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] > ps_best_node[0].i4_tot_cost)
            {
                if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] >= ps_best_node[0].i4_tot_cost)
                {
                    /* New best ranks between the stored best and second best */
                    i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
                             ps_subpel_refine_ctxt->i2_mv_cost[0][index];

                    ps_best_node[1].i4_sdi = 0;
                    ps_best_node[1].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];

                    if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
                    {
                        i4_sad = MAX_SIGNED_16BIT_VAL;
                    }

                    ps_best_node[1].i4_sad = i4_sad;
                    ps_best_node[1].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
                    ps_best_node[1].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
                    ps_best_node[1].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
                    ps_best_node[1].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
                }
                else if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] < ps_best_node[0].i4_tot_cost)
                {
                    /* New best displaces the stored best into the second slot */
                    memmove(&ps_best_node[1], &ps_best_node[0], sizeof(search_node_t));

                    i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
                             ps_subpel_refine_ctxt->i2_mv_cost[0][index];

                    ps_best_node[0].i4_sdi = 0;
                    ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];

                    if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
                    {
                        i4_sad = MAX_SIGNED_16BIT_VAL;
                    }

                    ps_best_node[0].i4_sad = i4_sad;
                    ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
                    ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
                    ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
                    ps_best_node[0].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
                }
            }
        }
        else if(
            (1 == u1_num_results_per_part) &&
            (ps_subpel_refine_ctxt->i2_tot_cost[0][index] < ps_best_node[0].i4_tot_cost))
        {
            i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
                     ps_subpel_refine_ctxt->i2_mv_cost[0][index];

            ps_best_node[0].i4_sdi = 0;
            ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];

            if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
            {
                i4_sad = MAX_SIGNED_16BIT_VAL;
            }

            ps_best_node[0].i4_sad = i4_sad;
            ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
            ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
            ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
            ps_best_node[0].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
        }
    }
}
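/* Illustrative sketch (hypothetical helper, kept out of the build): how the */
/* converter above recovers SAD from the packed 16-bit costs. A total cost   */
/* saturated to MAX_SIGNED_16BIT_VAL cannot be split back into SAD plus mv   */
/* cost, so the SAD is pinned to the saturation marker instead.              */
#if 0
static S32 hme_example_recover_sad(S16 i2_tot_cost, S16 i2_mv_cost)
{
    /* Usual case: total cost is SAD/SATD plus mv cost */
    S32 i4_sad = i2_tot_cost - i2_mv_cost;

    /* A saturated total marks an invalid / never-updated candidate */
    if(MAX_SIGNED_16BIT_VAL == i2_tot_cost)
    {
        i4_sad = MAX_SIGNED_16BIT_VAL;
    }

    return i4_sad;
}
#endif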
/**
********************************************************************************
*  @fn     void hme_subpel_refine_cu_hs
*
*  @brief  Evaluates the best subpel mvs for the active partitions of a CU in
*          the L0 layer for the high speed preset. Recursive Hadamard SATD /
*          SAD plus mv cost is used for the 2Nx2N and NxN partitions, with the
*          remaining active partitions updated on the fly
*
*  @param[in] ps_prms: subpel prms input to this function
*
*  @param[in] ps_curr_layer: points to the current layer ctxt
*
*  @param[out] ps_search_results: points to the search results that get updated
*              with the best results
*
*  @param[in] search_idx: ref id of the frame for which results get updated
*
*  @param[in] ps_wt_inp_prms: current frame input params
*
*  @return None
********************************************************************************
*/
void hme_subpel_refine_cu_hs(
    hme_subpel_prms_t *ps_prms,
    layer_ctxt_t *ps_curr_layer,
    search_results_t *ps_search_results,
    S32 search_idx,
    wgt_pred_ctxt_t *ps_wt_inp_prms,
    WORD32 blk_8x8_mask,
    me_func_selector_t *ps_func_selector,
    ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list,
    ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
{
    /* Unique search node list for 2nx2n and nxn partitions */
    search_node_t as_nodes_2nx2n[MAX_RESULTS_PER_PART * 5];
    subpel_dedup_enabler_t as_subpel_dedup_enabler[MAX_NUM_REF];
    search_node_t *ps_search_node;
    S32 i, i4_part_mask, j;
    S32 i4_sad_grid;
    S32 max_subpel_cand;
    WORD32 index;
    S32 num_unique_nodes_2nx2n;
    S32 part_id;
    S32 x_off, y_off;
    S32 i4_inp_off;
    CU_SIZE_T e_cu_size;
    BLK_SIZE_T e_blk_size;
    subpel_refine_ctxt_t *ps_subpel_refine_ctxt = ps_prms->ps_subpel_refine_ctxt;
    S32 i4_use_satd = ps_prms->i4_use_satd;
    S32 i4_num_act_refs = ps_prms->i4_num_act_ref_l0 + ps_prms->i4_num_act_ref_l1;

    ASSERT(ps_search_results->u1_num_results_per_part <= MAX_RESULTS_PER_PART);

    if(!DISABLE_SUBPEL_REFINEMENT_WHEN_SRC_IS_NOISY || !ps_prms->u1_is_cu_noisy)
    {
        e_cu_size = ps_search_results->e_cu_size;
        i4_part_mask = ps_search_results->i4_part_mask;
        ps_prms->i4_inp_type = sizeof(U08);
        num_unique_nodes_2nx2n = 0;

        for(i = 0; i < i4_num_act_refs; i++)
        {
            as_subpel_dedup_enabler[i].u1_ref_idx = MAX_NUM_REF;
        }
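        /* Dedup bookkeeping: the first candidate seen for a reference anchors */
        /* a visited-node bitmap (au4_node_map) for that reference, which the  */
        /* CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES macro consults during       */
        /* refinement so that a qpel point reachable from two different        */
        /* refinement centres is costed only once                              */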
        /************************************************************************/
        /* Initialize the SATD cost for each valid partition id once, before    */
        /* subpel refinement starts. This is needed because:                    */
        /* 1. Full pel costs were computed in SAD mode, while subpel refinement */
        /*    works in SATD mode                                                */
        /* 2. Partitions like AMP, Nx2N and 2NxN are refined on the fly while   */
        /*    doing the diamond search for 2Nx2N and NxN; these partitions are  */
        /*    not explicitly refined in high speed mode                         */
        /************************************************************************/
        for(i = 0; i < ps_subpel_refine_ctxt->i4_num_valid_parts; i++)
        {
            S32 enable_subpel = 0;
            S32 part_type;

            /* Derive the x and y offsets of this part id */
            part_id = ps_subpel_refine_ctxt->ai4_part_id[i];

            if(ps_subpel_refine_ctxt->i4_num_valid_parts > 8)
            {
                index = part_id;
            }
            else
            {
                index = i;
            }

            part_type = ge_part_id_to_part_type[part_id];
            x_off = gas_part_attr_in_cu[part_id].u1_x_start << e_cu_size;
            y_off = gas_part_attr_in_cu[part_id].u1_y_start << e_cu_size;
            x_off += ps_search_results->u1_x_off;
            y_off += ps_search_results->u1_y_off;
            i4_inp_off = x_off + y_off * ps_prms->i4_inp_stride;
            e_blk_size = ge_part_id_to_blk_size[e_cu_size][part_id];
            x_off += ps_prms->i4_ctb_x_off;
            y_off += ps_prms->i4_ctb_y_off;
            max_subpel_cand = 0;

            /* Choose the number of candidates to be used for subpel refinement */
            if(PART_ID_2Nx2N == part_type)
            {
                max_subpel_cand = MIN(
                    ps_prms->u1_max_subpel_candts_2Nx2N,
                    ps_search_results->u1_num_results_per_part);
            }
            else if(PRT_NxN == part_type)
            {
                max_subpel_cand = MIN(
                    ps_prms->u1_max_subpel_candts_NxN,
                    ps_search_results->u1_num_results_per_part);
            }

            /* For an incomplete CTB, the NxN candidate count is forced to a minimum of 1 */
            if((0 == max_subpel_cand) && (blk_8x8_mask != 15))
            {
                max_subpel_cand = 1;
            }

            if((PART_ID_2Nx2N == part_type) || (PRT_NxN == part_type))
            {
                enable_subpel = 1;
            }

            /* Compute the full pel SATD for each result per partition before  */
            /* subpel refinement starts. Also prepare the unique candidate     */
            /* list for the 2Nx2N and NxN partitions                           */
            for(j = 0; j < ps_search_results->u1_num_results_per_part; j++)
            {
                err_prms_t s_err_prms;
                S32 i4_satd = 0;
                S32 i1_ref_idx;
                U08 *pu1_ref_base;
                S32 i4_ref_stride = ps_curr_layer->i4_rec_stride;
                S32 i4_mv_x, i4_mv_y;

                ps_search_node = ps_search_results->aps_part_results[search_idx][part_id] + j;
                if(ps_subpel_refine_ctxt->i2_mv_x[j][index] == INTRA_MV)
                {
                    ps_search_node->u1_subpel_done = 1;
                    continue;
                }

                i1_ref_idx = ps_subpel_refine_ctxt->i2_ref_idx[j][index];
                ps_prms->pv_inp = (void *)(ps_wt_inp_prms->apu1_wt_inp[i1_ref_idx] + i4_inp_off);
                pu1_ref_base = ps_curr_layer->ppu1_list_rec_fxfy[i1_ref_idx];
                i4_mv_x = ps_subpel_refine_ctxt->i2_mv_x[j][index];
                i4_mv_y = ps_subpel_refine_ctxt->i2_mv_y[j][index];

                if(i4_use_satd)
                {
                    s_err_prms.pu1_inp = (U08 *)ps_prms->pv_inp;
                    s_err_prms.i4_inp_stride = ps_prms->i4_inp_stride;
                    s_err_prms.pu1_ref = pu1_ref_base + x_off + (y_off * i4_ref_stride) +
                                         i4_mv_x + (i4_mv_y * i4_ref_stride);
                    s_err_prms.i4_ref_stride = i4_ref_stride;
                    s_err_prms.i4_part_mask = (ENABLE_2Nx2N);
                    s_err_prms.i4_grid_mask = 1;
                    s_err_prms.pi4_sad_grid = &i4_sad_grid;
                    s_err_prms.i4_blk_wd = gau1_blk_size_to_wd[e_blk_size];
                    s_err_prms.i4_blk_ht = gau1_blk_size_to_ht[e_blk_size];
                    s_err_prms.ps_cmn_utils_optimised_function_list =
                        ps_cmn_utils_optimised_function_list;

                    compute_satd_8bit(&s_err_prms);

                    i4_satd = s_err_prms.pi4_sad_grid[0];

                    ps_subpel_refine_ctxt->i2_tot_cost[j][index] =
                        CLIP_S16(ps_subpel_refine_ctxt->i2_mv_cost[j][index] + i4_satd);
                    ps_subpel_refine_ctxt->ai2_fullpel_satd[j][index] = i4_satd;
                }
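                /* Candidate filtration below, with assumed numbers: if        */
                /* u1_subpel_candt_threshold is 2 and the best SAD is 1000,    */
                /* the budget is 1000 + (1000 >> 2) = 1250; a later candidate  */
                /* is dropped from subpel refinement when its fullpel SATD     */
                /* exceeds that budget, or when its mv lies within one qpel    */
                /* unit of the best mv on both axes, since it would only       */
                /* rediscover the same neighbourhood                           */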
                /* Sub-pel candidate filtration */
                if(j)
                {
                    S16 i2_best_sad;
                    S32 i4_best_mvx;
                    S32 i4_best_mvy;
                    search_node_t *ps_node =
                        ps_search_results->aps_part_results[search_idx][part_id];
                    U08 u1_is_subpel_done = ps_node->u1_subpel_done;
                    S16 i2_curr_sad = ps_subpel_refine_ctxt->ai2_fullpel_satd[j][index];
                    S32 i4_curr_mvx = i4_mv_x << 2;
                    S32 i4_curr_mvy = i4_mv_y << 2;

                    if(u1_is_subpel_done)
                    {
                        i2_best_sad = ps_node->i4_sad;

                        if(ps_node->i1_ref_idx == i1_ref_idx)
                        {
                            i4_best_mvx = ps_node->s_mv.i2_mvx;
                            i4_best_mvy = ps_node->s_mv.i2_mvy;
                        }
                        else if(i1_ref_idx == ps_subpel_refine_ctxt->i2_ref_idx[0][index])
                        {
                            i4_best_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
                            i4_best_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
                        }
                        else
                        {
                            i4_best_mvx = INTRA_MV;
                            i4_best_mvy = INTRA_MV;
                        }
                    }
                    else
                    {
                        i2_best_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
                                      ps_subpel_refine_ctxt->i2_mv_cost[0][index];

                        if(i1_ref_idx == ps_subpel_refine_ctxt->i2_ref_idx[0][index])
                        {
                            i4_best_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
                            i4_best_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
                        }
                        else
                        {
                            i4_best_mvx = INTRA_MV;
                            i4_best_mvy = INTRA_MV;
                        }
                    }

                    i2_best_sad += (i2_best_sad >> ps_prms->u1_subpel_candt_threshold);

                    if(((ABS(i4_curr_mvx - i4_best_mvx) < 2) &&
                        (ABS(i4_curr_mvy - i4_best_mvy) < 2)) ||
                       (i2_curr_sad > i2_best_sad))
                    {
                        enable_subpel = 0;
                    }
                }

                ps_search_node->u1_part_id = part_id;

                /* Convert mvs in part results from FPEL to QPEL units */
                ps_subpel_refine_ctxt->i2_mv_x[j][index] <<= 2;
                ps_subpel_refine_ctxt->i2_mv_y[j][index] <<= 2;

                /* If the candidate number exceeds the count configured initially, */
                /* do not add those candts for refinement                          */
                if(j >= max_subpel_cand)
                {
                    enable_subpel = 0;
                }

                if(enable_subpel)
                {
                    if(num_unique_nodes_2nx2n == 0)
                    {
                        S32 i4_index = ps_subpel_refine_ctxt->i2_ref_idx[j][index];

                        as_subpel_dedup_enabler[i4_index].i2_mv_x =
                            ps_subpel_refine_ctxt->i2_mv_x[j][index];
                        as_subpel_dedup_enabler[i4_index].i2_mv_y =
                            ps_subpel_refine_ctxt->i2_mv_y[j][index];
                        as_subpel_dedup_enabler[i4_index].u1_ref_idx =
                            (U08)ps_subpel_refine_ctxt->i2_ref_idx[j][index];
                        memset(
                            as_subpel_dedup_enabler[i4_index].au4_node_map,
                            0,
                            sizeof(U32) * 2 * MAP_X_MAX);
                    }

                    INSERT_NEW_NODE_NOMAP_ALTERNATE(
                        as_nodes_2nx2n, num_unique_nodes_2nx2n, ps_subpel_refine_ctxt, j, i);
                }
            }

            /*********************************************************************************************/
            /* If sad_1 < sad_2, then satd_1 need not be lesser than satd_2. Therefore, after conversion */
            /* to satd, tot_cost_1 may not be lesser than tot_cost_2. So we need to sort the search nodes*/
            /* for each partition again, based on the new costs                                          */
            /*                                                                                           */
            /* Because right now we store only the two best candidates for each partition, the sort      */
            /* converges to a simple swap                                                                */
            /* ASSUMPTION : We store only two best results per partition                                 */
            /*********************************************************************************************/
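            /* Worked example with assumed costs: node A (SAD 900, SATD 1100)  */
            /* vs node B (SAD 950, SATD 1050); A was first on SAD, but B is    */
            /* cheaper after the SATD conversion, so the pair below must be    */
            /* re-sorted; with only two stored results the sort reduces to a   */
            /* single conditional swap                                         */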
            if(ps_search_results->u1_num_results_per_part == 2)
            {
                if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] >
                   ps_subpel_refine_ctxt->i2_tot_cost[1][index])
                {
                    SWAP(
                        ps_subpel_refine_ctxt->i2_tot_cost[0][index],
                        ps_subpel_refine_ctxt->i2_tot_cost[1][index]);

                    SWAP(
                        ps_subpel_refine_ctxt->i2_mv_cost[0][index],
                        ps_subpel_refine_ctxt->i2_mv_cost[1][index]);

                    SWAP(
                        ps_subpel_refine_ctxt->i2_mv_x[0][index],
                        ps_subpel_refine_ctxt->i2_mv_x[1][index]);

                    SWAP(
                        ps_subpel_refine_ctxt->i2_mv_y[0][index],
                        ps_subpel_refine_ctxt->i2_mv_y[1][index]);

                    SWAP(
                        ps_subpel_refine_ctxt->i2_ref_idx[0][index],
                        ps_subpel_refine_ctxt->i2_ref_idx[1][index]);

                    SWAP(
                        ps_subpel_refine_ctxt->ai2_fullpel_satd[0][index],
                        ps_subpel_refine_ctxt->ai2_fullpel_satd[1][index]);
                }
            }
        }

        if(blk_8x8_mask == 0xf)
        {
            num_unique_nodes_2nx2n =
                MIN(num_unique_nodes_2nx2n, ps_prms->u1_max_num_subpel_refine_centers);
        }

        {
            x_off = gas_part_attr_in_cu[0].u1_x_start << e_cu_size;
            y_off = gas_part_attr_in_cu[0].u1_y_start << e_cu_size;
            x_off += ps_search_results->u1_x_off;
            y_off += ps_search_results->u1_y_off;
            i4_inp_off = x_off + y_off * ps_prms->i4_inp_stride;
            e_blk_size = ge_part_id_to_blk_size[e_cu_size][0];

            for(j = 0; j < num_unique_nodes_2nx2n; j++)
            {
                S32 pred_lx;

                ps_search_node = &as_nodes_2nx2n[j];
                if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
                {
                    continue;
                }

                {
                    S08 i1_ref_idx = ps_search_node->i1_ref_idx;
                    subpel_dedup_enabler_t *ps_dedup_enabler =
                        &(as_subpel_dedup_enabler[i1_ref_idx]);

                    if(ps_dedup_enabler->u1_ref_idx == MAX_NUM_REF)
                    {
                        as_subpel_dedup_enabler[i1_ref_idx].i2_mv_x = ps_search_node->s_mv.i2_mvx;
                        as_subpel_dedup_enabler[i1_ref_idx].i2_mv_y = ps_search_node->s_mv.i2_mvy;
                        as_subpel_dedup_enabler[i1_ref_idx].u1_ref_idx = i1_ref_idx;
                        memset(
                            as_subpel_dedup_enabler[i1_ref_idx].au4_node_map,
                            0,
                            sizeof(U32) * 2 * MAP_X_MAX);
                    }
                }

                pred_lx = search_idx;
                ps_prms->pv_inp =
                    (void *)(ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off);

                hme_subpel_refine_search_node_high_speed(
                    ps_search_node,
                    ps_prms,
                    ps_curr_layer,
                    e_blk_size,
                    x_off + ps_prms->i4_ctb_x_off,
                    y_off + ps_prms->i4_ctb_y_off,
                    ps_search_results,
                    pred_lx,
                    i4_part_mask,
                    &ps_subpel_refine_ctxt->ai4_part_id[0],
                    search_idx,
                    &(as_subpel_dedup_enabler[ps_search_node->i1_ref_idx]),
                    ps_func_selector,
                    ps_me_optimised_function_list);
            }
        }
    }
    else
    {
        for(i = 0; i < ps_subpel_refine_ctxt->i4_num_valid_parts; i++)
        {
            S32 i4_index;
            S32 i4_part_id = ps_subpel_refine_ctxt->ai4_part_id[i];

            if(ps_subpel_refine_ctxt->i4_num_valid_parts > 8)
            {
                i4_index = i4_part_id;
            }
            else
            {
                i4_index = i;
            }

            /* mvs are still converted to QPEL units for the converter below */
            for(j = 0; j < ps_search_results->u1_num_results_per_part; j++)
            {
                ps_subpel_refine_ctxt->i2_mv_x[j][i4_index] <<= 2;
                ps_subpel_refine_ctxt->i2_mv_y[j][i4_index] <<= 2;
            }
        }
    }

    hme_subpel_refine_struct_to_search_results_struct_converter(
        ps_subpel_refine_ctxt, ps_search_results, search_idx, ps_prms->e_me_quality_presets);
}
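/* Overall flow for the high speed preset: fullpel results are re-costed in */
/* SATD, filtered down to a few unique refinement centres per CU, refined   */
/* with the dedup-aware diamond search, and finally folded back into the    */
/* search results by the struct converter above.                            */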