diff options
Diffstat (limited to 'third_party/libaom/source/libaom/av1/encoder/ethread.c')
-rw-r--r-- | third_party/libaom/source/libaom/av1/encoder/ethread.c | 528 |
1 files changed, 465 insertions, 63 deletions
diff --git a/third_party/libaom/source/libaom/av1/encoder/ethread.c b/third_party/libaom/source/libaom/av1/encoder/ethread.c index 3735ca3c8b..d274b6b84f 100644 --- a/third_party/libaom/source/libaom/av1/encoder/ethread.c +++ b/third_party/libaom/source/libaom/av1/encoder/ethread.c @@ -11,9 +11,11 @@ #include "av1/common/warped_motion.h" +#include "av1/encoder/bitstream.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/encoder.h" #include "av1/encoder/encoder_alloc.h" +#include "av1/encoder/encodeframe_utils.h" #include "av1/encoder/ethread.h" #if !CONFIG_REALTIME_ONLY #include "av1/encoder/firstpass.h" @@ -52,7 +54,7 @@ static AOM_INLINE void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { static AOM_INLINE void update_delta_lf_for_row_mt(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &cpi->td.mb.e_mbd; - const int mib_size = cm->seq_params.mib_size; + const int mib_size = cm->seq_params->mib_size; const int frame_lf_count = av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; for (int row = 0; row < cm->tiles.rows; row++) { @@ -68,7 +70,8 @@ static AOM_INLINE void update_delta_lf_for_row_mt(AV1_COMP *cpi) { const int idx_str = cm->mi_params.mi_stride * mi_row + mi_col; MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + idx_str; MB_MODE_INFO *mbmi = mi[0]; - if (mbmi->skip_txfm == 1 && (mbmi->bsize == cm->seq_params.sb_size)) { + if (mbmi->skip_txfm == 1 && + (mbmi->bsize == cm->seq_params->sb_size)) { for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id]; mbmi->delta_lf_from_base = xd->delta_lf_from_base; @@ -362,7 +365,7 @@ static AOM_INLINE void switch_tile_and_get_next_job( *cur_tile_id = tile_id; const int unit_height = mi_size_high[fp_block_size]; get_next_job(&tile_data[tile_id], current_mi_row, - is_firstpass ? unit_height : cm->seq_params.mib_size); + is_firstpass ? 
unit_height : cm->seq_params->mib_size); } } @@ -441,13 +444,20 @@ static int enc_row_mt_worker_hook(void *arg1, void *unused) { const BLOCK_SIZE fp_block_size = cpi->fp_block_size; int end_of_frame = 0; + + // When master thread does not have a valid job to process, xd->tile_ctx + // is not set and it contains NULL pointer. This can result in NULL pointer + // access violation if accessed beyond the encode stage. Hence, updating + // thread_data->td->mb.e_mbd.tile_ctx is initialized with common frame + // context to avoid NULL pointer access in subsequent stages. + thread_data->td->mb.e_mbd.tile_ctx = cm->fc; while (1) { int current_mi_row = -1; #if CONFIG_MULTITHREAD pthread_mutex_lock(enc_row_mt_mutex_); #endif if (!get_next_job(&cpi->tile_data[cur_tile_id], &current_mi_row, - cm->seq_params.mib_size)) { + cm->seq_params->mib_size)) { // No jobs are available for the current tile. Query for the status of // other tiles and get the next job if available switch_tile_and_get_next_job(cm, cpi->tile_data, &cur_tile_id, @@ -470,6 +480,7 @@ static int enc_row_mt_worker_hook(void *arg1, void *unused) { td->mb.e_mbd.tile_ctx = td->tctx; td->mb.tile_pb_ctx = &this_tile->tctx; + td->abs_sum_level = 0; if (this_tile->allow_update_cdf) { td->mb.row_ctx = this_tile->row_ctx; @@ -482,7 +493,7 @@ static int enc_row_mt_worker_hook(void *arg1, void *unused) { av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row, &td->mb.e_mbd); - cfl_init(&td->mb.e_mbd.cfl, &cm->seq_params); + cfl_init(&td->mb.e_mbd.cfl, cm->seq_params); if (td->mb.txfm_search_info.txb_rd_records != NULL) { av1_crc32c_calculator_init( &td->mb.txfm_search_info.txb_rd_records->mb_rd_record.crc_calculator); @@ -492,6 +503,7 @@ static int enc_row_mt_worker_hook(void *arg1, void *unused) { #if CONFIG_MULTITHREAD pthread_mutex_lock(enc_row_mt_mutex_); #endif + this_tile->abs_sum_level += td->abs_sum_level; row_mt_sync->num_threads_working--; #if CONFIG_MULTITHREAD pthread_mutex_unlock(enc_row_mt_mutex_); 
@@ -526,16 +538,12 @@ static int enc_worker_hook(void *arg1, void *unused) { return 1; } -void av1_create_second_pass_workers(AV1_COMP *cpi, int num_workers) { +#if CONFIG_MULTITHREAD +void av1_init_mt_sync(AV1_COMP *cpi, int is_first_pass) { AV1_COMMON *const cm = &cpi->common; - const AVxWorkerInterface *const winterface = aom_get_worker_interface(); MultiThreadInfo *const mt_info = &cpi->mt_info; - assert(mt_info->workers != NULL); - assert(mt_info->tile_thr_data != NULL); - -#if CONFIG_MULTITHREAD - if (cpi->oxcf.row_mt == 1) { + if (is_first_pass || cpi->oxcf.row_mt == 1) { AV1EncRowMultiThreadInfo *enc_row_mt = &mt_info->enc_row_mt; if (enc_row_mt->mutex_ == NULL) { CHECK_MEM_ERROR(cm, enc_row_mt->mutex_, @@ -543,24 +551,39 @@ void av1_create_second_pass_workers(AV1_COMP *cpi, int num_workers) { if (enc_row_mt->mutex_) pthread_mutex_init(enc_row_mt->mutex_, NULL); } } - AV1GlobalMotionSync *gm_sync = &mt_info->gm_sync; - if (gm_sync->mutex_ == NULL) { - CHECK_MEM_ERROR(cm, gm_sync->mutex_, - aom_malloc(sizeof(*(gm_sync->mutex_)))); - if (gm_sync->mutex_) pthread_mutex_init(gm_sync->mutex_, NULL); - } - AV1TemporalFilterSync *tf_sync = &mt_info->tf_sync; - if (tf_sync->mutex_ == NULL) { - CHECK_MEM_ERROR(cm, tf_sync->mutex_, aom_malloc(sizeof(*tf_sync->mutex_))); - if (tf_sync->mutex_) pthread_mutex_init(tf_sync->mutex_, NULL); - } - AV1CdefSync *cdef_sync = &mt_info->cdef_sync; - if (cdef_sync->mutex_ == NULL) { - CHECK_MEM_ERROR(cm, cdef_sync->mutex_, - aom_malloc(sizeof(*(cdef_sync->mutex_)))); - if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL); + + if (!is_first_pass) { + AV1GlobalMotionSync *gm_sync = &mt_info->gm_sync; + if (gm_sync->mutex_ == NULL) { + CHECK_MEM_ERROR(cm, gm_sync->mutex_, + aom_malloc(sizeof(*(gm_sync->mutex_)))); + if (gm_sync->mutex_) pthread_mutex_init(gm_sync->mutex_, NULL); + } +#if !CONFIG_REALTIME_ONLY + AV1TemporalFilterSync *tf_sync = &mt_info->tf_sync; + if (tf_sync->mutex_ == NULL) { + CHECK_MEM_ERROR(cm, 
tf_sync->mutex_, + aom_malloc(sizeof(*tf_sync->mutex_))); + if (tf_sync->mutex_) pthread_mutex_init(tf_sync->mutex_, NULL); + } +#endif // !CONFIG_REALTIME_ONLY + AV1CdefSync *cdef_sync = &mt_info->cdef_sync; + if (cdef_sync->mutex_ == NULL) { + CHECK_MEM_ERROR(cm, cdef_sync->mutex_, + aom_malloc(sizeof(*(cdef_sync->mutex_)))); + if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL); + } } -#endif +} +#endif // CONFIG_MULTITHREAD + +void av1_create_second_pass_workers(AV1_COMP *cpi, int num_workers) { + AV1_COMMON *const cm = &cpi->common; + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + MultiThreadInfo *const mt_info = &cpi->mt_info; + + assert(mt_info->workers != NULL); + assert(mt_info->tile_thr_data != NULL); for (int i = num_workers - 1; i >= 0; i--) { AVxWorker *const worker = &mt_info->workers[i]; @@ -576,7 +599,7 @@ void av1_create_second_pass_workers(AV1_COMP *cpi, int num_workers) { // Create threads if (!winterface->reset(worker)) - aom_internal_error(&cm->error, AOM_CODEC_ERROR, + aom_internal_error(cm->error, AOM_CODEC_ERROR, "Tile encoder thread creation failed"); } else { // Main thread acts as a worker and uses the thread data in cpi. 
@@ -625,10 +648,6 @@ static AOM_INLINE void create_enc_workers(AV1_COMP *cpi, int num_workers) { alloc_compound_type_rd_buffers(cm, &thread_data->td->comp_rd_buffer); - CHECK_MEM_ERROR( - cm, thread_data->td->tmp_conv_dst, - aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * - sizeof(*thread_data->td->tmp_conv_dst))); for (int j = 0; j < 2; ++j) { CHECK_MEM_ERROR( cm, thread_data->td->tmp_pred_bufs[j], @@ -636,9 +655,14 @@ static AOM_INLINE void create_enc_workers(AV1_COMP *cpi, int num_workers) { sizeof(*thread_data->td->tmp_pred_bufs[j]))); } + const int plane_types = PLANE_TYPES >> cm->seq_params->monochrome; + CHECK_MEM_ERROR(cm, thread_data->td->pixel_gradient_info, + aom_malloc(sizeof(*thread_data->td->pixel_gradient_info) * + plane_types * MAX_SB_SQUARE)); + if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) { const int num_64x64_blocks = - (cm->seq_params.sb_size == BLOCK_64X64) ? 1 : 4; + (cm->seq_params->sb_size == BLOCK_64X64) ? 1 : 4; CHECK_MEM_ERROR( cm, thread_data->td->vt64x64, aom_malloc(sizeof(*thread_data->td->vt64x64) * num_64x64_blocks)); @@ -680,6 +704,10 @@ void av1_create_workers(AV1_COMP *cpi, int num_workers) { // Set up shared coeff buffers. 
av1_setup_shared_coeff_buffer(cm, &thread_data->td->shared_coeff_buf); + CHECK_MEM_ERROR( + cm, thread_data->td->tmp_conv_dst, + aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * + sizeof(*thread_data->td->tmp_conv_dst))); } ++mt_info->num_workers; } @@ -724,7 +752,7 @@ static AOM_INLINE void fp_create_enc_workers(AV1_COMP *cpi, int num_workers) { if (create_workers) { // Create threads if (!winterface->reset(worker)) - aom_internal_error(&cm->error, AOM_CODEC_ERROR, + aom_internal_error(cm->error, AOM_CODEC_ERROR, "Tile encoder thread creation failed"); } } else { @@ -764,7 +792,7 @@ static AOM_INLINE void sync_enc_workers(MultiThreadInfo *const mt_info, } if (had_error) - aom_internal_error(&cm->error, AOM_CODEC_ERROR, + aom_internal_error(cm->error, AOM_CODEC_ERROR, "Failed to encode tile data"); } @@ -780,14 +808,15 @@ static AOM_INLINE void accumulate_counters_enc_workers(AV1_COMP *cpi, !frame_is_intra_only(&cpi->common)) av1_accumulate_cyclic_refresh_counters(cpi->cyclic_refresh, &thread_data->td->mb); - if (thread_data->td->mb.txfm_search_info.txb_rd_records) { - aom_free(thread_data->td->mb.txfm_search_info.txb_rd_records); - thread_data->td->mb.txfm_search_info.txb_rd_records = NULL; - } - if (thread_data->td != &cpi->td && - cpi->oxcf.cost_upd_freq.mv < COST_UPD_OFF) { - aom_free(thread_data->td->mb.mv_costs); + if (thread_data->td != &cpi->td) { + if (cpi->oxcf.cost_upd_freq.mv < COST_UPD_OFF) { + aom_free(thread_data->td->mb.mv_costs); + } + if (cpi->oxcf.cost_upd_freq.dv < COST_UPD_OFF) { + aom_free(thread_data->td->mb.dv_costs); + } } + av1_dealloc_mb_data(&cpi->common, &thread_data->td->mb); // Accumulate counters. if (i > 0) { @@ -822,6 +851,7 @@ static AOM_INLINE void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, thread_data->td->intrabc_used = 0; thread_data->td->deltaq_used = 0; + thread_data->td->abs_sum_level = 0; // Before encoding a frame, copy the thread data from cpi. 
if (thread_data->td != &cpi->td) { @@ -846,15 +876,19 @@ static AOM_INLINE void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, memcpy(thread_data->td->mb.mv_costs, cpi->td.mb.mv_costs, sizeof(MvCosts)); } + if (cpi->oxcf.cost_upd_freq.dv < COST_UPD_OFF) { + CHECK_MEM_ERROR(cm, thread_data->td->mb.dv_costs, + (IntraBCMVCosts *)aom_malloc(sizeof(IntraBCMVCosts))); + memcpy(thread_data->td->mb.dv_costs, cpi->td.mb.dv_costs, + sizeof(IntraBCMVCosts)); + } } + av1_alloc_mb_data(cm, &thread_data->td->mb, + cpi->sf.rt_sf.use_nonrd_pick_mode); + // Reset cyclic refresh counters. av1_init_cyclic_refresh_counters(&thread_data->td->mb); - if (!cpi->sf.rt_sf.use_nonrd_pick_mode) { - CHECK_MEM_ERROR(cm, thread_data->td->mb.txfm_search_info.txb_rd_records, - (TxbRdRecords *)aom_malloc(sizeof(TxbRdRecords))); - } - if (thread_data->td->counts != &cpi->counts) { memcpy(thread_data->td->counts, &cpi->counts, sizeof(cpi->counts)); } @@ -867,6 +901,8 @@ static AOM_INLINE void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, thread_data->td->mb.tmp_pred_bufs[j] = thread_data->td->tmp_pred_bufs[j]; } + thread_data->td->mb.pixel_gradient_info = + thread_data->td->pixel_gradient_info; thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst; for (int j = 0; j < 2; ++j) { @@ -904,11 +940,16 @@ static AOM_INLINE void fp_prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, memcpy(thread_data->td->mb.mv_costs, cpi->td.mb.mv_costs, sizeof(MvCosts)); } + if (cpi->oxcf.cost_upd_freq.dv < COST_UPD_OFF) { + CHECK_MEM_ERROR(cm, thread_data->td->mb.dv_costs, + (IntraBCMVCosts *)aom_malloc(sizeof(IntraBCMVCosts))); + memcpy(thread_data->td->mb.dv_costs, cpi->td.mb.dv_costs, + sizeof(IntraBCMVCosts)); + } } - if (!cpi->sf.rt_sf.use_nonrd_pick_mode) { - CHECK_MEM_ERROR(cm, thread_data->td->mb.txfm_search_info.txb_rd_records, - (TxbRdRecords *)aom_malloc(sizeof(TxbRdRecords))); - } + + av1_alloc_mb_data(cm, &thread_data->td->mb, + cpi->sf.rt_sf.use_nonrd_pick_mode); } } 
#endif @@ -1191,13 +1232,15 @@ void av1_fp_encode_tiles_row_mt(AV1_COMP *cpi) { sync_enc_workers(&cpi->mt_info, cm, num_workers); for (int i = num_workers - 1; i >= 0; i--) { EncWorkerData *const thread_data = &cpi->mt_info.tile_thr_data[i]; - if (thread_data->td != &cpi->td && - cpi->oxcf.cost_upd_freq.mv < COST_UPD_OFF) { - aom_free(thread_data->td->mb.mv_costs); - } - if (thread_data->td->mb.txfm_search_info.txb_rd_records) { - aom_free(thread_data->td->mb.txfm_search_info.txb_rd_records); + if (thread_data->td != &cpi->td) { + if (cpi->oxcf.cost_upd_freq.mv < COST_UPD_OFF) { + aom_free(thread_data->td->mb.mv_costs); + } + if (cpi->oxcf.cost_upd_freq.dv < COST_UPD_OFF) { + aom_free(thread_data->td->mb.dv_costs); + } } + av1_dealloc_mb_data(cm, &thread_data->td->mb); } } @@ -1277,11 +1320,15 @@ static int tpl_worker_hook(void *arg1, void *unused) { AV1_COMMON *cm = &cpi->common; MACROBLOCK *x = &thread_data->td->mb; MACROBLOCKD *xd = &x->e_mbd; + TplTxfmStats *tpl_txfm_stats = &thread_data->td->tpl_txfm_stats; CommonModeInfoParams *mi_params = &cm->mi_params; - BLOCK_SIZE bsize = convert_length_to_bsize(cpi->tpl_data.tpl_bsize_1d); + BLOCK_SIZE bsize = convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d); TX_SIZE tx_size = max_txsize_lookup[bsize]; int mi_height = mi_size_high[bsize]; - int num_active_workers = cpi->tpl_data.tpl_mt_sync.num_threads_working; + int num_active_workers = cpi->ppi->tpl_data.tpl_mt_sync.num_threads_working; + + av1_init_tpl_txfm_stats(tpl_txfm_stats); + for (int mi_row = thread_data->start * mi_height; mi_row < mi_params->mi_rows; mi_row += num_active_workers * mi_height) { // Motion estimation row boundary @@ -1290,7 +1337,7 @@ static int tpl_worker_hook(void *arg1, void *unused) { xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE); xd->mb_to_bottom_edge = GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE); - av1_mc_flow_dispenser_row(cpi, x, mi_row, bsize, tx_size); + av1_mc_flow_dispenser_row(cpi, 
tpl_txfm_stats, x, mi_row, bsize, tx_size); } return 1; } @@ -1370,6 +1417,24 @@ static AOM_INLINE void prepare_tpl_workers(AV1_COMP *cpi, AVxWorkerHook hook, // OBMC buffers are used only to init MS params and remain unused when // called from tpl, hence set the buffers to defaults. av1_init_obmc_buffer(&thread_data->td->mb.obmc_buffer); + thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst; + thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst; + } + } +} + +// Accumulate transform stats after tpl. +static void tpl_accumulate_txfm_stats(ThreadData *main_td, + const MultiThreadInfo *mt_info, + int num_workers) { + TplTxfmStats *accumulated_stats = &main_td->tpl_txfm_stats; + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &mt_info->workers[i]; + EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; + ThreadData *td = thread_data->td; + if (td != main_td) { + const TplTxfmStats *tpl_txfm_stats = &td->tpl_txfm_stats; + av1_accumulate_tpl_txfm_stats(tpl_txfm_stats, accumulated_stats); } } } @@ -1379,7 +1444,7 @@ void av1_mc_flow_dispenser_mt(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; CommonModeInfoParams *mi_params = &cm->mi_params; MultiThreadInfo *mt_info = &cpi->mt_info; - TplParams *tpl_data = &cpi->tpl_data; + TplParams *tpl_data = &cpi->ppi->tpl_data; AV1TplRowMultiThreadSync *tpl_sync = &tpl_data->tpl_mt_sync; int mb_rows = mi_params->mb_rows; int num_workers = @@ -1398,6 +1463,7 @@ void av1_mc_flow_dispenser_mt(AV1_COMP *cpi) { prepare_tpl_workers(cpi, tpl_worker_hook, num_workers); launch_workers(&cpi->mt_info, num_workers); sync_enc_workers(&cpi->mt_info, cm, num_workers); + tpl_accumulate_txfm_stats(&cpi->td, &cpi->mt_info, num_workers); } // Deallocate memory for temporal filter multi-thread synchronization. 
@@ -1752,6 +1818,331 @@ void av1_global_motion_estimation_mt(AV1_COMP *cpi) { } #endif // !CONFIG_REALTIME_ONLY +// Compare and order tiles based on absolute sum of tx coeffs. +static int compare_tile_order(const void *a, const void *b) { + const PackBSTileOrder *const tile_a = (const PackBSTileOrder *)a; + const PackBSTileOrder *const tile_b = (const PackBSTileOrder *)b; + + if (tile_a->abs_sum_level > tile_b->abs_sum_level) + return -1; + else if (tile_a->abs_sum_level == tile_b->abs_sum_level) + return (tile_a->tile_idx > tile_b->tile_idx ? 1 : -1); + else + return 1; +} + +// Get next tile index to be processed for pack bitstream +static AOM_INLINE int get_next_pack_bs_tile_idx( + AV1EncPackBSSync *const pack_bs_sync, const int num_tiles) { + assert(pack_bs_sync->next_job_idx <= num_tiles); + if (pack_bs_sync->next_job_idx == num_tiles) return -1; + + return pack_bs_sync->pack_bs_tile_order[pack_bs_sync->next_job_idx++] + .tile_idx; +} + +// Calculates bitstream chunk size based on total buffer size and tile or tile +// group size. +static AOM_INLINE size_t get_bs_chunk_size(int tg_or_tile_size, + const int frame_or_tg_size, + size_t *remain_buf_size, + size_t max_buf_size, + int is_last_chunk) { + size_t this_chunk_size; + assert(*remain_buf_size > 0); + if (is_last_chunk) { + this_chunk_size = *remain_buf_size; + *remain_buf_size = 0; + } else { + const uint64_t size_scale = (uint64_t)max_buf_size * tg_or_tile_size; + this_chunk_size = (size_t)(size_scale / frame_or_tg_size); + *remain_buf_size -= this_chunk_size; + assert(*remain_buf_size > 0); + } + assert(this_chunk_size > 0); + return this_chunk_size; +} + +// Initializes params required for pack bitstream tile. 
+static void init_tile_pack_bs_params(AV1_COMP *const cpi, uint8_t *const dst, + struct aom_write_bit_buffer *saved_wb, + PackBSParams *const pack_bs_params_arr, + uint8_t obu_extn_header) { + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + AV1_COMMON *const cm = &cpi->common; + const CommonTileParams *const tiles = &cm->tiles; + const int num_tiles = tiles->cols * tiles->rows; + // Fixed size tile groups for the moment + const int num_tg_hdrs = cpi->num_tg; + // Tile group size in terms of number of tiles. + const int tg_size_in_tiles = (num_tiles + num_tg_hdrs - 1) / num_tg_hdrs; + uint8_t *tile_dst = dst; + uint8_t *tile_data_curr = dst; + // Max tile group count can not be more than MAX_TILES. + int tg_size_mi[MAX_TILES] = { 0 }; // Size of tile group in mi units + int tile_idx; + int tg_idx = 0; + int tile_count_in_tg = 0; + int new_tg = 1; + + // Populate pack bitstream params of all tiles. + for (tile_idx = 0; tile_idx < num_tiles; tile_idx++) { + const TileInfo *const tile_info = &cpi->tile_data[tile_idx].tile_info; + PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx]; + // Calculate tile size in mi units. + const int tile_size_mi = (tile_info->mi_col_end - tile_info->mi_col_start) * + (tile_info->mi_row_end - tile_info->mi_row_start); + int is_last_tile_in_tg = 0; + tile_count_in_tg++; + if (tile_count_in_tg == tg_size_in_tiles || tile_idx == (num_tiles - 1)) + is_last_tile_in_tg = 1; + + // Populate pack bitstream params of this tile. 
+ pack_bs_params->curr_tg_hdr_size = 0; + pack_bs_params->obu_extn_header = obu_extn_header; + pack_bs_params->saved_wb = saved_wb; + pack_bs_params->obu_header_size = 0; + pack_bs_params->is_last_tile_in_tg = is_last_tile_in_tg; + pack_bs_params->new_tg = new_tg; + pack_bs_params->tile_col = tile_info->tile_col; + pack_bs_params->tile_row = tile_info->tile_row; + pack_bs_params->tile_size_mi = tile_size_mi; + tg_size_mi[tg_idx] += tile_size_mi; + + if (new_tg) new_tg = 0; + if (is_last_tile_in_tg) { + tile_count_in_tg = 0; + new_tg = 1; + tg_idx++; + } + } + + assert(cpi->available_bs_size > 0); + size_t tg_buf_size[MAX_TILES] = { 0 }; + size_t max_buf_size = cpi->available_bs_size; + size_t remain_buf_size = max_buf_size; + const int frame_size_mi = cm->mi_params.mi_rows * cm->mi_params.mi_cols; + + tile_idx = 0; + // Prepare obu, tile group and frame header of each tile group. + for (tg_idx = 0; tg_idx < cpi->num_tg; tg_idx++) { + PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx]; + int is_last_tg = tg_idx == cpi->num_tg - 1; + // Prorate bitstream buffer size based on tile group size and available + // buffer size. This buffer will be used to store headers and tile data. + tg_buf_size[tg_idx] = + get_bs_chunk_size(tg_size_mi[tg_idx], frame_size_mi, &remain_buf_size, + max_buf_size, is_last_tg); + + pack_bs_params->dst = tile_dst; + pack_bs_params->tile_data_curr = tile_dst; + + // Write obu, tile group and frame header at first tile in the tile + // group. + av1_write_obu_tg_tile_headers(cpi, xd, pack_bs_params, tile_idx); + tile_dst += tg_buf_size[tg_idx]; + + // Exclude headers from tile group buffer size. + tg_buf_size[tg_idx] -= pack_bs_params->curr_tg_hdr_size; + tile_idx += tg_size_in_tiles; + } + + tg_idx = 0; + // Calculate bitstream buffer size of each tile in the tile group. 
+ for (tile_idx = 0; tile_idx < num_tiles; tile_idx++) { + PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx]; + + if (pack_bs_params->new_tg) { + max_buf_size = tg_buf_size[tg_idx]; + remain_buf_size = max_buf_size; + } + + // Prorate bitstream buffer size of this tile based on tile size and + // available buffer size. For this proration, header size is not accounted. + const size_t tile_buf_size = get_bs_chunk_size( + pack_bs_params->tile_size_mi, tg_size_mi[tg_idx], &remain_buf_size, + max_buf_size, pack_bs_params->is_last_tile_in_tg); + pack_bs_params->tile_buf_size = tile_buf_size; + + // Update base address of bitstream buffer for tile and tile group. + if (pack_bs_params->new_tg) { + tile_dst = pack_bs_params->dst; + tile_data_curr = pack_bs_params->tile_data_curr; + // Account header size in first tile of a tile group. + pack_bs_params->tile_buf_size += pack_bs_params->curr_tg_hdr_size; + } else { + pack_bs_params->dst = tile_dst; + pack_bs_params->tile_data_curr = tile_data_curr; + } + + if (pack_bs_params->is_last_tile_in_tg) tg_idx++; + tile_dst += pack_bs_params->tile_buf_size; + } +} + +// Worker hook function of pack bitsteam multithreading. 
+static int pack_bs_worker_hook(void *arg1, void *arg2) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; + PackBSParams *const pack_bs_params = (PackBSParams *)arg2; + AV1_COMP *const cpi = thread_data->cpi; + AV1_COMMON *const cm = &cpi->common; + AV1EncPackBSSync *const pack_bs_sync = &cpi->mt_info.pack_bs_sync; + const CommonTileParams *const tiles = &cm->tiles; + const int num_tiles = tiles->cols * tiles->rows; + + while (1) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pack_bs_sync->mutex_); +#endif + const int tile_idx = get_next_pack_bs_tile_idx(pack_bs_sync, num_tiles); +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(pack_bs_sync->mutex_); +#endif + if (tile_idx == -1) break; + TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; + thread_data->td->mb.e_mbd.tile_ctx = &this_tile->tctx; + + av1_pack_tile_info(cpi, thread_data->td, &pack_bs_params[tile_idx]); + } + + return 1; +} + +// Prepares thread data and workers of pack bitsteam multithreading. +static void prepare_pack_bs_workers(AV1_COMP *const cpi, + PackBSParams *const pack_bs_params, + AVxWorkerHook hook, const int num_workers) { + MultiThreadInfo *const mt_info = &cpi->mt_info; + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *worker = &mt_info->workers[i]; + EncWorkerData *const thread_data = &mt_info->tile_thr_data[i]; + if (i == 0) thread_data->td = &cpi->td; + + if (thread_data->td != &cpi->td) thread_data->td->mb = cpi->td.mb; + + thread_data->cpi = cpi; + thread_data->start = i; + thread_data->thread_id = i; + av1_reset_pack_bs_thread_data(thread_data->td); + + worker->hook = hook; + worker->data1 = thread_data; + worker->data2 = pack_bs_params; + } + + AV1_COMMON *const cm = &cpi->common; + AV1EncPackBSSync *const pack_bs_sync = &mt_info->pack_bs_sync; + const uint16_t num_tiles = cm->tiles.rows * cm->tiles.cols; +#if CONFIG_MULTITHREAD + if (pack_bs_sync->mutex_ == NULL) { + CHECK_MEM_ERROR(cm, pack_bs_sync->mutex_, + aom_malloc(sizeof(*pack_bs_sync->mutex_))); + if 
(pack_bs_sync->mutex_) pthread_mutex_init(pack_bs_sync->mutex_, NULL); + } +#endif + pack_bs_sync->next_job_idx = 0; + + PackBSTileOrder *const pack_bs_tile_order = pack_bs_sync->pack_bs_tile_order; + // Reset tile order data of pack bitstream + av1_zero_array(pack_bs_tile_order, num_tiles); + + // Populate pack bitstream tile order structure + for (uint16_t tile_idx = 0; tile_idx < num_tiles; tile_idx++) { + pack_bs_tile_order[tile_idx].abs_sum_level = + cpi->tile_data[tile_idx].abs_sum_level; + pack_bs_tile_order[tile_idx].tile_idx = tile_idx; + } + + // Sort tiles in descending order based on tile area. + qsort(pack_bs_tile_order, num_tiles, sizeof(*pack_bs_tile_order), + compare_tile_order); +} + +// Accumulates data after pack bitsteam processing. +static void accumulate_pack_bs_data( + AV1_COMP *const cpi, const PackBSParams *const pack_bs_params_arr, + uint8_t *const dst, uint32_t *total_size, const FrameHeaderInfo *fh_info, + int *const largest_tile_id, unsigned int *max_tile_size, + uint32_t *const obu_header_size, uint8_t **tile_data_start, + const int num_workers) { + const AV1_COMMON *const cm = &cpi->common; + const CommonTileParams *const tiles = &cm->tiles; + const int tile_count = tiles->cols * tiles->rows; + // Fixed size tile groups for the moment + size_t curr_tg_data_size = 0; + int is_first_tg = 1; + uint8_t *curr_tg_start = dst; + size_t src_offset = 0; + size_t dst_offset = 0; + + for (int tile_idx = 0; tile_idx < tile_count; tile_idx++) { + // PackBSParams stores all parameters required to pack tile and header + // info. 
+ const PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx]; + uint32_t tile_size = 0; + + if (pack_bs_params->new_tg) { + curr_tg_start = dst + *total_size; + curr_tg_data_size = pack_bs_params->curr_tg_hdr_size; + *tile_data_start += pack_bs_params->curr_tg_hdr_size; + *obu_header_size = pack_bs_params->obu_header_size; + } + curr_tg_data_size += + pack_bs_params->buf.size + (pack_bs_params->is_last_tile_in_tg ? 0 : 4); + + if (pack_bs_params->buf.size > *max_tile_size) { + *largest_tile_id = tile_idx; + *max_tile_size = (unsigned int)pack_bs_params->buf.size; + } + tile_size += + (uint32_t)pack_bs_params->buf.size + *pack_bs_params->total_size; + + // Pack all the chunks of tile bitstreams together + if (tile_idx != 0) memmove(dst + dst_offset, dst + src_offset, tile_size); + + if (pack_bs_params->is_last_tile_in_tg) + av1_write_last_tile_info( + cpi, fh_info, pack_bs_params->saved_wb, &curr_tg_data_size, + curr_tg_start, &tile_size, tile_data_start, largest_tile_id, + &is_first_tg, *obu_header_size, pack_bs_params->obu_extn_header); + src_offset += pack_bs_params->tile_buf_size; + dst_offset += tile_size; + *total_size += tile_size; + } + + // Accumulate thread data + MultiThreadInfo *const mt_info = &cpi->mt_info; + for (int idx = num_workers - 1; idx >= 0; idx--) { + ThreadData const *td = mt_info->tile_thr_data[idx].td; + av1_accumulate_pack_bs_thread_data(cpi, td); + } +} + +void av1_write_tile_obu_mt( + AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size, + struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header, + const FrameHeaderInfo *fh_info, int *const largest_tile_id, + unsigned int *max_tile_size, uint32_t *const obu_header_size, + uint8_t **tile_data_start) { + MultiThreadInfo *const mt_info = &cpi->mt_info; + const int num_workers = mt_info->num_mod_workers[MOD_PACK_BS]; + + PackBSParams pack_bs_params[MAX_TILES]; + uint32_t tile_size[MAX_TILES] = { 0 }; + + for (int tile_idx = 0; tile_idx < MAX_TILES; tile_idx++) + 
pack_bs_params[tile_idx].total_size = &tile_size[tile_idx]; + + init_tile_pack_bs_params(cpi, dst, saved_wb, pack_bs_params, obu_extn_header); + prepare_pack_bs_workers(cpi, pack_bs_params, pack_bs_worker_hook, + num_workers); + launch_workers(mt_info, num_workers); + sync_enc_workers(mt_info, &cpi->common, num_workers); + accumulate_pack_bs_data(cpi, pack_bs_params, dst, total_size, fh_info, + largest_tile_id, max_tile_size, obu_header_size, + tile_data_start, num_workers); +} + // Deallocate memory for CDEF search multi-thread synchronization. void av1_cdef_mt_dealloc(AV1CdefSync *cdef_sync) { (void)cdef_sync; @@ -1780,6 +2171,9 @@ static void update_next_job_info(AV1CdefSync *cdef_sync, int nvfb, int nhfb) { // Initializes cdef_sync parameters. static AOM_INLINE void cdef_reset_job_info(AV1CdefSync *cdef_sync) { +#if CONFIG_MULTITHREAD + if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL); +#endif // CONFIG_MULTITHREAD cdef_sync->end_of_frame = 0; cdef_sync->fbr = 0; cdef_sync->fbc = 0; @@ -1896,6 +2290,12 @@ static AOM_INLINE int compute_num_lr_workers(AV1_COMP *cpi) { return compute_num_enc_workers(cpi, cpi->oxcf.max_threads); } +// Computes num_workers for pack bitstream multi-threading. +static AOM_INLINE int compute_num_pack_bs_workers(AV1_COMP *cpi) { + if (cpi->oxcf.max_threads <= 1) return 1; + return compute_num_enc_tile_mt_workers(&cpi->common, cpi->oxcf.max_threads); +} + int compute_num_mod_workers(AV1_COMP *cpi, MULTI_THREADED_MODULES mod_name) { int num_mod_workers = 0; switch (mod_name) { @@ -1915,7 +2315,9 @@ int compute_num_mod_workers(AV1_COMP *cpi, MULTI_THREADED_MODULES mod_name) { case MOD_CDEF_SEARCH: num_mod_workers = compute_num_cdef_workers(cpi); break; + case MOD_CDEF: num_mod_workers = compute_num_cdef_workers(cpi); break; case MOD_LR: num_mod_workers = compute_num_lr_workers(cpi); break; + case MOD_PACK_BS: num_mod_workers = compute_num_pack_bs_workers(cpi); break; default: assert(0); break; } return (num_mod_workers); |