1 files changed, 465 insertions, 63 deletions
diff --git a/third_party/libaom/source/libaom/av1/encoder/ethread.c b/third_party/libaom/source/libaom/av1/encoder/ethread.c
index 3735ca3c8b..d274b6b84f 100644
--- a/third_party/libaom/source/libaom/av1/encoder/ethread.c
+++ b/third_party/libaom/source/libaom/av1/encoder/ethread.c
@@ -11,9 +11,11 @@
 
 #include "av1/common/warped_motion.h"
 
+#include "av1/encoder/bitstream.h"
 #include "av1/encoder/encodeframe.h"
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/encodeframe_utils.h"
 #include "av1/encoder/ethread.h"
 #if !CONFIG_REALTIME_ONLY
 #include "av1/encoder/firstpass.h"
@@ -52,7 +54,7 @@ static AOM_INLINE void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
 static AOM_INLINE void update_delta_lf_for_row_mt(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
-  const int mib_size = cm->seq_params.mib_size;
+  const int mib_size = cm->seq_params->mib_size;
   const int frame_lf_count =
       av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
   for (int row = 0; row < cm->tiles.rows; row++) {
@@ -68,7 +70,8 @@ static AOM_INLINE void update_delta_lf_for_row_mt(AV1_COMP *cpi) {
           const int idx_str = cm->mi_params.mi_stride * mi_row + mi_col;
           MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + idx_str;
           MB_MODE_INFO *mbmi = mi[0];
-          if (mbmi->skip_txfm == 1 && (mbmi->bsize == cm->seq_params.sb_size)) {
+          if (mbmi->skip_txfm == 1 &&
+              (mbmi->bsize == cm->seq_params->sb_size)) {
             for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id)
               mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id];
             mbmi->delta_lf_from_base = xd->delta_lf_from_base;
@@ -362,7 +365,7 @@ static AOM_INLINE void switch_tile_and_get_next_job(
     *cur_tile_id = tile_id;
     const int unit_height = mi_size_high[fp_block_size];
     get_next_job(&tile_data[tile_id], current_mi_row,
-                 is_firstpass ? unit_height : cm->seq_params.mib_size);
+                 is_firstpass ? unit_height : cm->seq_params->mib_size);
   }
 }
 
@@ -441,13 +444,20 @@ static int enc_row_mt_worker_hook(void *arg1, void *unused) {
 
   const BLOCK_SIZE fp_block_size = cpi->fp_block_size;
   int end_of_frame = 0;
+
+  // When the master thread does not have a valid job to process, xd->tile_ctx
+  // is not set and contains a NULL pointer. This can result in a NULL pointer
+  // access violation if accessed beyond the encode stage. Hence,
+  // thread_data->td->mb.e_mbd.tile_ctx is initialized with the common frame
+  // context to avoid NULL pointer access in subsequent stages.
+  thread_data->td->mb.e_mbd.tile_ctx = cm->fc;
   while (1) {
     int current_mi_row = -1;
 #if CONFIG_MULTITHREAD
     pthread_mutex_lock(enc_row_mt_mutex_);
 #endif
     if (!get_next_job(&cpi->tile_data[cur_tile_id], &current_mi_row,
-                      cm->seq_params.mib_size)) {
+                      cm->seq_params->mib_size)) {
       // No jobs are available for the current tile. Query for the status of
       // other tiles and get the next job if available
       switch_tile_and_get_next_job(cm, cpi->tile_data, &cur_tile_id,
@@ -470,6 +480,7 @@ static int enc_row_mt_worker_hook(void *arg1, void *unused) {
 
     td->mb.e_mbd.tile_ctx = td->tctx;
     td->mb.tile_pb_ctx = &this_tile->tctx;
+    td->abs_sum_level = 0;
 
     if (this_tile->allow_update_cdf) {
       td->mb.row_ctx = this_tile->row_ctx;
@@ -482,7 +493,7 @@ static int enc_row_mt_worker_hook(void *arg1, void *unused) {
     av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row,
                            &td->mb.e_mbd);
 
-    cfl_init(&td->mb.e_mbd.cfl, &cm->seq_params);
+    cfl_init(&td->mb.e_mbd.cfl, cm->seq_params);
     if (td->mb.txfm_search_info.txb_rd_records != NULL) {
       av1_crc32c_calculator_init(
           &td->mb.txfm_search_info.txb_rd_records->mb_rd_record.crc_calculator);
@@ -492,6 +503,7 @@ static int enc_row_mt_worker_hook(void *arg1, void *unused) {
 #if CONFIG_MULTITHREAD
     pthread_mutex_lock(enc_row_mt_mutex_);
 #endif
+    this_tile->abs_sum_level += td->abs_sum_level;
     row_mt_sync->num_threads_working--;
 #if CONFIG_MULTITHREAD
     pthread_mutex_unlock(enc_row_mt_mutex_);
@@ -526,16 +538,12 @@ static int enc_worker_hook(void *arg1, void *unused) {
   return 1;
 }
 
-void av1_create_second_pass_workers(AV1_COMP *cpi, int num_workers) {
+#if CONFIG_MULTITHREAD
+void av1_init_mt_sync(AV1_COMP *cpi, int is_first_pass) {
   AV1_COMMON *const cm = &cpi->common;
-  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
   MultiThreadInfo *const mt_info = &cpi->mt_info;
 
-  assert(mt_info->workers != NULL);
-  assert(mt_info->tile_thr_data != NULL);
-
-#if CONFIG_MULTITHREAD
-  if (cpi->oxcf.row_mt == 1) {
+  if (is_first_pass || cpi->oxcf.row_mt == 1) {
     AV1EncRowMultiThreadInfo *enc_row_mt = &mt_info->enc_row_mt;
     if (enc_row_mt->mutex_ == NULL) {
       CHECK_MEM_ERROR(cm, enc_row_mt->mutex_,
@@ -543,24 +551,39 @@ void av1_create_second_pass_workers(AV1_COMP *cpi, int num_workers) {
       if (enc_row_mt->mutex_) pthread_mutex_init(enc_row_mt->mutex_, NULL);
     }
   }
-  AV1GlobalMotionSync *gm_sync = &mt_info->gm_sync;
-  if (gm_sync->mutex_ == NULL) {
-    CHECK_MEM_ERROR(cm, gm_sync->mutex_,
-                    aom_malloc(sizeof(*(gm_sync->mutex_))));
-    if (gm_sync->mutex_) pthread_mutex_init(gm_sync->mutex_, NULL);
-  }
-  AV1TemporalFilterSync *tf_sync = &mt_info->tf_sync;
-  if (tf_sync->mutex_ == NULL) {
-    CHECK_MEM_ERROR(cm, tf_sync->mutex_, aom_malloc(sizeof(*tf_sync->mutex_)));
-    if (tf_sync->mutex_) pthread_mutex_init(tf_sync->mutex_, NULL);
-  }
-  AV1CdefSync *cdef_sync = &mt_info->cdef_sync;
-  if (cdef_sync->mutex_ == NULL) {
-    CHECK_MEM_ERROR(cm, cdef_sync->mutex_,
-                    aom_malloc(sizeof(*(cdef_sync->mutex_))));
-    if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL);
+
+  if (!is_first_pass) {
+    AV1GlobalMotionSync *gm_sync = &mt_info->gm_sync;
+    if (gm_sync->mutex_ == NULL) {
+      CHECK_MEM_ERROR(cm, gm_sync->mutex_,
+                      aom_malloc(sizeof(*(gm_sync->mutex_))));
+      if (gm_sync->mutex_) pthread_mutex_init(gm_sync->mutex_, NULL);
+    }
+#if !CONFIG_REALTIME_ONLY
+    AV1TemporalFilterSync *tf_sync = &mt_info->tf_sync;
+    if (tf_sync->mutex_ == NULL) {
+      CHECK_MEM_ERROR(cm, tf_sync->mutex_,
+                      aom_malloc(sizeof(*tf_sync->mutex_)));
+      if (tf_sync->mutex_) pthread_mutex_init(tf_sync->mutex_, NULL);
+    }
+#endif  // !CONFIG_REALTIME_ONLY
+    AV1CdefSync *cdef_sync = &mt_info->cdef_sync;
+    if (cdef_sync->mutex_ == NULL) {
+      CHECK_MEM_ERROR(cm, cdef_sync->mutex_,
+                      aom_malloc(sizeof(*(cdef_sync->mutex_))));
+      if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL);
+    }
   }
-#endif
+}
+#endif  // CONFIG_MULTITHREAD
+
+void av1_create_second_pass_workers(AV1_COMP *cpi, int num_workers) {
+  AV1_COMMON *const cm = &cpi->common;
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+
+  assert(mt_info->workers != NULL);
+  assert(mt_info->tile_thr_data != NULL);
 
   for (int i = num_workers - 1; i >= 0; i--) {
     AVxWorker *const worker = &mt_info->workers[i];
@@ -576,7 +599,7 @@ void av1_create_second_pass_workers(AV1_COMP *cpi, int num_workers) {
 
       // Create threads
       if (!winterface->reset(worker))
-        aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+        aom_internal_error(cm->error, AOM_CODEC_ERROR,
                            "Tile encoder thread creation failed");
     } else {
       // Main thread acts as a worker and uses the thread data in cpi.
@@ -625,10 +648,6 @@ static AOM_INLINE void create_enc_workers(AV1_COMP *cpi, int num_workers) {
 
       alloc_compound_type_rd_buffers(cm, &thread_data->td->comp_rd_buffer);
 
-      CHECK_MEM_ERROR(
-          cm, thread_data->td->tmp_conv_dst,
-          aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE *
-                               sizeof(*thread_data->td->tmp_conv_dst)));
       for (int j = 0; j < 2; ++j) {
         CHECK_MEM_ERROR(
             cm, thread_data->td->tmp_pred_bufs[j],
@@ -636,9 +655,14 @@ static AOM_INLINE void create_enc_workers(AV1_COMP *cpi, int num_workers) {
                                  sizeof(*thread_data->td->tmp_pred_bufs[j])));
       }
 
+      const int plane_types = PLANE_TYPES >> cm->seq_params->monochrome;
+      CHECK_MEM_ERROR(cm, thread_data->td->pixel_gradient_info,
+                      aom_malloc(sizeof(*thread_data->td->pixel_gradient_info) *
+                                 plane_types * MAX_SB_SQUARE));
+
       if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) {
         const int num_64x64_blocks =
-            (cm->seq_params.sb_size == BLOCK_64X64) ? 1 : 4;
+            (cm->seq_params->sb_size == BLOCK_64X64) ? 1 : 4;
         CHECK_MEM_ERROR(
             cm, thread_data->td->vt64x64,
             aom_malloc(sizeof(*thread_data->td->vt64x64) * num_64x64_blocks));
@@ -680,6 +704,10 @@ void av1_create_workers(AV1_COMP *cpi, int num_workers) {
 
       // Set up shared coeff buffers.
       av1_setup_shared_coeff_buffer(cm, &thread_data->td->shared_coeff_buf);
+      CHECK_MEM_ERROR(
+          cm, thread_data->td->tmp_conv_dst,
+          aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE *
+                               sizeof(*thread_data->td->tmp_conv_dst)));
     }
     ++mt_info->num_workers;
   }
@@ -724,7 +752,7 @@ static AOM_INLINE void fp_create_enc_workers(AV1_COMP *cpi, int num_workers) {
       if (create_workers) {
         // Create threads
         if (!winterface->reset(worker))
-          aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+          aom_internal_error(cm->error, AOM_CODEC_ERROR,
                              "Tile encoder thread creation failed");
       }
     } else {
@@ -764,7 +792,7 @@ static AOM_INLINE void sync_enc_workers(MultiThreadInfo *const mt_info,
   }
 
   if (had_error)
-    aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+    aom_internal_error(cm->error, AOM_CODEC_ERROR,
                        "Failed to encode tile data");
 }
 
@@ -780,14 +808,15 @@ static AOM_INLINE void accumulate_counters_enc_workers(AV1_COMP *cpi,
         !frame_is_intra_only(&cpi->common))
       av1_accumulate_cyclic_refresh_counters(cpi->cyclic_refresh,
                                              &thread_data->td->mb);
-    if (thread_data->td->mb.txfm_search_info.txb_rd_records) {
-      aom_free(thread_data->td->mb.txfm_search_info.txb_rd_records);
-      thread_data->td->mb.txfm_search_info.txb_rd_records = NULL;
-    }
-    if (thread_data->td != &cpi->td &&
-        cpi->oxcf.cost_upd_freq.mv < COST_UPD_OFF) {
-      aom_free(thread_data->td->mb.mv_costs);
+    if (thread_data->td != &cpi->td) {
+      if (cpi->oxcf.cost_upd_freq.mv < COST_UPD_OFF) {
+        aom_free(thread_data->td->mb.mv_costs);
+      }
+      if (cpi->oxcf.cost_upd_freq.dv < COST_UPD_OFF) {
+        aom_free(thread_data->td->mb.dv_costs);
+      }
     }
+    av1_dealloc_mb_data(&cpi->common, &thread_data->td->mb);
 
     // Accumulate counters.
     if (i > 0) {
@@ -822,6 +851,7 @@ static AOM_INLINE void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
 
     thread_data->td->intrabc_used = 0;
     thread_data->td->deltaq_used = 0;
+    thread_data->td->abs_sum_level = 0;
 
     // Before encoding a frame, copy the thread data from cpi.
     if (thread_data->td != &cpi->td) {
@@ -846,15 +876,19 @@ static AOM_INLINE void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
         memcpy(thread_data->td->mb.mv_costs, cpi->td.mb.mv_costs,
                sizeof(MvCosts));
       }
+      if (cpi->oxcf.cost_upd_freq.dv < COST_UPD_OFF) {
+        CHECK_MEM_ERROR(cm, thread_data->td->mb.dv_costs,
+                        (IntraBCMVCosts *)aom_malloc(sizeof(IntraBCMVCosts)));
+        memcpy(thread_data->td->mb.dv_costs, cpi->td.mb.dv_costs,
+               sizeof(IntraBCMVCosts));
+      }
     }
+    av1_alloc_mb_data(cm, &thread_data->td->mb,
+                      cpi->sf.rt_sf.use_nonrd_pick_mode);
+
     // Reset cyclic refresh counters.
     av1_init_cyclic_refresh_counters(&thread_data->td->mb);
 
-    if (!cpi->sf.rt_sf.use_nonrd_pick_mode) {
-      CHECK_MEM_ERROR(cm, thread_data->td->mb.txfm_search_info.txb_rd_records,
-                      (TxbRdRecords *)aom_malloc(sizeof(TxbRdRecords)));
-    }
-
     if (thread_data->td->counts != &cpi->counts) {
       memcpy(thread_data->td->counts, &cpi->counts, sizeof(cpi->counts));
     }
@@ -867,6 +901,8 @@ static AOM_INLINE void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
         thread_data->td->mb.tmp_pred_bufs[j] =
             thread_data->td->tmp_pred_bufs[j];
       }
+      thread_data->td->mb.pixel_gradient_info =
+          thread_data->td->pixel_gradient_info;
 
       thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst;
       for (int j = 0; j < 2; ++j) {
@@ -904,11 +940,16 @@ static AOM_INLINE void fp_prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
         memcpy(thread_data->td->mb.mv_costs, cpi->td.mb.mv_costs,
                sizeof(MvCosts));
       }
+      if (cpi->oxcf.cost_upd_freq.dv < COST_UPD_OFF) {
+        CHECK_MEM_ERROR(cm, thread_data->td->mb.dv_costs,
+                        (IntraBCMVCosts *)aom_malloc(sizeof(IntraBCMVCosts)));
+        memcpy(thread_data->td->mb.dv_costs, cpi->td.mb.dv_costs,
+               sizeof(IntraBCMVCosts));
+      }
     }
-    if (!cpi->sf.rt_sf.use_nonrd_pick_mode) {
-      CHECK_MEM_ERROR(cm, thread_data->td->mb.txfm_search_info.txb_rd_records,
-                      (TxbRdRecords *)aom_malloc(sizeof(TxbRdRecords)));
-    }
+
+    av1_alloc_mb_data(cm, &thread_data->td->mb,
+                      cpi->sf.rt_sf.use_nonrd_pick_mode);
   }
 }
 #endif
@@ -1191,13 +1232,15 @@ void av1_fp_encode_tiles_row_mt(AV1_COMP *cpi) {
   sync_enc_workers(&cpi->mt_info, cm, num_workers);
   for (int i = num_workers - 1; i >= 0; i--) {
     EncWorkerData *const thread_data = &cpi->mt_info.tile_thr_data[i];
-    if (thread_data->td != &cpi->td &&
-        cpi->oxcf.cost_upd_freq.mv < COST_UPD_OFF) {
-      aom_free(thread_data->td->mb.mv_costs);
-    }
-    if (thread_data->td->mb.txfm_search_info.txb_rd_records) {
-      aom_free(thread_data->td->mb.txfm_search_info.txb_rd_records);
+    if (thread_data->td != &cpi->td) {
+      if (cpi->oxcf.cost_upd_freq.mv < COST_UPD_OFF) {
+        aom_free(thread_data->td->mb.mv_costs);
+      }
+      if (cpi->oxcf.cost_upd_freq.dv < COST_UPD_OFF) {
+        aom_free(thread_data->td->mb.dv_costs);
+      }
     }
+    av1_dealloc_mb_data(cm, &thread_data->td->mb);
   }
 }
 
@@ -1277,11 +1320,15 @@ static int tpl_worker_hook(void *arg1, void *unused) {
   AV1_COMMON *cm = &cpi->common;
   MACROBLOCK *x = &thread_data->td->mb;
   MACROBLOCKD *xd = &x->e_mbd;
+  TplTxfmStats *tpl_txfm_stats = &thread_data->td->tpl_txfm_stats;
   CommonModeInfoParams *mi_params = &cm->mi_params;
-  BLOCK_SIZE bsize = convert_length_to_bsize(cpi->tpl_data.tpl_bsize_1d);
+  BLOCK_SIZE bsize = convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d);
   TX_SIZE tx_size = max_txsize_lookup[bsize];
   int mi_height = mi_size_high[bsize];
-  int num_active_workers = cpi->tpl_data.tpl_mt_sync.num_threads_working;
+  int num_active_workers = cpi->ppi->tpl_data.tpl_mt_sync.num_threads_working;
+
+  av1_init_tpl_txfm_stats(tpl_txfm_stats);
+
   for (int mi_row = thread_data->start * mi_height; mi_row < mi_params->mi_rows;
        mi_row += num_active_workers * mi_height) {
     // Motion estimation row boundary
@@ -1290,7 +1337,7 @@ static int tpl_worker_hook(void *arg1, void *unused) {
     xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
     xd->mb_to_bottom_edge =
         GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE);
-    av1_mc_flow_dispenser_row(cpi, x, mi_row, bsize, tx_size);
+    av1_mc_flow_dispenser_row(cpi, tpl_txfm_stats, x, mi_row, bsize, tx_size);
   }
   return 1;
 }
@@ -1370,6 +1417,24 @@ static AOM_INLINE void prepare_tpl_workers(AV1_COMP *cpi, AVxWorkerHook hook,
       // OBMC buffers are used only to init MS params and remain unused when
       // called from tpl, hence set the buffers to defaults.
       av1_init_obmc_buffer(&thread_data->td->mb.obmc_buffer);
+      thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst;
+      thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst;
+    }
+  }
+}
+
+// Accumulate every other worker's tpl_txfm_stats into main_td's after tpl.
+static void tpl_accumulate_txfm_stats(ThreadData *main_td,
+                                      const MultiThreadInfo *mt_info,
+                                      int num_workers) {
+  TplTxfmStats *accumulated_stats = &main_td->tpl_txfm_stats;
+  for (int i = num_workers - 1; i >= 0; i--) {
+    AVxWorker *const worker = &mt_info->workers[i];
+    EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+    ThreadData *td = thread_data->td;
+    if (td != main_td) {
+      const TplTxfmStats *tpl_txfm_stats = &td->tpl_txfm_stats;
+      av1_accumulate_tpl_txfm_stats(tpl_txfm_stats, accumulated_stats);
     }
   }
 }
@@ -1379,7 +1444,7 @@ void av1_mc_flow_dispenser_mt(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
   CommonModeInfoParams *mi_params = &cm->mi_params;
   MultiThreadInfo *mt_info = &cpi->mt_info;
-  TplParams *tpl_data = &cpi->tpl_data;
+  TplParams *tpl_data = &cpi->ppi->tpl_data;
   AV1TplRowMultiThreadSync *tpl_sync = &tpl_data->tpl_mt_sync;
   int mb_rows = mi_params->mb_rows;
   int num_workers =
@@ -1398,6 +1463,7 @@ void av1_mc_flow_dispenser_mt(AV1_COMP *cpi) {
   prepare_tpl_workers(cpi, tpl_worker_hook, num_workers);
   launch_workers(&cpi->mt_info, num_workers);
   sync_enc_workers(&cpi->mt_info, cm, num_workers);
+  tpl_accumulate_txfm_stats(&cpi->td, &cpi->mt_info, num_workers);
 }
 
 // Deallocate memory for temporal filter multi-thread synchronization.
@@ -1752,6 +1818,331 @@ void av1_global_motion_estimation_mt(AV1_COMP *cpi) {
 }
 #endif  // !CONFIG_REALTIME_ONLY
 
+// qsort() comparator: larger abs_sum_level first; ties by smaller tile_idx.
+static int compare_tile_order(const void *a, const void *b) {
+  const PackBSTileOrder *const tile_a = (const PackBSTileOrder *)a;
+  const PackBSTileOrder *const tile_b = (const PackBSTileOrder *)b;
+
+  if (tile_a->abs_sum_level > tile_b->abs_sum_level)
+    return -1;
+  else if (tile_a->abs_sum_level == tile_b->abs_sum_level)
+    return (tile_a->tile_idx > tile_b->tile_idx ? 1 : -1);
+  else
+    return 1;
+}
+
+// Returns the next tile_idx from pack_bs_tile_order and advances next_job_idx,
+static AOM_INLINE int get_next_pack_bs_tile_idx(
+    AV1EncPackBSSync *const pack_bs_sync, const int num_tiles) {
+  assert(pack_bs_sync->next_job_idx <= num_tiles);
+  if (pack_bs_sync->next_job_idx == num_tiles) return -1;  // or -1 when done.
+
+  return pack_bs_sync->pack_bs_tile_order[pack_bs_sync->next_job_idx++]
+      .tile_idx;
+}
+
+// Returns this chunk's share of the buffer: max_buf_size prorated by
+// tg_or_tile_size / frame_or_tg_size; the last chunk takes the whole remainder.
+static AOM_INLINE size_t get_bs_chunk_size(int tg_or_tile_size,
+                                           const int frame_or_tg_size,
+                                           size_t *remain_buf_size,
+                                           size_t max_buf_size,
+                                           int is_last_chunk) {
+  size_t this_chunk_size;
+  assert(*remain_buf_size > 0);
+  if (is_last_chunk) {
+    this_chunk_size = *remain_buf_size;
+    *remain_buf_size = 0;
+  } else {
+    const uint64_t size_scale = (uint64_t)max_buf_size * tg_or_tile_size;
+    this_chunk_size = (size_t)(size_scale / frame_or_tg_size);
+    *remain_buf_size -= this_chunk_size;
+    assert(*remain_buf_size > 0);
+  }
+  assert(this_chunk_size > 0);
+  return this_chunk_size;
+}
+
+// Populates per-tile pack bitstream params and prorates dst among the tiles.
+static void init_tile_pack_bs_params(AV1_COMP *const cpi, uint8_t *const dst,
+                                     struct aom_write_bit_buffer *saved_wb,
+                                     PackBSParams *const pack_bs_params_arr,
+                                     uint8_t obu_extn_header) {
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonTileParams *const tiles = &cm->tiles;
+  const int num_tiles = tiles->cols * tiles->rows;
+  // Fixed size tile groups for the moment
+  const int num_tg_hdrs = cpi->num_tg;
+  // Tile group size in terms of number of tiles.
+  const int tg_size_in_tiles = (num_tiles + num_tg_hdrs - 1) / num_tg_hdrs;
+  uint8_t *tile_dst = dst;
+  uint8_t *tile_data_curr = dst;
+  // Max tile group count cannot be more than MAX_TILES.
+  int tg_size_mi[MAX_TILES] = { 0 };  // Size of tile group in mi units
+  int tile_idx;
+  int tg_idx = 0;
+  int tile_count_in_tg = 0;
+  int new_tg = 1;
+
+  // Populate pack bitstream params of all tiles.
+  for (tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+    const TileInfo *const tile_info = &cpi->tile_data[tile_idx].tile_info;
+    PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx];
+    // Calculate tile size in mi units.
+    const int tile_size_mi = (tile_info->mi_col_end - tile_info->mi_col_start) *
+                             (tile_info->mi_row_end - tile_info->mi_row_start);
+    int is_last_tile_in_tg = 0;
+    tile_count_in_tg++;
+    if (tile_count_in_tg == tg_size_in_tiles || tile_idx == (num_tiles - 1))
+      is_last_tile_in_tg = 1;
+
+    // Populate pack bitstream params of this tile.
+    pack_bs_params->curr_tg_hdr_size = 0;
+    pack_bs_params->obu_extn_header = obu_extn_header;
+    pack_bs_params->saved_wb = saved_wb;
+    pack_bs_params->obu_header_size = 0;
+    pack_bs_params->is_last_tile_in_tg = is_last_tile_in_tg;
+    pack_bs_params->new_tg = new_tg;
+    pack_bs_params->tile_col = tile_info->tile_col;
+    pack_bs_params->tile_row = tile_info->tile_row;
+    pack_bs_params->tile_size_mi = tile_size_mi;
+    tg_size_mi[tg_idx] += tile_size_mi;
+
+    if (new_tg) new_tg = 0;
+    if (is_last_tile_in_tg) {
+      tile_count_in_tg = 0;
+      new_tg = 1;
+      tg_idx++;
+    }
+  }
+
+  assert(cpi->available_bs_size > 0);
+  size_t tg_buf_size[MAX_TILES] = { 0 };
+  size_t max_buf_size = cpi->available_bs_size;
+  size_t remain_buf_size = max_buf_size;
+  const int frame_size_mi = cm->mi_params.mi_rows * cm->mi_params.mi_cols;
+
+  tile_idx = 0;
+  // Prepare obu, tile group and frame header of each tile group.
+  for (tg_idx = 0; tg_idx < cpi->num_tg; tg_idx++) {
+    PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx];
+    int is_last_tg = tg_idx == cpi->num_tg - 1;
+    // Prorate bitstream buffer size based on tile group size and available
+    // buffer size. This buffer will be used to store headers and tile data.
+    tg_buf_size[tg_idx] =
+        get_bs_chunk_size(tg_size_mi[tg_idx], frame_size_mi, &remain_buf_size,
+                          max_buf_size, is_last_tg);
+
+    pack_bs_params->dst = tile_dst;
+    pack_bs_params->tile_data_curr = tile_dst;
+
+    // Write obu, tile group and frame header at first tile in the tile
+    // group.
+    av1_write_obu_tg_tile_headers(cpi, xd, pack_bs_params, tile_idx);
+    tile_dst += tg_buf_size[tg_idx];
+
+    // Exclude headers from tile group buffer size.
+    tg_buf_size[tg_idx] -= pack_bs_params->curr_tg_hdr_size;
+    tile_idx += tg_size_in_tiles;
+  }
+
+  tg_idx = 0;
+  // Calculate bitstream buffer size of each tile in the tile group.
+  for (tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+    PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx];
+
+    if (pack_bs_params->new_tg) {
+      max_buf_size = tg_buf_size[tg_idx];
+      remain_buf_size = max_buf_size;
+    }
+
+    // Prorate bitstream buffer size of this tile based on tile size and
+    // available buffer size. For this proration, header size is not accounted.
+    const size_t tile_buf_size = get_bs_chunk_size(
+        pack_bs_params->tile_size_mi, tg_size_mi[tg_idx], &remain_buf_size,
+        max_buf_size, pack_bs_params->is_last_tile_in_tg);
+    pack_bs_params->tile_buf_size = tile_buf_size;
+
+    // Update base address of bitstream buffer for tile and tile group.
+    if (pack_bs_params->new_tg) {
+      tile_dst = pack_bs_params->dst;
+      tile_data_curr = pack_bs_params->tile_data_curr;
+      // Account header size in first tile of a tile group.
+      pack_bs_params->tile_buf_size += pack_bs_params->curr_tg_hdr_size;
+    } else {
+      pack_bs_params->dst = tile_dst;
+      pack_bs_params->tile_data_curr = tile_data_curr;
+    }
+
+    if (pack_bs_params->is_last_tile_in_tg) tg_idx++;
+    tile_dst += pack_bs_params->tile_buf_size;
+  }
+}
+
+// Worker hook of pack bitstream multithreading: packs tiles until none remain.
+static int pack_bs_worker_hook(void *arg1, void *arg2) {
+  EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+  PackBSParams *const pack_bs_params = (PackBSParams *)arg2;
+  AV1_COMP *const cpi = thread_data->cpi;
+  AV1_COMMON *const cm = &cpi->common;
+  AV1EncPackBSSync *const pack_bs_sync = &cpi->mt_info.pack_bs_sync;
+  const CommonTileParams *const tiles = &cm->tiles;
+  const int num_tiles = tiles->cols * tiles->rows;
+
+  while (1) {
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(pack_bs_sync->mutex_);
+#endif
+    const int tile_idx = get_next_pack_bs_tile_idx(pack_bs_sync, num_tiles);
+#if CONFIG_MULTITHREAD
+    pthread_mutex_unlock(pack_bs_sync->mutex_);
+#endif
+    if (tile_idx == -1) break;
+    TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
+    thread_data->td->mb.e_mbd.tile_ctx = &this_tile->tctx;
+
+    av1_pack_tile_info(cpi, thread_data->td, &pack_bs_params[tile_idx]);
+  }
+
+  return 1;
+}
+
+// Prepares thread data and workers of pack bitstream multithreading.
+static void prepare_pack_bs_workers(AV1_COMP *const cpi,
+                                    PackBSParams *const pack_bs_params,
+                                    AVxWorkerHook hook, const int num_workers) {
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  for (int i = num_workers - 1; i >= 0; i--) {
+    AVxWorker *worker = &mt_info->workers[i];
+    EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
+    if (i == 0) thread_data->td = &cpi->td;
+
+    if (thread_data->td != &cpi->td) thread_data->td->mb = cpi->td.mb;
+
+    thread_data->cpi = cpi;
+    thread_data->start = i;
+    thread_data->thread_id = i;
+    av1_reset_pack_bs_thread_data(thread_data->td);
+
+    worker->hook = hook;
+    worker->data1 = thread_data;
+    worker->data2 = pack_bs_params;
+  }
+
+  AV1_COMMON *const cm = &cpi->common;
+  AV1EncPackBSSync *const pack_bs_sync = &mt_info->pack_bs_sync;
+  const uint16_t num_tiles = cm->tiles.rows * cm->tiles.cols;
+#if CONFIG_MULTITHREAD
+  if (pack_bs_sync->mutex_ == NULL) {
+    CHECK_MEM_ERROR(cm, pack_bs_sync->mutex_,
+                    aom_malloc(sizeof(*pack_bs_sync->mutex_)));
+    if (pack_bs_sync->mutex_) pthread_mutex_init(pack_bs_sync->mutex_, NULL);
+  }
+#endif
+  pack_bs_sync->next_job_idx = 0;
+
+  PackBSTileOrder *const pack_bs_tile_order = pack_bs_sync->pack_bs_tile_order;
+  // Reset tile order data of pack bitstream
+  av1_zero_array(pack_bs_tile_order, num_tiles);
+
+  // Populate pack bitstream tile order structure
+  for (uint16_t tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+    pack_bs_tile_order[tile_idx].abs_sum_level =
+        cpi->tile_data[tile_idx].abs_sum_level;
+    pack_bs_tile_order[tile_idx].tile_idx = tile_idx;
+  }
+
+  // Sort tiles in descending order of abs_sum_level (see compare_tile_order).
+  qsort(pack_bs_tile_order, num_tiles, sizeof(*pack_bs_tile_order),
+        compare_tile_order);
+}
+
+// Accumulates data after pack bitstream processing.
+static void accumulate_pack_bs_data(
+    AV1_COMP *const cpi, const PackBSParams *const pack_bs_params_arr,
+    uint8_t *const dst, uint32_t *total_size, const FrameHeaderInfo *fh_info,
+    int *const largest_tile_id, unsigned int *max_tile_size,
+    uint32_t *const obu_header_size, uint8_t **tile_data_start,
+    const int num_workers) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const CommonTileParams *const tiles = &cm->tiles;
+  const int tile_count = tiles->cols * tiles->rows;
+  // Fixed size tile groups for the moment
+  size_t curr_tg_data_size = 0;
+  int is_first_tg = 1;
+  uint8_t *curr_tg_start = dst;
+  size_t src_offset = 0;
+  size_t dst_offset = 0;
+
+  for (int tile_idx = 0; tile_idx < tile_count; tile_idx++) {
+    // PackBSParams stores all parameters required to pack tile and header
+    // info.
+    const PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx];
+    uint32_t tile_size = 0;
+
+    if (pack_bs_params->new_tg) {
+      curr_tg_start = dst + *total_size;
+      curr_tg_data_size = pack_bs_params->curr_tg_hdr_size;
+      *tile_data_start += pack_bs_params->curr_tg_hdr_size;
+      *obu_header_size = pack_bs_params->obu_header_size;
+    }
+    curr_tg_data_size +=
+        pack_bs_params->buf.size + (pack_bs_params->is_last_tile_in_tg ? 0 : 4);
+
+    if (pack_bs_params->buf.size > *max_tile_size) {
+      *largest_tile_id = tile_idx;
+      *max_tile_size = (unsigned int)pack_bs_params->buf.size;
+    }
+    tile_size +=
+        (uint32_t)pack_bs_params->buf.size + *pack_bs_params->total_size;
+
+    // Pack all the chunks of tile bitstreams together
+    if (tile_idx != 0) memmove(dst + dst_offset, dst + src_offset, tile_size);
+
+    if (pack_bs_params->is_last_tile_in_tg)
+      av1_write_last_tile_info(
+          cpi, fh_info, pack_bs_params->saved_wb, &curr_tg_data_size,
+          curr_tg_start, &tile_size, tile_data_start, largest_tile_id,
+          &is_first_tg, *obu_header_size, pack_bs_params->obu_extn_header);
+    src_offset += pack_bs_params->tile_buf_size;
+    dst_offset += tile_size;
+    *total_size += tile_size;
+  }
+
+  // Accumulate thread data
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  for (int idx = num_workers - 1; idx >= 0; idx--) {
+    ThreadData const *td = mt_info->tile_thr_data[idx].td;
+    av1_accumulate_pack_bs_thread_data(cpi, td);
+  }
+}
+
+void av1_write_tile_obu_mt(
+    AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size,
+    struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header,
+    const FrameHeaderInfo *fh_info, int *const largest_tile_id,
+    unsigned int *max_tile_size, uint32_t *const obu_header_size,
+    uint8_t **tile_data_start) {
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  const int num_workers = mt_info->num_mod_workers[MOD_PACK_BS];
+
+  PackBSParams pack_bs_params[MAX_TILES];  // Indexed by tile_idx.
+  uint32_t tile_size[MAX_TILES] = { 0 };   // Storage behind total_size.
+
+  for (int tile_idx = 0; tile_idx < MAX_TILES; tile_idx++)
+    pack_bs_params[tile_idx].total_size = &tile_size[tile_idx];
+
+  init_tile_pack_bs_params(cpi, dst, saved_wb, pack_bs_params, obu_extn_header);
+  prepare_pack_bs_workers(cpi, pack_bs_params, pack_bs_worker_hook,
+                          num_workers);
+  launch_workers(mt_info, num_workers);
+  sync_enc_workers(mt_info, &cpi->common, num_workers);
+  accumulate_pack_bs_data(cpi, pack_bs_params, dst, total_size, fh_info,
+                          largest_tile_id, max_tile_size, obu_header_size,
+                          tile_data_start, num_workers);
+}
+
 // Deallocate memory for CDEF search multi-thread synchronization.
 void av1_cdef_mt_dealloc(AV1CdefSync *cdef_sync) {
   (void)cdef_sync;
@@ -1780,6 +2171,9 @@ static void update_next_job_info(AV1CdefSync *cdef_sync, int nvfb, int nhfb) {
 
 // Initializes cdef_sync parameters.
 static AOM_INLINE void cdef_reset_job_info(AV1CdefSync *cdef_sync) {
+#if CONFIG_MULTITHREAD
+  if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL);
+#endif  // CONFIG_MULTITHREAD
   cdef_sync->end_of_frame = 0;
   cdef_sync->fbr = 0;
   cdef_sync->fbc = 0;
@@ -1896,6 +2290,12 @@ static AOM_INLINE int compute_num_lr_workers(AV1_COMP *cpi) {
   return compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
 }
 
+// Computes num_workers for pack bitstream multi-threading (1 if max_threads<=1).
+static AOM_INLINE int compute_num_pack_bs_workers(AV1_COMP *cpi) {
+  if (cpi->oxcf.max_threads <= 1) return 1;
+  return compute_num_enc_tile_mt_workers(&cpi->common, cpi->oxcf.max_threads);
+}
+
 int compute_num_mod_workers(AV1_COMP *cpi, MULTI_THREADED_MODULES mod_name) {
   int num_mod_workers = 0;
   switch (mod_name) {
@@ -1915,7 +2315,9 @@ int compute_num_mod_workers(AV1_COMP *cpi, MULTI_THREADED_MODULES mod_name) {
     case MOD_CDEF_SEARCH:
       num_mod_workers = compute_num_cdef_workers(cpi);
       break;
+    case MOD_CDEF: num_mod_workers = compute_num_cdef_workers(cpi); break;
     case MOD_LR: num_mod_workers = compute_num_lr_workers(cpi); break;
+    case MOD_PACK_BS: num_mod_workers = compute_num_pack_bs_workers(cpi); break;
     default: assert(0); break;
   }
   return (num_mod_workers);