Diffstat (limited to 'vp9')
-rw-r--r--  vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c | 8
-rw-r--r--  vp9/common/vp9_blockd.h | 2
-rw-r--r--  vp9/common/vp9_common.h | 21
-rw-r--r--  vp9/common/vp9_entropymode.c | 1
-rw-r--r--  vp9/common/vp9_idct.c | 2
-rw-r--r--  vp9/common/vp9_loopfilter.c | 38
-rw-r--r--  vp9/common/vp9_mfqe.c | 2
-rw-r--r--  vp9/common/vp9_reconinter.c | 13
-rw-r--r--  vp9/common/vp9_rtcd_defs.pl | 33
-rw-r--r--  vp9/common/vp9_scan.c | 4
-rw-r--r--  vp9/common/vp9_scan.h | 12
-rw-r--r--  vp9/common/vp9_thread_common.c | 26
-rw-r--r--  vp9/decoder/vp9_decodeframe.c | 40
-rw-r--r--  vp9/decoder/vp9_decodemv.c | 12
-rw-r--r--  vp9/decoder/vp9_decoder.c | 21
-rw-r--r--  vp9/decoder/vp9_decoder.h | 2
-rw-r--r--  vp9/decoder/vp9_detokenize.c | 5
-rw-r--r--  vp9/decoder/vp9_detokenize.h | 5
-rw-r--r--  vp9/encoder/arm/neon/vp9_dct_neon.c | 942
-rw-r--r--  vp9/encoder/arm/neon/vp9_denoiser_neon.c | 2
-rw-r--r--  vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c | 60
-rw-r--r--  vp9/encoder/arm/neon/vp9_error_neon.c | 99
-rw-r--r--  vp9/encoder/arm/neon/vp9_highbd_error_neon.c | 49
-rw-r--r--  vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c | 872
-rw-r--r--  vp9/encoder/arm/neon/vp9_quantize_neon.c | 75
-rw-r--r--  vp9/encoder/arm/neon/vp9_temporal_filter_neon.c | 849
-rw-r--r--  vp9/encoder/vp9_aq_complexity.c | 2
-rw-r--r--  vp9/encoder/vp9_bitstream.c | 13
-rw-r--r--  vp9/encoder/vp9_block.h | 15
-rw-r--r--  vp9/encoder/vp9_context_tree.c | 15
-rw-r--r--  vp9/encoder/vp9_denoiser.c | 8
-rw-r--r--  vp9/encoder/vp9_encodeframe.c | 263
-rw-r--r--  vp9/encoder/vp9_encodemb.c | 239
-rw-r--r--  vp9/encoder/vp9_encodemb.h | 7
-rw-r--r--  vp9/encoder/vp9_encoder.c | 2480
-rw-r--r--  vp9/encoder/vp9_encoder.h | 307
-rw-r--r--  vp9/encoder/vp9_ethread.c | 15
-rw-r--r--  vp9/encoder/vp9_ext_ratectrl.c | 22
-rw-r--r--  vp9/encoder/vp9_ext_ratectrl.h | 4
-rw-r--r--  vp9/encoder/vp9_firstpass.c | 152
-rw-r--r--  vp9/encoder/vp9_firstpass.h | 31
-rw-r--r--  vp9/encoder/vp9_firstpass_stats.h | 54
-rw-r--r--  vp9/encoder/vp9_frame_scale.c | 18
-rw-r--r--  vp9/encoder/vp9_mbgraph.c | 30
-rw-r--r--  vp9/encoder/vp9_mcomp.c | 204
-rw-r--r--  vp9/encoder/vp9_mcomp.h | 14
-rw-r--r--  vp9/encoder/vp9_multi_thread.c | 4
-rw-r--r--  vp9/encoder/vp9_noise_estimate.c | 2
-rw-r--r--  vp9/encoder/vp9_pickmode.c | 73
-rw-r--r--  vp9/encoder/vp9_quantize.c | 48
-rw-r--r--  vp9/encoder/vp9_ratectrl.c | 66
-rw-r--r--  vp9/encoder/vp9_ratectrl.h | 2
-rw-r--r--  vp9/encoder/vp9_rd.c | 40
-rw-r--r--  vp9/encoder/vp9_rd.h | 7
-rw-r--r--  vp9/encoder/vp9_rdopt.c | 678
-rw-r--r--  vp9/encoder/vp9_resize.c | 6
-rw-r--r--  vp9/encoder/vp9_speed_features.c | 111
-rw-r--r--  vp9/encoder/vp9_speed_features.h | 61
-rw-r--r--  vp9/encoder/vp9_svc_layercontext.c | 82
-rw-r--r--  vp9/encoder/vp9_svc_layercontext.h | 5
-rw-r--r--  vp9/encoder/vp9_temporal_filter.c | 4
-rw-r--r--  vp9/encoder/vp9_temporal_filter_constants.h (renamed from vp9/encoder/x86/temporal_filter_constants.h) | 8
-rw-r--r--  vp9/encoder/vp9_tokenize.c | 2
-rw-r--r--  vp9/encoder/vp9_tpl_model.c | 1541
-rw-r--r--  vp9/encoder/vp9_tpl_model.h | 46
-rw-r--r--  vp9/encoder/x86/highbd_temporal_filter_sse4.c | 7
-rw-r--r--  vp9/encoder/x86/temporal_filter_sse4.c | 2
-rw-r--r--  vp9/encoder/x86/vp9_diamond_search_sad_avx.c | 317
-rw-r--r--  vp9/encoder/x86/vp9_frame_scale_ssse3.c | 24
-rw-r--r--  vp9/encoder/x86/vp9_quantize_avx2.c | 65
-rw-r--r--  vp9/encoder/x86/vp9_quantize_sse2.c | 11
-rw-r--r--  vp9/encoder/x86/vp9_quantize_ssse3.c | 21
-rw-r--r--  vp9/ratectrl_rtc.cc | 152
-rw-r--r--  vp9/ratectrl_rtc.h | 66
-rw-r--r--  vp9/simple_encode.cc | 10
-rw-r--r--  vp9/simple_encode.h | 2
-rw-r--r--  vp9/vp9_cx_iface.c | 68
-rw-r--r--  vp9/vp9_dx_iface.c | 18
-rw-r--r--  vp9/vp9cx.mk | 18
79 files changed, 7120 insertions(+), 3535 deletions(-)
diff --git a/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c b/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c
index 219ff63cb..b43d7fa4f 100644
--- a/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c
+++ b/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c
@@ -18,7 +18,7 @@
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/inv_txfm.h"
-// Use macros to make sure argument lane is passed in as an constant integer.
+// Use macros to make sure argument lane is passed in as a constant integer.
#define vmull_lane_s32_dual(in, c, lane, out) \
do { \
@@ -64,9 +64,9 @@ highbd_dct_const_round_shift_low_8(const int64x2x2_t *const in) {
#define highbd_iadst_half_butterfly(in, c, lane, out) \
do { \
- int64x2x2_t t[2]; \
- vmull_lane_s32_dual(in, c, lane, t); \
- out = highbd_dct_const_round_shift_low_8(t); \
+ int64x2x2_t _t[2]; \
+ vmull_lane_s32_dual(in, c, lane, _t); \
+ out = highbd_dct_const_round_shift_low_8(_t); \
} while (0)
#define highbd_iadst_butterfly(in0, in1, c, lane0, lane1, s0, s1) \
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index d7de46cf4..aa13d8a0d 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -54,7 +54,7 @@ typedef struct {
// decoder implementation modules critically rely on the defined entry values
// specified herein. They should be refactored concurrently.
-#define NONE (-1)
+#define NO_REF_FRAME (-1)
#define INTRA_FRAME 0
#define LAST_FRAME 1
#define GOLDEN_FRAME 2
diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h
index 8d2bed38e..d63bad93d 100644
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h
@@ -46,27 +46,6 @@ static INLINE int get_unsigned_bits(unsigned int num_values) {
return num_values > 0 ? get_msb(num_values) + 1 : 0;
}
-#if CONFIG_DEBUG
-#define CHECK_MEM_ERROR(cm, lval, expr) \
- do { \
- assert(&(cm)->error.setjmp); \
- (lval) = (expr); \
- if (!(lval)) \
- vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \
- "Failed to allocate " #lval " at %s:%d", __FILE__, \
- __LINE__); \
- } while (0)
-#else
-#define CHECK_MEM_ERROR(cm, lval, expr) \
- do { \
- assert(&(cm)->error.setjmp); \
- (lval) = (expr); \
- if (!(lval)) \
- vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \
- "Failed to allocate " #lval); \
- } while (0)
-#endif
-
#define VP9_SYNC_CODE_0 0x49
#define VP9_SYNC_CODE_1 0x83
#define VP9_SYNC_CODE_2 0x42
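
Note: the CHECK_MEM_ERROR macro deleted above is not dropped outright; later hunks in this diff (vp9_thread_common.c, vp9_decodeframe.c, vp9_decoder.c) switch every call site from CHECK_MEM_ERROR(cm, ...) to CHECK_MEM_ERROR(&cm->error, ...). A minimal sketch of the shared form those call sites assume, taking the error-info struct directly instead of the whole codec context (the header it actually moves to is not shown in this diff):

/* Sketch only: assumed shared definition, mirroring the non-CONFIG_DEBUG
 * variant removed above with the first argument changed to the error struct. */
#define CHECK_MEM_ERROR(error, lval, expr)                    \
  do {                                                        \
    assert((error)->setjmp);                                  \
    (lval) = (expr);                                          \
    if (!(lval))                                              \
      vpx_internal_error((error), VPX_CODEC_MEM_ERROR,        \
                         "Failed to allocate " #lval);        \
  } while (0)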
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index bda824de3..9289fc9e1 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -381,7 +381,6 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
}
if (cm->tx_mode == TX_MODE_SELECT) {
- int j;
unsigned int branch_ct_8x8p[TX_SIZES - 3][2];
unsigned int branch_ct_16x16p[TX_SIZES - 2][2];
unsigned int branch_ct_32x32p[TX_SIZES - 1][2];
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 69069042c..71be0f310 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -150,6 +150,7 @@ void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob) {
+ assert(((intptr_t)input) % 32 == 0);
/* The calculation can be simplified if there are not many non-zero dct
* coefficients. Use eobs to separate different cases. */
if (eob == 1) /* DC only DCT coefficient. */
@@ -164,6 +165,7 @@ void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob) {
+ assert(((intptr_t)input) % 32 == 0);
if (eob == 1)
vpx_idct32x32_1_add(input, dest, stride);
else if (eob <= 34)
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 765cb1172..1a9d45ae7 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -932,32 +932,32 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
break;
default:
for (idx_32 = 0; idx_32 < 4; mip += offset_32[idx_32], ++idx_32) {
- const int shift_y = shift_32_y[idx_32];
- const int shift_uv = shift_32_uv[idx_32];
+ const int shift_y_32 = shift_32_y[idx_32];
+ const int shift_uv_32 = shift_32_uv[idx_32];
const int mi_32_col_offset = ((idx_32 & 1) << 2);
const int mi_32_row_offset = ((idx_32 >> 1) << 2);
if (mi_32_col_offset >= max_cols || mi_32_row_offset >= max_rows)
continue;
switch (mip[0]->sb_type) {
case BLOCK_32X32:
- build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+ build_masks(lfi_n, mip[0], shift_y_32, shift_uv_32, lfm);
break;
case BLOCK_32X16:
- build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+ build_masks(lfi_n, mip[0], shift_y_32, shift_uv_32, lfm);
if (mi_32_row_offset + 2 >= max_rows) continue;
mip2 = mip + mode_info_stride * 2;
- build_masks(lfi_n, mip2[0], shift_y + 16, shift_uv + 4, lfm);
+ build_masks(lfi_n, mip2[0], shift_y_32 + 16, shift_uv_32 + 4, lfm);
break;
case BLOCK_16X32:
- build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+ build_masks(lfi_n, mip[0], shift_y_32, shift_uv_32, lfm);
if (mi_32_col_offset + 2 >= max_cols) continue;
mip2 = mip + 2;
- build_masks(lfi_n, mip2[0], shift_y + 2, shift_uv + 1, lfm);
+ build_masks(lfi_n, mip2[0], shift_y_32 + 2, shift_uv_32 + 1, lfm);
break;
default:
for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], ++idx_16) {
- const int shift_y = shift_32_y[idx_32] + shift_16_y[idx_16];
- const int shift_uv = shift_32_uv[idx_32] + shift_16_uv[idx_16];
+ const int shift_y_16 = shift_y_32 + shift_16_y[idx_16];
+ const int shift_uv_16 = shift_uv_32 + shift_16_uv[idx_16];
const int mi_16_col_offset =
mi_32_col_offset + ((idx_16 & 1) << 1);
const int mi_16_row_offset =
@@ -968,28 +968,26 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
switch (mip[0]->sb_type) {
case BLOCK_16X16:
- build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+ build_masks(lfi_n, mip[0], shift_y_16, shift_uv_16, lfm);
break;
case BLOCK_16X8:
- build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+ build_masks(lfi_n, mip[0], shift_y_16, shift_uv_16, lfm);
if (mi_16_row_offset + 1 >= max_rows) continue;
mip2 = mip + mode_info_stride;
- build_y_mask(lfi_n, mip2[0], shift_y + 8, lfm);
+ build_y_mask(lfi_n, mip2[0], shift_y_16 + 8, lfm);
break;
case BLOCK_8X16:
- build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+ build_masks(lfi_n, mip[0], shift_y_16, shift_uv_16, lfm);
if (mi_16_col_offset + 1 >= max_cols) continue;
mip2 = mip + 1;
- build_y_mask(lfi_n, mip2[0], shift_y + 1, lfm);
+ build_y_mask(lfi_n, mip2[0], shift_y_16 + 1, lfm);
break;
default: {
- const int shift_y =
- shift_32_y[idx_32] + shift_16_y[idx_16] + shift_8_y[0];
- build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+ const int shift_y_8_0 = shift_y_16 + shift_8_y[0];
+ build_masks(lfi_n, mip[0], shift_y_8_0, shift_uv_16, lfm);
mip += offset[0];
for (idx_8 = 1; idx_8 < 4; mip += offset[idx_8], ++idx_8) {
- const int shift_y = shift_32_y[idx_32] +
- shift_16_y[idx_16] + shift_8_y[idx_8];
+ const int shift_y_8 = shift_y_16 + shift_8_y[idx_8];
const int mi_8_col_offset =
mi_16_col_offset + ((idx_8 & 1));
const int mi_8_row_offset =
@@ -998,7 +996,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
if (mi_8_col_offset >= max_cols ||
mi_8_row_offset >= max_rows)
continue;
- build_y_mask(lfi_n, mip[0], shift_y, lfm);
+ build_y_mask(lfi_n, mip[0], shift_y_8, lfm);
}
break;
}
diff --git a/vp9/common/vp9_mfqe.c b/vp9/common/vp9_mfqe.c
index e76d771b8..cf60fa40f 100644
--- a/vp9/common/vp9_mfqe.c
+++ b/vp9/common/vp9_mfqe.c
@@ -217,6 +217,7 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
const int bsl = b_width_log2_lookup[bs];
PARTITION_TYPE partition = partition_lookup[bsl][cur_bs];
const BLOCK_SIZE subsize = get_subsize(bs, partition);
+ BLOCK_SIZE mfqe_bs, bs_tmp;
if (cur_bs < BLOCK_8X8) {
// If there are blocks smaller than 8x8, it must be on the boundary.
@@ -236,7 +237,6 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
uv_offset = 8;
}
switch (partition) {
- BLOCK_SIZE mfqe_bs, bs_tmp;
case PARTITION_HORZ:
if (bs == BLOCK_64X64) {
mfqe_bs = BLOCK_64X32;
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index ff59ff504..4878dc15e 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -158,18 +158,19 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
// Co-ordinate of containing block to pixel precision.
const int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
const int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
+ const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf;
+ uint8_t *buf_array[] = { ref_buf->y_buffer, ref_buf->u_buffer,
+ ref_buf->v_buffer };
+ const int stride_array[] = { ref_buf->y_stride, ref_buf->uv_stride,
+ ref_buf->uv_stride };
#if 0 // CONFIG_BETTER_HW_COMPATIBILITY
assert(xd->mi[0]->sb_type != BLOCK_4X8 &&
xd->mi[0]->sb_type != BLOCK_8X4);
assert(mv_q4.row == mv.row * (1 << (1 - pd->subsampling_y)) &&
mv_q4.col == mv.col * (1 << (1 - pd->subsampling_x)));
#endif
- if (plane == 0)
- pre_buf->buf = xd->block_refs[ref]->buf->y_buffer;
- else if (plane == 1)
- pre_buf->buf = xd->block_refs[ref]->buf->u_buffer;
- else
- pre_buf->buf = xd->block_refs[ref]->buf->v_buffer;
+ pre_buf->buf = buf_array[plane];
+ pre_buf->stride = stride_array[plane];
pre_buf->buf +=
scaled_buffer_offset(x_start + x, y_start + y, pre_buf->stride, sf);
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index f4bd9772c..3ecbd5417 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -23,7 +23,9 @@ struct macroblockd;
/* Encoder forward decls */
struct macroblock;
-struct vp9_variance_vtable;
+struct macroblock_plane;
+struct vp9_sad_table;
+struct ScanOrder;
struct search_site_config;
struct mv;
union int_mv;
@@ -127,24 +129,21 @@ if (vpx_config("CONFIG_VP9_TEMPORAL_DENOISING") eq "yes") {
add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";
+specialize qw/vp9_block_error_fp neon avx2 sse2/;
-add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order";
specialize qw/vp9_quantize_fp neon sse2 ssse3 avx2 vsx/;
-add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order";
specialize qw/vp9_quantize_fp_32x32 neon ssse3 avx2 vsx/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
- specialize qw/vp9_block_error avx2 sse2/;
-
- specialize qw/vp9_block_error_fp avx2 sse2/;
+ specialize qw/vp9_block_error neon avx2 sse2/;
add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
- specialize qw/vp9_highbd_block_error sse2/;
+ specialize qw/vp9_highbd_block_error neon sse2/;
} else {
- specialize qw/vp9_block_error avx2 msa sse2/;
-
- specialize qw/vp9_block_error_fp neon avx2 sse2/;
+ specialize qw/vp9_block_error neon avx2 msa sse2/;
}
# fdct functions
@@ -174,19 +173,19 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") {
#
# Motion search
#
-add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
-specialize qw/vp9_diamond_search_sad avx neon/;
+add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv";
+specialize qw/vp9_diamond_search_sad neon/;
#
# Apply temporal filter
#
if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") {
add_proto qw/void vp9_apply_temporal_filter/, "const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count";
-specialize qw/vp9_apply_temporal_filter sse4_1/;
+specialize qw/vp9_apply_temporal_filter sse4_1 neon/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_highbd_apply_temporal_filter/, "const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count";
- specialize qw/vp9_highbd_apply_temporal_filter sse4_1/;
+ specialize qw/vp9_highbd_apply_temporal_filter sse4_1 neon/;
}
}
@@ -195,10 +194,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# ENCODEMB INVOKE
- add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order";
specialize qw/vp9_highbd_quantize_fp avx2 neon/;
- add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" ;
+ add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order";
specialize qw/vp9_highbd_quantize_fp_32x32 avx2 neon/;
# fdct functions
@@ -206,8 +205,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_fht4x4 neon/;
add_proto qw/void vp9_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp9_highbd_fht8x8 neon/;
add_proto qw/void vp9_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp9_highbd_fht16x16 neon/;
add_proto qw/void vp9_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
diff --git a/vp9/common/vp9_scan.c b/vp9/common/vp9_scan.c
index 8bea61dea..adacb7ef9 100644
--- a/vp9/common/vp9_scan.c
+++ b/vp9/common/vp9_scan.c
@@ -688,14 +688,14 @@ DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_32x32[1024]) = {
968, 974, 989, 997, 1003, 1007, 1015, 1019, 1022, 1024,
};
-const scan_order vp9_default_scan_orders[TX_SIZES] = {
+const ScanOrder vp9_default_scan_orders[TX_SIZES] = {
{ default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors },
{ default_scan_8x8, vp9_default_iscan_8x8, default_scan_8x8_neighbors },
{ default_scan_16x16, vp9_default_iscan_16x16, default_scan_16x16_neighbors },
{ default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors },
};
-const scan_order vp9_scan_orders[TX_SIZES][TX_TYPES] = {
+const ScanOrder vp9_scan_orders[TX_SIZES][TX_TYPES] = {
{ // TX_4X4
{ default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors },
{ row_scan_4x4, vp9_row_iscan_4x4, row_scan_4x4_neighbors },
diff --git a/vp9/common/vp9_scan.h b/vp9/common/vp9_scan.h
index 72a9a5ec4..3d1dcc66d 100644
--- a/vp9/common/vp9_scan.h
+++ b/vp9/common/vp9_scan.h
@@ -23,14 +23,14 @@ extern "C" {
#define MAX_NEIGHBORS 2
-typedef struct {
+typedef struct ScanOrder {
const int16_t *scan;
const int16_t *iscan;
const int16_t *neighbors;
-} scan_order;
+} ScanOrder;
-extern const scan_order vp9_default_scan_orders[TX_SIZES];
-extern const scan_order vp9_scan_orders[TX_SIZES][TX_TYPES];
+extern const ScanOrder vp9_default_scan_orders[TX_SIZES];
+extern const ScanOrder vp9_scan_orders[TX_SIZES][TX_TYPES];
static INLINE int get_coef_context(const int16_t *neighbors,
const uint8_t *token_cache, int c) {
@@ -39,8 +39,8 @@ static INLINE int get_coef_context(const int16_t *neighbors,
1;
}
-static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
- PLANE_TYPE type, int block_idx) {
+static INLINE const ScanOrder *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
+ PLANE_TYPE type, int block_idx) {
const MODE_INFO *const mi = xd->mi[0];
if (is_inter_block(mi) || type != PLANE_TYPE_Y || xd->lossless) {
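
Note: giving the typedef a struct tag (ScanOrder) is what lets vp9_rtcd_defs.pl above forward-declare it with "struct ScanOrder;" for the new vp9_quantize_fp prototypes. A minimal sketch of that pattern, with hypothetical function names:

/* hypothetical_quantize.h: a pointer-only interface needs no vp9_scan.h. */
struct ScanOrder;  /* forward declaration is possible because the type has a tag */
void hypothetical_quantize(const struct ScanOrder *scan_order);

/* hypothetical_quantize.c: include the full definition only where members
 * are accessed. */
#include "vp9/common/vp9_scan.h"
void hypothetical_quantize(const struct ScanOrder *scan_order) {
  const int16_t *scan = scan_order->scan;  /* coefficient visiting order */
  (void)scan;
}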
diff --git a/vp9/common/vp9_thread_common.c b/vp9/common/vp9_thread_common.c
index b3d50162b..8df18af3b 100644
--- a/vp9/common/vp9_thread_common.c
+++ b/vp9/common/vp9_thread_common.c
@@ -283,7 +283,7 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
{
int i;
- CHECK_MEM_ERROR(cm, lf_sync->mutex,
+ CHECK_MEM_ERROR(&cm->error, lf_sync->mutex,
vpx_malloc(sizeof(*lf_sync->mutex) * rows));
if (lf_sync->mutex) {
for (i = 0; i < rows; ++i) {
@@ -291,7 +291,7 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
}
}
- CHECK_MEM_ERROR(cm, lf_sync->cond,
+ CHECK_MEM_ERROR(&cm->error, lf_sync->cond,
vpx_malloc(sizeof(*lf_sync->cond) * rows));
if (lf_sync->cond) {
for (i = 0; i < rows; ++i) {
@@ -299,23 +299,21 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
}
}
- CHECK_MEM_ERROR(cm, lf_sync->lf_mutex,
+ CHECK_MEM_ERROR(&cm->error, lf_sync->lf_mutex,
vpx_malloc(sizeof(*lf_sync->lf_mutex)));
pthread_mutex_init(lf_sync->lf_mutex, NULL);
- CHECK_MEM_ERROR(cm, lf_sync->recon_done_mutex,
+ CHECK_MEM_ERROR(&cm->error, lf_sync->recon_done_mutex,
vpx_malloc(sizeof(*lf_sync->recon_done_mutex) * rows));
if (lf_sync->recon_done_mutex) {
- int i;
for (i = 0; i < rows; ++i) {
pthread_mutex_init(&lf_sync->recon_done_mutex[i], NULL);
}
}
- CHECK_MEM_ERROR(cm, lf_sync->recon_done_cond,
+ CHECK_MEM_ERROR(&cm->error, lf_sync->recon_done_cond,
vpx_malloc(sizeof(*lf_sync->recon_done_cond) * rows));
if (lf_sync->recon_done_cond) {
- int i;
for (i = 0; i < rows; ++i) {
pthread_cond_init(&lf_sync->recon_done_cond[i], NULL);
}
@@ -323,15 +321,15 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
}
#endif // CONFIG_MULTITHREAD
- CHECK_MEM_ERROR(cm, lf_sync->lfdata,
+ CHECK_MEM_ERROR(&cm->error, lf_sync->lfdata,
vpx_malloc(num_workers * sizeof(*lf_sync->lfdata)));
lf_sync->num_workers = num_workers;
lf_sync->num_active_workers = lf_sync->num_workers;
- CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,
+ CHECK_MEM_ERROR(&cm->error, lf_sync->cur_sb_col,
vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));
- CHECK_MEM_ERROR(cm, lf_sync->num_tiles_done,
+ CHECK_MEM_ERROR(&cm->error, lf_sync->num_tiles_done,
vpx_malloc(sizeof(*lf_sync->num_tiles_done) *
mi_cols_aligned_to_sb(cm->mi_rows) >>
MI_BLOCK_SIZE_LOG2));
@@ -390,10 +388,10 @@ void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) {
static int get_next_row(VP9_COMMON *cm, VP9LfSync *lf_sync) {
int return_val = -1;
- int cur_row;
const int max_rows = cm->mi_rows;
#if CONFIG_MULTITHREAD
+ int cur_row;
const int tile_cols = 1 << cm->log2_tile_cols;
pthread_mutex_lock(lf_sync->lf_mutex);
@@ -430,14 +428,8 @@ static int get_next_row(VP9_COMMON *cm, VP9LfSync *lf_sync) {
#else
(void)lf_sync;
if (cm->lf_row < max_rows) {
- cur_row = cm->lf_row >> MI_BLOCK_SIZE_LOG2;
return_val = cm->lf_row;
cm->lf_row += MI_BLOCK_SIZE;
- if (cm->lf_row < max_rows) {
- /* If this is not the last row, make sure the next row is also decoded.
- * This is because the intra predict has to happen before loop filter */
- cur_row += 1;
- }
}
#endif // CONFIG_MULTITHREAD
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 2a27e6fdb..c5892156f 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -323,9 +323,9 @@ static void predict_and_reconstruct_intra_block(TileWorkerData *twd,
if (!mi->skip) {
const TX_TYPE tx_type =
(plane || xd->lossless) ? DCT_DCT : intra_mode_to_tx_type_lookup[mode];
- const scan_order *sc = (plane || xd->lossless)
- ? &vp9_default_scan_orders[tx_size]
- : &vp9_scan_orders[tx_size][tx_type];
+ const ScanOrder *sc = (plane || xd->lossless)
+ ? &vp9_default_scan_orders[tx_size]
+ : &vp9_scan_orders[tx_size][tx_type];
const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size,
mi->segment_id);
if (eob > 0) {
@@ -348,9 +348,9 @@ static void parse_intra_block_row_mt(TileWorkerData *twd, MODE_INFO *const mi,
struct macroblockd_plane *const pd = &xd->plane[plane];
const TX_TYPE tx_type =
(plane || xd->lossless) ? DCT_DCT : intra_mode_to_tx_type_lookup[mode];
- const scan_order *sc = (plane || xd->lossless)
- ? &vp9_default_scan_orders[tx_size]
- : &vp9_scan_orders[tx_size][tx_type];
+ const ScanOrder *sc = (plane || xd->lossless)
+ ? &vp9_default_scan_orders[tx_size]
+ : &vp9_scan_orders[tx_size][tx_type];
*pd->eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size,
mi->segment_id);
/* Keep the alignment to 16 */
@@ -393,7 +393,7 @@ static int reconstruct_inter_block(TileWorkerData *twd, MODE_INFO *const mi,
int mi_row, int mi_col) {
MACROBLOCKD *const xd = &twd->xd;
struct macroblockd_plane *const pd = &xd->plane[plane];
- const scan_order *sc = &vp9_default_scan_orders[tx_size];
+ const ScanOrder *sc = &vp9_default_scan_orders[tx_size];
const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size,
mi->segment_id);
uint8_t *dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col];
@@ -423,7 +423,7 @@ static int parse_inter_block_row_mt(TileWorkerData *twd, MODE_INFO *const mi,
TX_SIZE tx_size) {
MACROBLOCKD *const xd = &twd->xd;
struct macroblockd_plane *const pd = &xd->plane[plane];
- const scan_order *sc = &vp9_default_scan_orders[tx_size];
+ const ScanOrder *sc = &vp9_default_scan_orders[tx_size];
const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size,
mi->segment_id);
@@ -1469,7 +1469,7 @@ static void resize_mv_buffer(VP9_COMMON *cm) {
vpx_free(cm->cur_frame->mvs);
cm->cur_frame->mi_rows = cm->mi_rows;
cm->cur_frame->mi_cols = cm->mi_cols;
- CHECK_MEM_ERROR(cm, cm->cur_frame->mvs,
+ CHECK_MEM_ERROR(&cm->error, cm->cur_frame->mvs,
(MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
sizeof(*cm->cur_frame->mvs)));
}
@@ -1776,7 +1776,8 @@ static void vp9_jobq_alloc(VP9Decoder *pbi) {
if (jobq_size > row_mt_worker_data->jobq_size) {
vpx_free(row_mt_worker_data->jobq_buf);
- CHECK_MEM_ERROR(cm, row_mt_worker_data->jobq_buf, vpx_calloc(1, jobq_size));
+ CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->jobq_buf,
+ vpx_calloc(1, jobq_size));
vp9_jobq_init(&row_mt_worker_data->jobq, row_mt_worker_data->jobq_buf,
jobq_size);
row_mt_worker_data->jobq_size = jobq_size;
@@ -1923,7 +1924,7 @@ static int row_decode_worker_hook(void *arg1, void *arg2) {
const int is_last_row = sb_rows - 1 == cur_sb_row;
int mi_col_start, mi_col_end;
if (!tile_data_recon)
- CHECK_MEM_ERROR(cm, tile_data_recon,
+ CHECK_MEM_ERROR(&cm->error, tile_data_recon,
vpx_memalign(32, sizeof(TileWorkerData)));
tile_data_recon->xd = pbi->mb;
@@ -2025,7 +2026,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data,
if (cm->lf.filter_level && !cm->skip_loop_filter &&
pbi->lf_worker.data1 == NULL) {
- CHECK_MEM_ERROR(cm, pbi->lf_worker.data1,
+ CHECK_MEM_ERROR(&cm->error, pbi->lf_worker.data1,
vpx_memalign(32, sizeof(LFWorkerData)));
pbi->lf_worker.hook = vp9_loop_filter_worker;
if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) {
@@ -2192,8 +2193,6 @@ static int tile_worker_hook(void *arg1, void *arg2) {
volatile int mi_row = 0;
volatile int n = tile_data->buf_start;
- tile_data->error_info.setjmp = 1;
-
if (setjmp(tile_data->error_info.jmp)) {
tile_data->error_info.setjmp = 0;
tile_data->xd.corrupted = 1;
@@ -2206,6 +2205,7 @@ static int tile_worker_hook(void *arg1, void *arg2) {
}
return 0;
}
+ tile_data->error_info.setjmp = 1;
tile_data->xd.corrupted = 0;
@@ -2285,7 +2285,7 @@ static INLINE void init_mt(VP9Decoder *pbi) {
if (pbi->num_tile_workers == 0) {
const int num_threads = pbi->max_threads;
- CHECK_MEM_ERROR(cm, pbi->tile_workers,
+ CHECK_MEM_ERROR(&cm->error, pbi->tile_workers,
vpx_malloc(num_threads * sizeof(*pbi->tile_workers)));
for (n = 0; n < num_threads; ++n) {
VPxWorker *const worker = &pbi->tile_workers[n];
@@ -2293,6 +2293,11 @@ static INLINE void init_mt(VP9Decoder *pbi) {
winterface->init(worker);
if (n < num_threads - 1 && !winterface->reset(worker)) {
+ do {
+ winterface->end(&pbi->tile_workers[pbi->num_tile_workers - 1]);
+ } while (--pbi->num_tile_workers != 0);
+ vpx_free(pbi->tile_workers);
+ pbi->tile_workers = NULL;
vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
"Tile decoder thread creation failed");
}
@@ -2824,7 +2829,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
const int num_jobs = sb_rows << cm->log2_tile_cols;
if (pbi->row_mt_worker_data == NULL) {
- CHECK_MEM_ERROR(cm, pbi->row_mt_worker_data,
+ CHECK_MEM_ERROR(&cm->error, pbi->row_mt_worker_data,
vpx_calloc(1, sizeof(*pbi->row_mt_worker_data)));
#if CONFIG_MULTITHREAD
pthread_mutex_init(&pbi->row_mt_worker_data->recon_done_mutex, NULL);
@@ -3006,7 +3011,8 @@ void vp9_decode_frame(VP9Decoder *pbi, const uint8_t *data,
// platforms without DECLARE_ALIGNED().
assert((sizeof(*pbi->tile_worker_data) % 16) == 0);
vpx_free(pbi->tile_worker_data);
- CHECK_MEM_ERROR(cm, pbi->tile_worker_data, vpx_memalign(32, twd_size));
+ CHECK_MEM_ERROR(&cm->error, pbi->tile_worker_data,
+ vpx_memalign(32, twd_size));
pbi->total_tiles = tile_rows * tile_cols;
}
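
Note: in the init_mt() hunk above, the added lines tear down the tile workers created so far before vpx_internal_error() is invoked; that call does not return (it longjmps back to the caller that set up cm->error), so anything not released here would leak. A small self-contained sketch of the same pattern, using stand-in types rather than the real VPxWorker/winterface:

#include <stdlib.h>

typedef struct { int running; } FakeWorker;          /* stand-in for VPxWorker */
static int fake_reset(FakeWorker *worker) { worker->running = 1; return 1; }
static void fake_end(FakeWorker *worker) { worker->running = 0; }

/* Mirrors the cleanup added before the "Tile decoder thread creation failed"
 * error: undo every worker created so far, then release the array, so the
 * non-returning error path does not leak a partially built pool. */
static int create_workers(FakeWorker **out, int *num_created, int num_threads) {
  FakeWorker *workers = malloc(num_threads * sizeof(*workers));
  int n;
  *out = NULL;
  *num_created = 0;
  if (!workers) return 0;
  for (n = 0; n < num_threads; ++n) {
    if (!fake_reset(&workers[n])) {            /* always succeeds in this sketch */
      while (*num_created > 0) fake_end(&workers[--*num_created]);
      free(workers);
      return 0;
    }
    ++*num_created;
  }
  *out = workers;
  return 1;
}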
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index db3e74663..0989cde58 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -204,7 +204,7 @@ static void read_intra_frame_mode_info(VP9_COMMON *const cm,
mi->skip = read_skip(cm, xd, mi->segment_id, r);
mi->tx_size = read_tx_size(cm, xd, 1, r);
mi->ref_frame[0] = INTRA_FRAME;
- mi->ref_frame[1] = NONE;
+ mi->ref_frame[1] = NO_REF_FRAME;
switch (bsize) {
case BLOCK_4X4:
@@ -299,7 +299,7 @@ static REFERENCE_MODE read_block_reference_mode(VP9_COMMON *cm,
}
}
-// Read the referncence frame
+// Read the reference frame
static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
vpx_reader *r, int segment_id,
MV_REFERENCE_FRAME ref_frame[2]) {
@@ -309,7 +309,7 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
ref_frame[0] = (MV_REFERENCE_FRAME)get_segdata(&cm->seg, segment_id,
SEG_LVL_REF_FRAME);
- ref_frame[1] = NONE;
+ ref_frame[1] = NO_REF_FRAME;
} else {
const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r);
// FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding
@@ -333,7 +333,7 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
ref_frame[0] = LAST_FRAME;
}
- ref_frame[1] = NONE;
+ ref_frame[1] = NO_REF_FRAME;
} else {
assert(0 && "Invalid prediction mode.");
}
@@ -383,7 +383,7 @@ static void read_intra_block_mode_info(VP9_COMMON *const cm,
mi->interp_filter = SWITCHABLE_FILTERS;
mi->ref_frame[0] = INTRA_FRAME;
- mi->ref_frame[1] = NONE;
+ mi->ref_frame[1] = NO_REF_FRAME;
}
static INLINE int is_mv_valid(const MV *mv) {
@@ -708,7 +708,7 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi,
mi->mode = ZEROMV;
if (bsize < BLOCK_8X8) {
vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
- "Invalid usage of segement feature on small blocks");
+ "Invalid usage of segment feature on small blocks");
return;
}
} else {
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 7db8ed72d..5a7e9f9ab 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -66,7 +66,7 @@ void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data,
{
int i;
CHECK_MEM_ERROR(
- cm, row_mt_worker_data->recon_sync_mutex,
+ &cm->error, row_mt_worker_data->recon_sync_mutex,
vpx_malloc(sizeof(*row_mt_worker_data->recon_sync_mutex) * num_jobs));
if (row_mt_worker_data->recon_sync_mutex) {
for (i = 0; i < num_jobs; ++i) {
@@ -75,7 +75,7 @@ void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data,
}
CHECK_MEM_ERROR(
- cm, row_mt_worker_data->recon_sync_cond,
+ &cm->error, row_mt_worker_data->recon_sync_cond,
vpx_malloc(sizeof(*row_mt_worker_data->recon_sync_cond) * num_jobs));
if (row_mt_worker_data->recon_sync_cond) {
for (i = 0; i < num_jobs; ++i) {
@@ -86,24 +86,24 @@ void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data,
#endif
row_mt_worker_data->num_sbs = num_sbs;
for (plane = 0; plane < 3; ++plane) {
- CHECK_MEM_ERROR(cm, row_mt_worker_data->dqcoeff[plane],
- vpx_memalign(16, dqcoeff_size));
+ CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->dqcoeff[plane],
+ vpx_memalign(32, dqcoeff_size));
memset(row_mt_worker_data->dqcoeff[plane], 0, dqcoeff_size);
- CHECK_MEM_ERROR(cm, row_mt_worker_data->eob[plane],
+ CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->eob[plane],
vpx_calloc(num_sbs << EOBS_PER_SB_LOG2,
sizeof(*row_mt_worker_data->eob[plane])));
}
- CHECK_MEM_ERROR(cm, row_mt_worker_data->partition,
+ CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->partition,
vpx_calloc(num_sbs * PARTITIONS_PER_SB,
sizeof(*row_mt_worker_data->partition)));
- CHECK_MEM_ERROR(cm, row_mt_worker_data->recon_map,
+ CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->recon_map,
vpx_calloc(num_sbs, sizeof(*row_mt_worker_data->recon_map)));
// allocate memory for thread_data
if (row_mt_worker_data->thread_data == NULL) {
const size_t thread_size =
max_threads * sizeof(*row_mt_worker_data->thread_data);
- CHECK_MEM_ERROR(cm, row_mt_worker_data->thread_data,
+ CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->thread_data,
vpx_memalign(32, thread_size));
}
}
@@ -181,9 +181,10 @@ VP9Decoder *vp9_decoder_create(BufferPool *const pool) {
cm->error.setjmp = 1;
- CHECK_MEM_ERROR(cm, cm->fc, (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
+ CHECK_MEM_ERROR(&cm->error, cm->fc,
+ (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
CHECK_MEM_ERROR(
- cm, cm->frame_contexts,
+ &cm->error, cm->frame_contexts,
(FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts)));
pbi->need_resync = 1;
diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index b0ef83c73..2e198d552 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -54,7 +54,7 @@ typedef struct TileWorkerData {
VP9LfSync *lf_sync;
DECLARE_ALIGNED(16, MACROBLOCKD, xd);
/* dqcoeff are shared by all the planes. So planes must be decoded serially */
- DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
+ DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]);
DECLARE_ALIGNED(16, uint16_t, extend_and_predict_buf[80 * 2 * 80 * 2]);
struct vpx_internal_error_info error_info;
} TileWorkerData;
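
Note: widening dqcoeff from 16- to 32-byte alignment lines up with the asserts added to vp9_idct16x16_add/vp9_idct32x32_add earlier in this diff, which require a 32-byte-aligned coefficient buffer. A minimal sketch of the invariant, with tran_low_t stubbed for illustration:

#include <assert.h>
#include <stdint.h>

typedef int32_t tran_low_t;  /* assumed CONFIG_VP9_HIGHBITDEPTH layout */

/* The idct16x16/idct32x32 paths now assert this before reading coefficients,
 * so every buffer feeding them, including TileWorkerData::dqcoeff above,
 * must honor 32-byte alignment. */
static void check_coeff_alignment(const tran_low_t *input) {
  assert(((intptr_t)input) % 32 == 0);
}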
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 3ed1bd6ff..d957dc34e 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -272,9 +272,8 @@ static void get_ctx_shift(MACROBLOCKD *xd, int *ctx_shift_a, int *ctx_shift_l,
}
}
-int vp9_decode_block_tokens(TileWorkerData *twd, int plane,
- const scan_order *sc, int x, int y, TX_SIZE tx_size,
- int seg_id) {
+int vp9_decode_block_tokens(TileWorkerData *twd, int plane, const ScanOrder *sc,
+ int x, int y, TX_SIZE tx_size, int seg_id) {
vpx_reader *r = &twd->bit_reader;
MACROBLOCKD *xd = &twd->xd;
struct macroblockd_plane *const pd = &xd->plane[plane];
diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h
index a32052fff..a8e47021b 100644
--- a/vp9/decoder/vp9_detokenize.h
+++ b/vp9/decoder/vp9_detokenize.h
@@ -19,9 +19,8 @@
extern "C" {
#endif
-int vp9_decode_block_tokens(TileWorkerData *twd, int plane,
- const scan_order *sc, int x, int y, TX_SIZE tx_size,
- int seg_id);
+int vp9_decode_block_tokens(TileWorkerData *twd, int plane, const ScanOrder *sc,
+ int x, int y, TX_SIZE tx_size, int seg_id);
#ifdef __cplusplus
} // extern "C"
diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c
index 5961be5f3..997b5477e 100644
--- a/vp9/encoder/arm/neon/vp9_dct_neon.c
+++ b/vp9/encoder/arm/neon/vp9_dct_neon.c
@@ -20,6 +20,7 @@
#include "vpx_dsp/arm/fdct_neon.h"
#include "vpx_dsp/arm/fdct4x4_neon.h"
#include "vpx_dsp/arm/fdct8x8_neon.h"
+#include "vpx_dsp/arm/fdct16x16_neon.h"
static INLINE void load_buffer_4x4(const int16_t *input, int16x8_t *in,
int stride) {
@@ -1228,4 +1229,945 @@ void vp9_highbd_fht4x4_neon(const int16_t *input, tran_low_t *output,
}
}
+static INLINE void highbd_load_buffer_8x8(const int16_t *input,
+ int32x4_t *lo /*[8]*/,
+ int32x4_t *hi /*[8]*/, int stride) {
+ int16x8_t in[8];
+ in[0] = vld1q_s16(input + 0 * stride);
+ in[1] = vld1q_s16(input + 1 * stride);
+ in[2] = vld1q_s16(input + 2 * stride);
+ in[3] = vld1q_s16(input + 3 * stride);
+ in[4] = vld1q_s16(input + 4 * stride);
+ in[5] = vld1q_s16(input + 5 * stride);
+ in[6] = vld1q_s16(input + 6 * stride);
+ in[7] = vld1q_s16(input + 7 * stride);
+ lo[0] = vshll_n_s16(vget_low_s16(in[0]), 2);
+ hi[0] = vshll_n_s16(vget_high_s16(in[0]), 2);
+ lo[1] = vshll_n_s16(vget_low_s16(in[1]), 2);
+ hi[1] = vshll_n_s16(vget_high_s16(in[1]), 2);
+ lo[2] = vshll_n_s16(vget_low_s16(in[2]), 2);
+ hi[2] = vshll_n_s16(vget_high_s16(in[2]), 2);
+ lo[3] = vshll_n_s16(vget_low_s16(in[3]), 2);
+ hi[3] = vshll_n_s16(vget_high_s16(in[3]), 2);
+ lo[4] = vshll_n_s16(vget_low_s16(in[4]), 2);
+ hi[4] = vshll_n_s16(vget_high_s16(in[4]), 2);
+ lo[5] = vshll_n_s16(vget_low_s16(in[5]), 2);
+ hi[5] = vshll_n_s16(vget_high_s16(in[5]), 2);
+ lo[6] = vshll_n_s16(vget_low_s16(in[6]), 2);
+ hi[6] = vshll_n_s16(vget_high_s16(in[6]), 2);
+ lo[7] = vshll_n_s16(vget_low_s16(in[7]), 2);
+ hi[7] = vshll_n_s16(vget_high_s16(in[7]), 2);
+}
+
+/* right shift and rounding
+ * first get the sign bit (bit 31 of each 32-bit lane).
+ * If bit == 1, it's the simple case of shifting right by one bit.
+ * If bit == 2, it essentially computes the expression:
+ *
+ * out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+ *
+ * for each row.
+ */
+static INLINE void highbd_right_shift_8x8(int32x4_t *lo, int32x4_t *hi,
+ const int bit) {
+ int32x4_t sign_lo[8], sign_hi[8];
+ sign_lo[0] = vshrq_n_s32(lo[0], 31);
+ sign_hi[0] = vshrq_n_s32(hi[0], 31);
+ sign_lo[1] = vshrq_n_s32(lo[1], 31);
+ sign_hi[1] = vshrq_n_s32(hi[1], 31);
+ sign_lo[2] = vshrq_n_s32(lo[2], 31);
+ sign_hi[2] = vshrq_n_s32(hi[2], 31);
+ sign_lo[3] = vshrq_n_s32(lo[3], 31);
+ sign_hi[3] = vshrq_n_s32(hi[3], 31);
+ sign_lo[4] = vshrq_n_s32(lo[4], 31);
+ sign_hi[4] = vshrq_n_s32(hi[4], 31);
+ sign_lo[5] = vshrq_n_s32(lo[5], 31);
+ sign_hi[5] = vshrq_n_s32(hi[5], 31);
+ sign_lo[6] = vshrq_n_s32(lo[6], 31);
+ sign_hi[6] = vshrq_n_s32(hi[6], 31);
+ sign_lo[7] = vshrq_n_s32(lo[7], 31);
+ sign_hi[7] = vshrq_n_s32(hi[7], 31);
+
+ if (bit == 2) {
+ const int32x4_t const_rounding = vdupq_n_s32(1);
+ lo[0] = vaddq_s32(lo[0], const_rounding);
+ hi[0] = vaddq_s32(hi[0], const_rounding);
+ lo[1] = vaddq_s32(lo[1], const_rounding);
+ hi[1] = vaddq_s32(hi[1], const_rounding);
+ lo[2] = vaddq_s32(lo[2], const_rounding);
+ hi[2] = vaddq_s32(hi[2], const_rounding);
+ lo[3] = vaddq_s32(lo[3], const_rounding);
+ hi[3] = vaddq_s32(hi[3], const_rounding);
+ lo[4] = vaddq_s32(lo[4], const_rounding);
+ hi[4] = vaddq_s32(hi[4], const_rounding);
+ lo[5] = vaddq_s32(lo[5], const_rounding);
+ hi[5] = vaddq_s32(hi[5], const_rounding);
+ lo[6] = vaddq_s32(lo[6], const_rounding);
+ hi[6] = vaddq_s32(hi[6], const_rounding);
+ lo[7] = vaddq_s32(lo[7], const_rounding);
+ hi[7] = vaddq_s32(hi[7], const_rounding);
+ }
+
+ lo[0] = vsubq_s32(lo[0], sign_lo[0]);
+ hi[0] = vsubq_s32(hi[0], sign_hi[0]);
+ lo[1] = vsubq_s32(lo[1], sign_lo[1]);
+ hi[1] = vsubq_s32(hi[1], sign_hi[1]);
+ lo[2] = vsubq_s32(lo[2], sign_lo[2]);
+ hi[2] = vsubq_s32(hi[2], sign_hi[2]);
+ lo[3] = vsubq_s32(lo[3], sign_lo[3]);
+ hi[3] = vsubq_s32(hi[3], sign_hi[3]);
+ lo[4] = vsubq_s32(lo[4], sign_lo[4]);
+ hi[4] = vsubq_s32(hi[4], sign_hi[4]);
+ lo[5] = vsubq_s32(lo[5], sign_lo[5]);
+ hi[5] = vsubq_s32(hi[5], sign_hi[5]);
+ lo[6] = vsubq_s32(lo[6], sign_lo[6]);
+ hi[6] = vsubq_s32(hi[6], sign_hi[6]);
+ lo[7] = vsubq_s32(lo[7], sign_lo[7]);
+ hi[7] = vsubq_s32(hi[7], sign_hi[7]);
+
+ if (bit == 1) {
+ lo[0] = vshrq_n_s32(lo[0], 1);
+ hi[0] = vshrq_n_s32(hi[0], 1);
+ lo[1] = vshrq_n_s32(lo[1], 1);
+ hi[1] = vshrq_n_s32(hi[1], 1);
+ lo[2] = vshrq_n_s32(lo[2], 1);
+ hi[2] = vshrq_n_s32(hi[2], 1);
+ lo[3] = vshrq_n_s32(lo[3], 1);
+ hi[3] = vshrq_n_s32(hi[3], 1);
+ lo[4] = vshrq_n_s32(lo[4], 1);
+ hi[4] = vshrq_n_s32(hi[4], 1);
+ lo[5] = vshrq_n_s32(lo[5], 1);
+ hi[5] = vshrq_n_s32(hi[5], 1);
+ lo[6] = vshrq_n_s32(lo[6], 1);
+ hi[6] = vshrq_n_s32(hi[6], 1);
+ lo[7] = vshrq_n_s32(lo[7], 1);
+ hi[7] = vshrq_n_s32(hi[7], 1);
+ } else {
+ lo[0] = vshrq_n_s32(lo[0], 2);
+ hi[0] = vshrq_n_s32(hi[0], 2);
+ lo[1] = vshrq_n_s32(lo[1], 2);
+ hi[1] = vshrq_n_s32(hi[1], 2);
+ lo[2] = vshrq_n_s32(lo[2], 2);
+ hi[2] = vshrq_n_s32(hi[2], 2);
+ lo[3] = vshrq_n_s32(lo[3], 2);
+ hi[3] = vshrq_n_s32(hi[3], 2);
+ lo[4] = vshrq_n_s32(lo[4], 2);
+ hi[4] = vshrq_n_s32(hi[4], 2);
+ lo[5] = vshrq_n_s32(lo[5], 2);
+ hi[5] = vshrq_n_s32(hi[5], 2);
+ lo[6] = vshrq_n_s32(lo[6], 2);
+ hi[6] = vshrq_n_s32(hi[6], 2);
+ lo[7] = vshrq_n_s32(lo[7], 2);
+ hi[7] = vshrq_n_s32(hi[7], 2);
+ }
+}
+
+static INLINE void highbd_write_buffer_8x8(tran_low_t *output, int32x4_t *lo,
+ int32x4_t *hi, int stride) {
+ vst1q_s32(output + 0 * stride, lo[0]);
+ vst1q_s32(output + 0 * stride + 4, hi[0]);
+ vst1q_s32(output + 1 * stride, lo[1]);
+ vst1q_s32(output + 1 * stride + 4, hi[1]);
+ vst1q_s32(output + 2 * stride, lo[2]);
+ vst1q_s32(output + 2 * stride + 4, hi[2]);
+ vst1q_s32(output + 3 * stride, lo[3]);
+ vst1q_s32(output + 3 * stride + 4, hi[3]);
+ vst1q_s32(output + 4 * stride, lo[4]);
+ vst1q_s32(output + 4 * stride + 4, hi[4]);
+ vst1q_s32(output + 5 * stride, lo[5]);
+ vst1q_s32(output + 5 * stride + 4, hi[5]);
+ vst1q_s32(output + 6 * stride, lo[6]);
+ vst1q_s32(output + 6 * stride + 4, hi[6]);
+ vst1q_s32(output + 7 * stride, lo[7]);
+ vst1q_s32(output + 7 * stride + 4, hi[7]);
+}
+
+static INLINE void highbd_fadst8x8_neon(int32x4_t *lo /*[8]*/,
+ int32x4_t *hi /*[8]*/) {
+ int32x4_t s_lo[8], s_hi[8];
+ int32x4_t t_lo[8], t_hi[8];
+ int32x4_t x_lo[8], x_hi[8];
+ int64x2_t s64_lo[16], s64_hi[16];
+
+ x_lo[0] = lo[7];
+ x_hi[0] = hi[7];
+ x_lo[1] = lo[0];
+ x_hi[1] = hi[0];
+ x_lo[2] = lo[5];
+ x_hi[2] = hi[5];
+ x_lo[3] = lo[2];
+ x_hi[3] = hi[2];
+ x_lo[4] = lo[3];
+ x_hi[4] = hi[3];
+ x_lo[5] = lo[4];
+ x_hi[5] = hi[4];
+ x_lo[6] = lo[1];
+ x_hi[6] = hi[1];
+ x_lo[7] = lo[6];
+ x_hi[7] = hi[6];
+
+ // stage 1
+ // s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
+ // s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
+ butterfly_two_coeff_s32_s64_noround(
+ x_lo[0], x_hi[0], x_lo[1], x_hi[1], cospi_2_64, cospi_30_64,
+ &s64_lo[2 * 0], &s64_hi[2 * 0], &s64_lo[2 * 1], &s64_hi[2 * 1]);
+ // s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+ // s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+ butterfly_two_coeff_s32_s64_noround(
+ x_lo[2], x_hi[2], x_lo[3], x_hi[3], cospi_10_64, cospi_22_64,
+ &s64_lo[2 * 2], &s64_hi[2 * 2], &s64_lo[2 * 3], &s64_hi[2 * 3]);
+
+ // s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+ // s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+ butterfly_two_coeff_s32_s64_noround(
+ x_lo[4], x_hi[4], x_lo[5], x_hi[5], cospi_18_64, cospi_14_64,
+ &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]);
+
+ // s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
+ // s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+ butterfly_two_coeff_s32_s64_noround(
+ x_lo[6], x_hi[6], x_lo[7], x_hi[7], cospi_26_64, cospi_6_64,
+ &s64_lo[2 * 6], &s64_hi[2 * 6], &s64_lo[2 * 7], &s64_hi[2 * 7]);
+
+ // fdct_round_shift, indices are doubled
+ t_lo[0] = add_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 4]);
+ t_hi[0] = add_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 4]);
+ t_lo[1] = add_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 5]);
+ t_hi[1] = add_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 5]);
+ t_lo[2] = add_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 6]);
+ t_hi[2] = add_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 6]);
+ t_lo[3] = add_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 7]);
+ t_hi[3] = add_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 7]);
+ t_lo[4] = sub_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 4]);
+ t_hi[4] = sub_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 4]);
+ t_lo[5] = sub_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 5]);
+ t_hi[5] = sub_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 5]);
+ t_lo[6] = sub_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 6]);
+ t_hi[6] = sub_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 6]);
+ t_lo[7] = sub_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 7]);
+ t_hi[7] = sub_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 7]);
+
+ // stage 2
+ s_lo[0] = t_lo[0];
+ s_hi[0] = t_hi[0];
+ s_lo[1] = t_lo[1];
+ s_hi[1] = t_hi[1];
+ s_lo[2] = t_lo[2];
+ s_hi[2] = t_hi[2];
+ s_lo[3] = t_lo[3];
+ s_hi[3] = t_hi[3];
+ // s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+ // s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+ butterfly_two_coeff_s32_s64_noround(
+ t_lo[4], t_hi[4], t_lo[5], t_hi[5], cospi_8_64, cospi_24_64,
+ &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]);
+
+ // s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+ // s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+ butterfly_two_coeff_s32_s64_noround(
+ t_lo[6], t_hi[6], t_lo[7], t_hi[7], -cospi_24_64, cospi_8_64,
+ &s64_lo[2 * 6], &s64_hi[2 * 6], &s64_lo[2 * 7], &s64_hi[2 * 7]);
+
+ // fdct_round_shift
+ // s0 + s2
+ t_lo[0] = add_s32_s64_narrow(s_lo[0], s_lo[2]);
+ t_hi[0] = add_s32_s64_narrow(s_hi[0], s_hi[2]);
+ // s0 - s2
+ t_lo[2] = sub_s32_s64_narrow(s_lo[0], s_lo[2]);
+ t_hi[2] = sub_s32_s64_narrow(s_hi[0], s_hi[2]);
+
+ // s1 + s3
+ t_lo[1] = add_s32_s64_narrow(s_lo[1], s_lo[3]);
+ t_hi[1] = add_s32_s64_narrow(s_hi[1], s_hi[3]);
+ // s1 - s3
+ t_lo[3] = sub_s32_s64_narrow(s_lo[1], s_lo[3]);
+ t_hi[3] = sub_s32_s64_narrow(s_hi[1], s_hi[3]);
+
+ // s4 + s6
+ t_lo[4] = add_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]);
+ t_hi[4] = add_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]);
+ // s4 - s6
+ t_lo[6] = sub_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]);
+ t_hi[6] = sub_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]);
+
+ // s5 + s7
+ t_lo[5] = add_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]);
+ t_hi[5] = add_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]);
+ // s5 - s7
+ t_lo[7] = sub_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]);
+ t_hi[7] = sub_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]);
+
+ // stage 3
+ // s2 = cospi_16_64 * (x2 + x3)
+ // s3 = cospi_16_64 * (x2 - x3)
+ butterfly_one_coeff_s32_fast(t_lo[2], t_hi[2], t_lo[3], t_hi[3], cospi_16_64,
+ &s_lo[2], &s_hi[2], &s_lo[3], &s_hi[3]);
+
+ // s6 = cospi_16_64 * (x6 + x7)
+ // s7 = cospi_16_64 * (x6 - x7)
+ butterfly_one_coeff_s32_fast(t_lo[6], t_hi[6], t_lo[7], t_hi[7], cospi_16_64,
+ &s_lo[6], &s_hi[6], &s_lo[7], &s_hi[7]);
+
+ // x0, x2, x4, x6 pass through
+ lo[0] = t_lo[0];
+ hi[0] = t_hi[0];
+ lo[2] = s_lo[6];
+ hi[2] = s_hi[6];
+ lo[4] = s_lo[3];
+ hi[4] = s_hi[3];
+ lo[6] = t_lo[5];
+ hi[6] = t_hi[5];
+
+ lo[1] = vnegq_s32(t_lo[4]);
+ hi[1] = vnegq_s32(t_hi[4]);
+ lo[3] = vnegq_s32(s_lo[2]);
+ hi[3] = vnegq_s32(s_hi[2]);
+ lo[5] = vnegq_s32(s_lo[7]);
+ hi[5] = vnegq_s32(s_hi[7]);
+ lo[7] = vnegq_s32(t_lo[1]);
+ hi[7] = vnegq_s32(t_hi[1]);
+
+ transpose_s32_8x8_2(lo, hi, lo, hi);
+}
+
+void vp9_highbd_fht8x8_neon(const int16_t *input, tran_low_t *output,
+ int stride, int tx_type) {
+ int32x4_t lo[8], hi[8];
+
+ switch (tx_type) {
+ case DCT_DCT: vpx_highbd_fdct8x8_neon(input, output, stride); break;
+ case ADST_DCT:
+ highbd_load_buffer_8x8(input, lo, hi, stride);
+ highbd_fadst8x8_neon(lo, hi);
+ // pass1 variant is not precise enough
+ vpx_highbd_fdct8x8_pass2_neon(lo, hi);
+ highbd_right_shift_8x8(lo, hi, 1);
+ highbd_write_buffer_8x8(output, lo, hi, 8);
+ break;
+ case DCT_ADST:
+ highbd_load_buffer_8x8(input, lo, hi, stride);
+ // pass1 variant is not precise enough
+ vpx_highbd_fdct8x8_pass2_neon(lo, hi);
+ highbd_fadst8x8_neon(lo, hi);
+ highbd_right_shift_8x8(lo, hi, 1);
+ highbd_write_buffer_8x8(output, lo, hi, 8);
+ break;
+ default:
+ assert(tx_type == ADST_ADST);
+ highbd_load_buffer_8x8(input, lo, hi, stride);
+ highbd_fadst8x8_neon(lo, hi);
+ highbd_fadst8x8_neon(lo, hi);
+ highbd_right_shift_8x8(lo, hi, 1);
+ highbd_write_buffer_8x8(output, lo, hi, 8);
+ break;
+ }
+}
+
+static INLINE void highbd_load_buffer_16x16(
+ const int16_t *input, int32x4_t *left1 /*[16]*/, int32x4_t *right1 /*[16]*/,
+ int32x4_t *left2 /*[16]*/, int32x4_t *right2 /*[16]*/, int stride) {
+ // load first 8 columns
+ highbd_load_buffer_8x8(input, left1, right1, stride);
+ highbd_load_buffer_8x8(input + 8 * stride, left1 + 8, right1 + 8, stride);
+
+ input += 8;
+ // load second 8 columns
+ highbd_load_buffer_8x8(input, left2, right2, stride);
+ highbd_load_buffer_8x8(input + 8 * stride, left2 + 8, right2 + 8, stride);
+}
+
+static INLINE void highbd_write_buffer_16x16(
+ tran_low_t *output, int32x4_t *left1 /*[16]*/, int32x4_t *right1 /*[16]*/,
+ int32x4_t *left2 /*[16]*/, int32x4_t *right2 /*[16]*/, int stride) {
+ // write first 8 columns
+ highbd_write_buffer_8x8(output, left1, right1, stride);
+ highbd_write_buffer_8x8(output + 8 * stride, left1 + 8, right1 + 8, stride);
+
+ // write second 8 columns
+ output += 8;
+ highbd_write_buffer_8x8(output, left2, right2, stride);
+ highbd_write_buffer_8x8(output + 8 * stride, left2 + 8, right2 + 8, stride);
+}
+
+static INLINE void highbd_right_shift_16x16(int32x4_t *left1 /*[16]*/,
+ int32x4_t *right1 /*[16]*/,
+ int32x4_t *left2 /*[16]*/,
+ int32x4_t *right2 /*[16]*/,
+ const int bit) {
+ // perform rounding operations
+ highbd_right_shift_8x8(left1, right1, bit);
+ highbd_right_shift_8x8(left1 + 8, right1 + 8, bit);
+ highbd_right_shift_8x8(left2, right2, bit);
+ highbd_right_shift_8x8(left2 + 8, right2 + 8, bit);
+}
+
+static void highbd_fdct16_8col(int32x4_t *left, int32x4_t *right) {
+ // perform 16x16 1-D DCT for 8 columns
+ int32x4_t s1_lo[8], s1_hi[8], s2_lo[8], s2_hi[8], s3_lo[8], s3_hi[8];
+ int32x4_t left8[8], right8[8];
+
+ // stage 1
+ left8[0] = vaddq_s32(left[0], left[15]);
+ right8[0] = vaddq_s32(right[0], right[15]);
+ left8[1] = vaddq_s32(left[1], left[14]);
+ right8[1] = vaddq_s32(right[1], right[14]);
+ left8[2] = vaddq_s32(left[2], left[13]);
+ right8[2] = vaddq_s32(right[2], right[13]);
+ left8[3] = vaddq_s32(left[3], left[12]);
+ right8[3] = vaddq_s32(right[3], right[12]);
+ left8[4] = vaddq_s32(left[4], left[11]);
+ right8[4] = vaddq_s32(right[4], right[11]);
+ left8[5] = vaddq_s32(left[5], left[10]);
+ right8[5] = vaddq_s32(right[5], right[10]);
+ left8[6] = vaddq_s32(left[6], left[9]);
+ right8[6] = vaddq_s32(right[6], right[9]);
+ left8[7] = vaddq_s32(left[7], left[8]);
+ right8[7] = vaddq_s32(right[7], right[8]);
+
+ // step 1
+ s1_lo[0] = vsubq_s32(left[7], left[8]);
+ s1_hi[0] = vsubq_s32(right[7], right[8]);
+ s1_lo[1] = vsubq_s32(left[6], left[9]);
+ s1_hi[1] = vsubq_s32(right[6], right[9]);
+ s1_lo[2] = vsubq_s32(left[5], left[10]);
+ s1_hi[2] = vsubq_s32(right[5], right[10]);
+ s1_lo[3] = vsubq_s32(left[4], left[11]);
+ s1_hi[3] = vsubq_s32(right[4], right[11]);
+ s1_lo[4] = vsubq_s32(left[3], left[12]);
+ s1_hi[4] = vsubq_s32(right[3], right[12]);
+ s1_lo[5] = vsubq_s32(left[2], left[13]);
+ s1_hi[5] = vsubq_s32(right[2], right[13]);
+ s1_lo[6] = vsubq_s32(left[1], left[14]);
+ s1_hi[6] = vsubq_s32(right[1], right[14]);
+ s1_lo[7] = vsubq_s32(left[0], left[15]);
+ s1_hi[7] = vsubq_s32(right[0], right[15]);
+
+ // pass1 variant is not accurate enough
+ vpx_highbd_fdct8x8_pass2_notranspose_neon(left8, right8);
+
+ // step 2
+ // step2[2] = (step1[5] - step1[2]) * cospi_16_64;
+ // step2[5] = (step1[5] + step1[2]) * cospi_16_64;
+ butterfly_one_coeff_s32_s64_narrow(s1_lo[5], s1_hi[5], s1_lo[2], s1_hi[2],
+ cospi_16_64, &s2_lo[5], &s2_hi[5],
+ &s2_lo[2], &s2_hi[2]);
+ // step2[3] = (step1[4] - step1[3]) * cospi_16_64;
+ // step2[4] = (step1[4] + step1[3]) * cospi_16_64;
+ butterfly_one_coeff_s32_s64_narrow(s1_lo[4], s1_hi[4], s1_lo[3], s1_hi[3],
+ cospi_16_64, &s2_lo[4], &s2_hi[4],
+ &s2_lo[3], &s2_hi[3]);
+
+ // step 3
+ s3_lo[0] = vaddq_s32(s1_lo[0], s2_lo[3]);
+ s3_hi[0] = vaddq_s32(s1_hi[0], s2_hi[3]);
+ s3_lo[1] = vaddq_s32(s1_lo[1], s2_lo[2]);
+ s3_hi[1] = vaddq_s32(s1_hi[1], s2_hi[2]);
+ s3_lo[2] = vsubq_s32(s1_lo[1], s2_lo[2]);
+ s3_hi[2] = vsubq_s32(s1_hi[1], s2_hi[2]);
+ s3_lo[3] = vsubq_s32(s1_lo[0], s2_lo[3]);
+ s3_hi[3] = vsubq_s32(s1_hi[0], s2_hi[3]);
+ s3_lo[4] = vsubq_s32(s1_lo[7], s2_lo[4]);
+ s3_hi[4] = vsubq_s32(s1_hi[7], s2_hi[4]);
+ s3_lo[5] = vsubq_s32(s1_lo[6], s2_lo[5]);
+ s3_hi[5] = vsubq_s32(s1_hi[6], s2_hi[5]);
+ s3_lo[6] = vaddq_s32(s1_lo[6], s2_lo[5]);
+ s3_hi[6] = vaddq_s32(s1_hi[6], s2_hi[5]);
+ s3_lo[7] = vaddq_s32(s1_lo[7], s2_lo[4]);
+ s3_hi[7] = vaddq_s32(s1_hi[7], s2_hi[4]);
+
+ // step 4
+ // s2[1] = cospi_24_64 * s3[6] - cospi_8_64 * s3[1]
+ // s2[6] = cospi_8_64 * s3[6] + cospi_24_64 * s3[1]
+ butterfly_two_coeff_s32_s64_narrow(s3_lo[6], s3_hi[6], s3_lo[1], s3_hi[1],
+ cospi_8_64, cospi_24_64, &s2_lo[6],
+ &s2_hi[6], &s2_lo[1], &s2_hi[1]);
+
+ // s2[5] = cospi_8_64 * s3[2] - cospi_24_64 * s3[5]
+ // s2[2] = cospi_24_64 * s3[2] + cospi_8_64 * s3[5]
+ butterfly_two_coeff_s32_s64_narrow(s3_lo[2], s3_hi[2], s3_lo[5], s3_hi[5],
+ cospi_24_64, cospi_8_64, &s2_lo[2],
+ &s2_hi[2], &s2_lo[5], &s2_hi[5]);
+
+ // step 5
+ s1_lo[0] = vaddq_s32(s3_lo[0], s2_lo[1]);
+ s1_hi[0] = vaddq_s32(s3_hi[0], s2_hi[1]);
+ s1_lo[1] = vsubq_s32(s3_lo[0], s2_lo[1]);
+ s1_hi[1] = vsubq_s32(s3_hi[0], s2_hi[1]);
+ s1_lo[2] = vaddq_s32(s3_lo[3], s2_lo[2]);
+ s1_hi[2] = vaddq_s32(s3_hi[3], s2_hi[2]);
+ s1_lo[3] = vsubq_s32(s3_lo[3], s2_lo[2]);
+ s1_hi[3] = vsubq_s32(s3_hi[3], s2_hi[2]);
+ s1_lo[4] = vsubq_s32(s3_lo[4], s2_lo[5]);
+ s1_hi[4] = vsubq_s32(s3_hi[4], s2_hi[5]);
+ s1_lo[5] = vaddq_s32(s3_lo[4], s2_lo[5]);
+ s1_hi[5] = vaddq_s32(s3_hi[4], s2_hi[5]);
+ s1_lo[6] = vsubq_s32(s3_lo[7], s2_lo[6]);
+ s1_hi[6] = vsubq_s32(s3_hi[7], s2_hi[6]);
+ s1_lo[7] = vaddq_s32(s3_lo[7], s2_lo[6]);
+ s1_hi[7] = vaddq_s32(s3_hi[7], s2_hi[6]);
+
+ // step 6
+ // out[1] = step1[7] * cospi_2_64 + step1[0] * cospi_30_64
+ // out[15] = step1[7] * cospi_30_64 - step1[0] * cospi_2_64
+ butterfly_two_coeff_s32_s64_narrow(s1_lo[7], s1_hi[7], s1_lo[0], s1_hi[0],
+ cospi_2_64, cospi_30_64, &left[1],
+ &right[1], &left[15], &right[15]);
+
+ // out[9] = step1[6] * cospi_18_64 + step1[1] * cospi_14_64
+ // out[7] = step1[6] * cospi_14_64 - step1[1] * cospi_18_64
+ butterfly_two_coeff_s32_s64_narrow(s1_lo[6], s1_hi[6], s1_lo[1], s1_hi[1],
+ cospi_18_64, cospi_14_64, &left[9],
+ &right[9], &left[7], &right[7]);
+
+ // out[5] = step1[5] * cospi_10_64 + step1[2] * cospi_22_64
+ // out[11] = step1[5] * cospi_22_64 - step1[2] * cospi_10_64
+ butterfly_two_coeff_s32_s64_narrow(s1_lo[5], s1_hi[5], s1_lo[2], s1_hi[2],
+ cospi_10_64, cospi_22_64, &left[5],
+ &right[5], &left[11], &right[11]);
+
+ // out[13] = step1[4] * cospi_26_64 + step1[3] * cospi_6_64
+ // out[3] = step1[4] * cospi_6_64 - step1[3] * cospi_26_64
+ butterfly_two_coeff_s32_s64_narrow(s1_lo[4], s1_hi[4], s1_lo[3], s1_hi[3],
+ cospi_26_64, cospi_6_64, &left[13],
+ &right[13], &left[3], &right[3]);
+
+ left[0] = left8[0];
+ right[0] = right8[0];
+ left[2] = left8[1];
+ right[2] = right8[1];
+ left[4] = left8[2];
+ right[4] = right8[2];
+ left[6] = left8[3];
+ right[6] = right8[3];
+ left[8] = left8[4];
+ right[8] = right8[4];
+ left[10] = left8[5];
+ right[10] = right8[5];
+ left[12] = left8[6];
+ right[12] = right8[6];
+ left[14] = left8[7];
+ right[14] = right8[7];
+}
+
+static void highbd_fadst16_8col(int32x4_t *left, int32x4_t *right) {
+ // perform 16x16 1-D ADST for 8 columns
+ int32x4_t x_lo[16], x_hi[16];
+ int32x4_t s_lo[16], s_hi[16];
+ int32x4_t t_lo[16], t_hi[16];
+ int64x2_t s64_lo[32], s64_hi[32];
+
+ x_lo[0] = left[15];
+ x_hi[0] = right[15];
+ x_lo[1] = left[0];
+ x_hi[1] = right[0];
+ x_lo[2] = left[13];
+ x_hi[2] = right[13];
+ x_lo[3] = left[2];
+ x_hi[3] = right[2];
+ x_lo[4] = left[11];
+ x_hi[4] = right[11];
+ x_lo[5] = left[4];
+ x_hi[5] = right[4];
+ x_lo[6] = left[9];
+ x_hi[6] = right[9];
+ x_lo[7] = left[6];
+ x_hi[7] = right[6];
+ x_lo[8] = left[7];
+ x_hi[8] = right[7];
+ x_lo[9] = left[8];
+ x_hi[9] = right[8];
+ x_lo[10] = left[5];
+ x_hi[10] = right[5];
+ x_lo[11] = left[10];
+ x_hi[11] = right[10];
+ x_lo[12] = left[3];
+ x_hi[12] = right[3];
+ x_lo[13] = left[12];
+ x_hi[13] = right[12];
+ x_lo[14] = left[1];
+ x_hi[14] = right[1];
+ x_lo[15] = left[14];
+ x_hi[15] = right[14];
+
+ // stage 1, indices are doubled
+ // s0 = cospi_1_64 * x0 + cospi_31_64 * x1;
+ // s1 = cospi_31_64 * x0 - cospi_1_64 * x1;
+ butterfly_two_coeff_s32_s64_noround(
+ x_lo[0], x_hi[0], x_lo[1], x_hi[1], cospi_1_64, cospi_31_64,
+ &s64_lo[2 * 0], &s64_hi[2 * 0], &s64_lo[2 * 1], &s64_hi[2 * 1]);
+ // s2 = cospi_5_64 * x2 + cospi_27_64 * x3;
+ // s3 = cospi_27_64 * x2 - cospi_5_64 * x3;
+ butterfly_two_coeff_s32_s64_noround(
+ x_lo[2], x_hi[2], x_lo[3], x_hi[3], cospi_5_64, cospi_27_64,
+ &s64_lo[2 * 2], &s64_hi[2 * 2], &s64_lo[2 * 3], &s64_hi[2 * 3]);
+ // s4 = cospi_9_64 * x4 + cospi_23_64 * x5;
+ // s5 = cospi_23_64 * x4 - cospi_9_64 * x5;
+ butterfly_two_coeff_s32_s64_noround(
+ x_lo[4], x_hi[4], x_lo[5], x_hi[5], cospi_9_64, cospi_23_64,
+ &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]);
+ // s6 = cospi_13_64 * x6 + cospi_19_64 * x7;
+ // s7 = cospi_19_64 * x6 - cospi_13_64 * x7;
+ butterfly_two_coeff_s32_s64_noround(
+ x_lo[6], x_hi[6], x_lo[7], x_hi[7], cospi_13_64, cospi_19_64,
+ &s64_lo[2 * 6], &s64_hi[2 * 6], &s64_lo[2 * 7], &s64_hi[2 * 7]);
+ // s8 = cospi_17_64 * x8 + cospi_15_64 * x9;
+ // s9 = cospi_15_64 * x8 - cospi_17_64 * x9;
+ butterfly_two_coeff_s32_s64_noround(
+ x_lo[8], x_hi[8], x_lo[9], x_hi[9], cospi_17_64, cospi_15_64,
+ &s64_lo[2 * 8], &s64_hi[2 * 8], &s64_lo[2 * 9], &s64_hi[2 * 9]);
+ // s10 = cospi_21_64 * x10 + cospi_11_64 * x11;
+ // s11 = cospi_11_64 * x10 - cospi_21_64 * x11;
+ butterfly_two_coeff_s32_s64_noround(
+ x_lo[10], x_hi[10], x_lo[11], x_hi[11], cospi_21_64, cospi_11_64,
+ &s64_lo[2 * 10], &s64_hi[2 * 10], &s64_lo[2 * 11], &s64_hi[2 * 11]);
+ // s12 = cospi_25_64 * x12 + cospi_7_64 * x13;
+ // s13 = cospi_7_64 * x12 - cospi_25_64 * x13;
+ butterfly_two_coeff_s32_s64_noround(
+ x_lo[12], x_hi[12], x_lo[13], x_hi[13], cospi_25_64, cospi_7_64,
+ &s64_lo[2 * 12], &s64_hi[2 * 12], &s64_lo[2 * 13], &s64_hi[2 * 13]);
+ // s14 = cospi_29_64 * x14 + cospi_3_64 * x15;
+ // s15 = cospi_3_64 * x14 - cospi_29_64 * x15;
+ butterfly_two_coeff_s32_s64_noround(
+ x_lo[14], x_hi[14], x_lo[15], x_hi[15], cospi_29_64, cospi_3_64,
+ &s64_lo[2 * 14], &s64_hi[2 * 14], &s64_lo[2 * 15], &s64_hi[2 * 15]);
+
+ // fdct_round_shift, indices are doubled
+ t_lo[0] = add_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 8]);
+ t_hi[0] = add_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 8]);
+ t_lo[1] = add_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 9]);
+ t_hi[1] = add_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 9]);
+ t_lo[2] = add_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 10]);
+ t_hi[2] = add_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 10]);
+ t_lo[3] = add_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 11]);
+ t_hi[3] = add_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 11]);
+ t_lo[4] = add_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 12]);
+ t_hi[4] = add_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 12]);
+ t_lo[5] = add_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 13]);
+ t_hi[5] = add_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 13]);
+ t_lo[6] = add_s64_round_narrow(&s64_lo[2 * 6], &s64_lo[2 * 14]);
+ t_hi[6] = add_s64_round_narrow(&s64_hi[2 * 6], &s64_hi[2 * 14]);
+ t_lo[7] = add_s64_round_narrow(&s64_lo[2 * 7], &s64_lo[2 * 15]);
+ t_hi[7] = add_s64_round_narrow(&s64_hi[2 * 7], &s64_hi[2 * 15]);
+ t_lo[8] = sub_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 8]);
+ t_hi[8] = sub_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 8]);
+ t_lo[9] = sub_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 9]);
+ t_hi[9] = sub_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 9]);
+ t_lo[10] = sub_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 10]);
+ t_hi[10] = sub_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 10]);
+ t_lo[11] = sub_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 11]);
+ t_hi[11] = sub_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 11]);
+ t_lo[12] = sub_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 12]);
+ t_hi[12] = sub_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 12]);
+ t_lo[13] = sub_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 13]);
+ t_hi[13] = sub_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 13]);
+ t_lo[14] = sub_s64_round_narrow(&s64_lo[2 * 6], &s64_lo[2 * 14]);
+ t_hi[14] = sub_s64_round_narrow(&s64_hi[2 * 6], &s64_hi[2 * 14]);
+ t_lo[15] = sub_s64_round_narrow(&s64_lo[2 * 7], &s64_lo[2 * 15]);
+ t_hi[15] = sub_s64_round_narrow(&s64_hi[2 * 7], &s64_hi[2 * 15]);
+
+ // stage 2
+ s_lo[0] = t_lo[0];
+ s_hi[0] = t_hi[0];
+ s_lo[1] = t_lo[1];
+ s_hi[1] = t_hi[1];
+ s_lo[2] = t_lo[2];
+ s_hi[2] = t_hi[2];
+ s_lo[3] = t_lo[3];
+ s_hi[3] = t_hi[3];
+ s_lo[4] = t_lo[4];
+ s_hi[4] = t_hi[4];
+ s_lo[5] = t_lo[5];
+ s_hi[5] = t_hi[5];
+ s_lo[6] = t_lo[6];
+ s_hi[6] = t_hi[6];
+ s_lo[7] = t_lo[7];
+ s_hi[7] = t_hi[7];
+ // s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+ // s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+ butterfly_two_coeff_s32_s64_noround(
+ t_lo[8], t_hi[8], t_lo[9], t_hi[9], cospi_4_64, cospi_28_64,
+ &s64_lo[2 * 8], &s64_hi[2 * 8], &s64_lo[2 * 9], &s64_hi[2 * 9]);
+ // s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+ // s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+ butterfly_two_coeff_s32_s64_noround(
+ t_lo[10], t_hi[10], t_lo[11], t_hi[11], cospi_20_64, cospi_12_64,
+ &s64_lo[2 * 10], &s64_hi[2 * 10], &s64_lo[2 * 11], &s64_hi[2 * 11]);
+ // s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
+ // s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+ butterfly_two_coeff_s32_s64_noround(
+ t_lo[13], t_hi[13], t_lo[12], t_hi[12], cospi_28_64, cospi_4_64,
+ &s64_lo[2 * 13], &s64_hi[2 * 13], &s64_lo[2 * 12], &s64_hi[2 * 12]);
+ // s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
+ // s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+ butterfly_two_coeff_s32_s64_noround(
+ t_lo[15], t_hi[15], t_lo[14], t_hi[14], cospi_12_64, cospi_20_64,
+ &s64_lo[2 * 15], &s64_hi[2 * 15], &s64_lo[2 * 14], &s64_hi[2 * 14]);
+
+ // s0 + s4
+ t_lo[0] = add_s32_s64_narrow(s_lo[0], s_lo[4]);
+ t_hi[0] = add_s32_s64_narrow(s_hi[0], s_hi[4]);
+ // s1 + s5
+ t_lo[1] = add_s32_s64_narrow(s_lo[1], s_lo[5]);
+ t_hi[1] = add_s32_s64_narrow(s_hi[1], s_hi[5]);
+ // s2 + s6
+ t_lo[2] = add_s32_s64_narrow(s_lo[2], s_lo[6]);
+ t_hi[2] = add_s32_s64_narrow(s_hi[2], s_hi[6]);
+ // s3 + s7
+ t_lo[3] = add_s32_s64_narrow(s_lo[3], s_lo[7]);
+ t_hi[3] = add_s32_s64_narrow(s_hi[3], s_hi[7]);
+
+ // s0 - s4
+ t_lo[4] = sub_s32_s64_narrow(s_lo[0], s_lo[4]);
+ t_hi[4] = sub_s32_s64_narrow(s_hi[0], s_hi[4]);
+ // s1 - s5
+ t_lo[5] = sub_s32_s64_narrow(s_lo[1], s_lo[5]);
+ t_hi[5] = sub_s32_s64_narrow(s_hi[1], s_hi[5]);
+ // s2 - s6
+ t_lo[6] = sub_s32_s64_narrow(s_lo[2], s_lo[6]);
+ t_hi[6] = sub_s32_s64_narrow(s_hi[2], s_hi[6]);
+ // s3 - s7
+ t_lo[7] = sub_s32_s64_narrow(s_lo[3], s_lo[7]);
+ t_hi[7] = sub_s32_s64_narrow(s_hi[3], s_hi[7]);
+
+ // fdct_round_shift()
+ // s8 + s12
+ t_lo[8] = add_s64_round_narrow(&s64_lo[2 * 8], &s64_lo[2 * 12]);
+ t_hi[8] = add_s64_round_narrow(&s64_hi[2 * 8], &s64_hi[2 * 12]);
+ // s9 + s13
+ t_lo[9] = add_s64_round_narrow(&s64_lo[2 * 9], &s64_lo[2 * 13]);
+ t_hi[9] = add_s64_round_narrow(&s64_hi[2 * 9], &s64_hi[2 * 13]);
+ // s10 + s14
+ t_lo[10] = add_s64_round_narrow(&s64_lo[2 * 10], &s64_lo[2 * 14]);
+ t_hi[10] = add_s64_round_narrow(&s64_hi[2 * 10], &s64_hi[2 * 14]);
+ // s11 + s15
+ t_lo[11] = add_s64_round_narrow(&s64_lo[2 * 11], &s64_lo[2 * 15]);
+ t_hi[11] = add_s64_round_narrow(&s64_hi[2 * 11], &s64_hi[2 * 15]);
+
+ // s8 - s12
+ t_lo[12] = sub_s64_round_narrow(&s64_lo[2 * 8], &s64_lo[2 * 12]);
+ t_hi[12] = sub_s64_round_narrow(&s64_hi[2 * 8], &s64_hi[2 * 12]);
+ // s9 - s13
+ t_lo[13] = sub_s64_round_narrow(&s64_lo[2 * 9], &s64_lo[2 * 13]);
+ t_hi[13] = sub_s64_round_narrow(&s64_hi[2 * 9], &s64_hi[2 * 13]);
+ // s10 - s14
+ t_lo[14] = sub_s64_round_narrow(&s64_lo[2 * 10], &s64_lo[2 * 14]);
+ t_hi[14] = sub_s64_round_narrow(&s64_hi[2 * 10], &s64_hi[2 * 14]);
+ // s11 - s15
+ t_lo[15] = sub_s64_round_narrow(&s64_lo[2 * 11], &s64_lo[2 * 15]);
+ t_hi[15] = sub_s64_round_narrow(&s64_hi[2 * 11], &s64_hi[2 * 15]);
+
+ // stage 3
+ s_lo[0] = t_lo[0];
+ s_hi[0] = t_hi[0];
+ s_lo[1] = t_lo[1];
+ s_hi[1] = t_hi[1];
+ s_lo[2] = t_lo[2];
+ s_hi[2] = t_hi[2];
+ s_lo[3] = t_lo[3];
+ s_hi[3] = t_hi[3];
+ // s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+ // s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+ butterfly_two_coeff_s32_s64_noround(
+ t_lo[4], t_hi[4], t_lo[5], t_hi[5], cospi_8_64, cospi_24_64,
+ &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]);
+ // s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
+ // s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+ butterfly_two_coeff_s32_s64_noround(
+ t_lo[7], t_hi[7], t_lo[6], t_hi[6], cospi_24_64, cospi_8_64,
+ &s64_lo[2 * 7], &s64_hi[2 * 7], &s64_lo[2 * 6], &s64_hi[2 * 6]);
+ s_lo[8] = t_lo[8];
+ s_hi[8] = t_hi[8];
+ s_lo[9] = t_lo[9];
+ s_hi[9] = t_hi[9];
+ s_lo[10] = t_lo[10];
+ s_hi[10] = t_hi[10];
+ s_lo[11] = t_lo[11];
+ s_hi[11] = t_hi[11];
+ // s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+ // s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+ butterfly_two_coeff_s32_s64_noround(
+ t_lo[12], t_hi[12], t_lo[13], t_hi[13], cospi_8_64, cospi_24_64,
+ &s64_lo[2 * 12], &s64_hi[2 * 12], &s64_lo[2 * 13], &s64_hi[2 * 13]);
+ // s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
+ // s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+ butterfly_two_coeff_s32_s64_noround(
+ t_lo[15], t_hi[15], t_lo[14], t_hi[14], cospi_24_64, cospi_8_64,
+ &s64_lo[2 * 15], &s64_hi[2 * 15], &s64_lo[2 * 14], &s64_hi[2 * 14]);
+
+ // s0 + s2
+ t_lo[0] = add_s32_s64_narrow(s_lo[0], s_lo[2]);
+ t_hi[0] = add_s32_s64_narrow(s_hi[0], s_hi[2]);
+ // s1 + s3
+ t_lo[1] = add_s32_s64_narrow(s_lo[1], s_lo[3]);
+ t_hi[1] = add_s32_s64_narrow(s_hi[1], s_hi[3]);
+ // s0 - s2
+ t_lo[2] = sub_s32_s64_narrow(s_lo[0], s_lo[2]);
+ t_hi[2] = sub_s32_s64_narrow(s_hi[0], s_hi[2]);
+ // s1 - s3
+ t_lo[3] = sub_s32_s64_narrow(s_lo[1], s_lo[3]);
+ t_hi[3] = sub_s32_s64_narrow(s_hi[1], s_hi[3]);
+ // fdct_round_shift()
+ // s4 + s6
+ t_lo[4] = add_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]);
+ t_hi[4] = add_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]);
+ // s5 + s7
+ t_lo[5] = add_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]);
+ t_hi[5] = add_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]);
+ // s4 - s6
+ t_lo[6] = sub_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]);
+ t_hi[6] = sub_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]);
+ // s5 - s7
+ t_lo[7] = sub_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]);
+ t_hi[7] = sub_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]);
+ // s8 + s10
+ t_lo[8] = add_s32_s64_narrow(s_lo[8], s_lo[10]);
+ t_hi[8] = add_s32_s64_narrow(s_hi[8], s_hi[10]);
+ // s9 + s11
+ t_lo[9] = add_s32_s64_narrow(s_lo[9], s_lo[11]);
+ t_hi[9] = add_s32_s64_narrow(s_hi[9], s_hi[11]);
+ // s8 - s10
+ t_lo[10] = sub_s32_s64_narrow(s_lo[8], s_lo[10]);
+ t_hi[10] = sub_s32_s64_narrow(s_hi[8], s_hi[10]);
+ // s9 - s11
+ t_lo[11] = sub_s32_s64_narrow(s_lo[9], s_lo[11]);
+ t_hi[11] = sub_s32_s64_narrow(s_hi[9], s_hi[11]);
+ // fdct_round_shift()
+ // s12 + s14
+ t_lo[12] = add_s64_round_narrow(&s64_lo[2 * 12], &s64_lo[2 * 14]);
+ t_hi[12] = add_s64_round_narrow(&s64_hi[2 * 12], &s64_hi[2 * 14]);
+ // s13 + s15
+ t_lo[13] = add_s64_round_narrow(&s64_lo[2 * 13], &s64_lo[2 * 15]);
+ t_hi[13] = add_s64_round_narrow(&s64_hi[2 * 13], &s64_hi[2 * 15]);
+ // s12 - s14
+ t_lo[14] = sub_s64_round_narrow(&s64_lo[2 * 12], &s64_lo[2 * 14]);
+ t_hi[14] = sub_s64_round_narrow(&s64_hi[2 * 12], &s64_hi[2 * 14]);
+ // s13 - s15
+ t_lo[15] = sub_s64_round_narrow(&s64_lo[2 * 13], &s64_lo[2 * 15]);
+ t_hi[15] = sub_s64_round_narrow(&s64_hi[2 * 13], &s64_hi[2 * 15]);
+
+ // stage 4, with fdct_round_shift
+ // s2 = (-cospi_16_64) * (x2 + x3);
+ // s3 = cospi_16_64 * (x2 - x3);
+ butterfly_one_coeff_s32_s64_narrow(t_lo[3], t_hi[3], t_lo[2], t_hi[2],
+ -cospi_16_64, &x_lo[2], &x_hi[2], &x_lo[3],
+ &x_hi[3]);
+ // s6 = cospi_16_64 * (x6 + x7);
+ // s7 = cospi_16_64 * (-x6 + x7);
+ butterfly_one_coeff_s32_s64_narrow(t_lo[7], t_hi[7], t_lo[6], t_hi[6],
+ cospi_16_64, &x_lo[6], &x_hi[6], &x_lo[7],
+ &x_hi[7]);
+ // s10 = cospi_16_64 * (x10 + x11);
+ // s11 = cospi_16_64 * (-x10 + x11);
+ butterfly_one_coeff_s32_s64_narrow(t_lo[11], t_hi[11], t_lo[10], t_hi[10],
+ cospi_16_64, &x_lo[10], &x_hi[10],
+ &x_lo[11], &x_hi[11]);
+ // s14 = (-cospi_16_64) * (x14 + x15);
+ // s15 = cospi_16_64 * (x14 - x15);
+ butterfly_one_coeff_s32_s64_narrow(t_lo[15], t_hi[15], t_lo[14], t_hi[14],
+ -cospi_16_64, &x_lo[14], &x_hi[14],
+ &x_lo[15], &x_hi[15]);
+
+ // Just copy x0, x1, x4, x5, x8, x9, x12, x13
+ x_lo[0] = t_lo[0];
+ x_hi[0] = t_hi[0];
+ x_lo[1] = t_lo[1];
+ x_hi[1] = t_hi[1];
+ x_lo[4] = t_lo[4];
+ x_hi[4] = t_hi[4];
+ x_lo[5] = t_lo[5];
+ x_hi[5] = t_hi[5];
+ x_lo[8] = t_lo[8];
+ x_hi[8] = t_hi[8];
+ x_lo[9] = t_lo[9];
+ x_hi[9] = t_hi[9];
+ x_lo[12] = t_lo[12];
+ x_hi[12] = t_hi[12];
+ x_lo[13] = t_lo[13];
+ x_hi[13] = t_hi[13];
+
+ left[0] = x_lo[0];
+ right[0] = x_hi[0];
+ left[1] = vnegq_s32(x_lo[8]);
+ right[1] = vnegq_s32(x_hi[8]);
+ left[2] = x_lo[12];
+ right[2] = x_hi[12];
+ left[3] = vnegq_s32(x_lo[4]);
+ right[3] = vnegq_s32(x_hi[4]);
+ left[4] = x_lo[6];
+ right[4] = x_hi[6];
+ left[5] = x_lo[14];
+ right[5] = x_hi[14];
+ left[6] = x_lo[10];
+ right[6] = x_hi[10];
+ left[7] = x_lo[2];
+ right[7] = x_hi[2];
+ left[8] = x_lo[3];
+ right[8] = x_hi[3];
+ left[9] = x_lo[11];
+ right[9] = x_hi[11];
+ left[10] = x_lo[15];
+ right[10] = x_hi[15];
+ left[11] = x_lo[7];
+ right[11] = x_hi[7];
+ left[12] = x_lo[5];
+ right[12] = x_hi[5];
+ left[13] = vnegq_s32(x_lo[13]);
+ right[13] = vnegq_s32(x_hi[13]);
+ left[14] = x_lo[9];
+ right[14] = x_hi[9];
+ left[15] = vnegq_s32(x_lo[1]);
+ right[15] = vnegq_s32(x_hi[1]);
+}
+
+static void highbd_fdct16x16_neon(int32x4_t *left1, int32x4_t *right1,
+ int32x4_t *left2, int32x4_t *right2) {
+ // Left half.
+ highbd_fdct16_8col(left1, right1);
+ // Right half.
+ highbd_fdct16_8col(left2, right2);
+ transpose_s32_16x16(left1, right1, left2, right2);
+}
+
+static void highbd_fadst16x16_neon(int32x4_t *left1, int32x4_t *right1,
+ int32x4_t *left2, int32x4_t *right2) {
+ // Left half.
+ highbd_fadst16_8col(left1, right1);
+ // Right half.
+ highbd_fadst16_8col(left2, right2);
+ transpose_s32_16x16(left1, right1, left2, right2);
+}
+
+void vp9_highbd_fht16x16_neon(const int16_t *input, tran_low_t *output,
+ int stride, int tx_type) {
+ int32x4_t left1[16], right1[16], left2[16], right2[16];
+
+ switch (tx_type) {
+ case DCT_DCT: vpx_highbd_fdct16x16_neon(input, output, stride); break;
+ case ADST_DCT:
+ highbd_load_buffer_16x16(input, left1, right1, left2, right2, stride);
+ highbd_fadst16x16_neon(left1, right1, left2, right2);
+ highbd_right_shift_16x16(left1, right1, left2, right2, 2);
+ highbd_fdct16x16_neon(left1, right1, left2, right2);
+ highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16);
+ break;
+ case DCT_ADST:
+ highbd_load_buffer_16x16(input, left1, right1, left2, right2, stride);
+ highbd_fdct16x16_neon(left1, right1, left2, right2);
+ highbd_right_shift_16x16(left1, right1, left2, right2, 2);
+ highbd_fadst16x16_neon(left1, right1, left2, right2);
+ highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16);
+ break;
+ default:
+ assert(tx_type == ADST_ADST);
+ highbd_load_buffer_16x16(input, left1, right1, left2, right2, stride);
+ highbd_fadst16x16_neon(left1, right1, left2, right2);
+ highbd_right_shift_16x16(left1, right1, left2, right2, 2);
+ highbd_fadst16x16_neon(left1, right1, left2, right2);
+ highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16);
+ break;
+ }
+}
+
#endif // CONFIG_VP9_HIGHBITDEPTH
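
The forward transforms above lean on the butterfly_two_coeff_s32_s64_* helpers for every cospi rotation. As a rough scalar model of what each "_narrow" call computes (a sketch only: the helper below is hypothetical, not the real NEON API, and it assumes libvpx's usual DCT_CONST_BITS == 14 rounding; the "_noround" variants keep the 64-bit products and defer the shift):

    // Hypothetical scalar equivalent of butterfly_two_coeff_s32_s64_narrow().
    static int32_t fdct_round_shift_scalar(int64_t v) {
      return (int32_t)((v + (1 << 13)) >> 14);  // round-to-nearest by 2^14
    }

    // out_add = c1 * a + c2 * b,  out_sub = c2 * a - c1 * b
    static void butterfly_two_coeff_scalar(int32_t a, int32_t b, int c1, int c2,
                                           int32_t *out_add, int32_t *out_sub) {
      *out_add = fdct_round_shift_scalar((int64_t)c1 * a + (int64_t)c2 * b);
      *out_sub = fdct_round_shift_scalar((int64_t)c2 * a - (int64_t)c1 * b);
    }
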
diff --git a/vp9/encoder/arm/neon/vp9_denoiser_neon.c b/vp9/encoder/arm/neon/vp9_denoiser_neon.c
index 53e8c7e49..d631cd437 100644
--- a/vp9/encoder/arm/neon/vp9_denoiser_neon.c
+++ b/vp9/encoder/arm/neon/vp9_denoiser_neon.c
@@ -21,7 +21,7 @@
// Compute the sum of all pixel differences of this MB.
static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
return vaddlvq_s8(v_sum_diff_total);
#else
const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total);
diff --git a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c
index 33753f77b..b82b3f9db 100644
--- a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c
+++ b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c
@@ -30,30 +30,6 @@ static INLINE int_mv pack_int_mv(int16_t row, int16_t col) {
return result;
}
-static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) {
- // This is simplified from the C implementation to utilise that
- // x->nmvjointsadcost[1] == x->nmvjointsadcost[2] and
- // x->nmvjointsadcost[1] == x->nmvjointsadcost[3]
- return mv.as_int == 0 ? 0 : 1;
-}
-
-static INLINE int mv_cost(const int_mv mv, const int *joint_cost,
- int *const comp_cost[2]) {
- assert(mv.as_mv.row >= -MV_MAX && mv.as_mv.row < MV_MAX);
- assert(mv.as_mv.col >= -MV_MAX && mv.as_mv.col < MV_MAX);
- return joint_cost[get_mv_joint(mv)] + comp_cost[0][mv.as_mv.row] +
- comp_cost[1][mv.as_mv.col];
-}
-
-static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref,
- int sad_per_bit) {
- const int_mv diff =
- pack_int_mv(mv.as_mv.row - ref->row, mv.as_mv.col - ref->col);
- return ROUND_POWER_OF_TWO(
- (unsigned)mv_cost(diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit,
- VP9_PROB_COST_SHIFT);
-}
-
/*****************************************************************************
* This function utilizes 3 properties of the cost function lookup tables, *
* constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in *
@@ -71,8 +47,9 @@ static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref,
*****************************************************************************/
int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
const search_site_config *cfg, MV *ref_mv,
- MV *best_mv, int search_param, int sad_per_bit,
- int *num00, const vp9_variance_fn_ptr_t *fn_ptr,
+ uint32_t start_mv_sad, MV *best_mv,
+ int search_param, int sad_per_bit, int *num00,
+ const vp9_sad_fn_ptr_t *sad_fn_ptr,
const MV *center_mv) {
static const uint32_t data[4] = { 0, 1, 2, 3 };
const uint32x4_t v_idx_d = vld1q_u32((const uint32_t *)data);
@@ -101,8 +78,8 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
pack_int_mv(center_mv->row >> 3, center_mv->col >> 3);
const int16x8_t vfcmv = vreinterpretq_s16_s32(vdupq_n_s32(fcenter_mv.as_int));
- const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row);
- const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col);
+ const int ref_row = ref_mv->row;
+ const int ref_col = ref_mv->col;
int_mv bmv = pack_int_mv(ref_row, ref_col);
int_mv new_bmv = bmv;
@@ -117,12 +94,13 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
// Work out the start point for the search
const uint8_t *best_address = in_what;
const uint8_t *new_best_address = best_address;
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
int64x2_t v_ba_q = vdupq_n_s64((intptr_t)best_address);
#else
int32x4_t v_ba_d = vdupq_n_s32((intptr_t)best_address);
#endif
- unsigned int best_sad = INT_MAX;
+ // Starting position
+ unsigned int best_sad = start_mv_sad;
int i, j, step;
// Check the prerequisite cost function properties that are easy to check
@@ -131,10 +109,6 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);
- // Check the starting position
- best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride);
- best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit);
-
*num00 = 0;
for (i = 0, step = 0; step < tot_steps; step++) {
@@ -143,7 +117,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
int8x16_t v_inside_d;
uint32x4_t v_outside_d;
int32x4_t v_cost_d, v_sad_d;
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
int64x2_t v_blocka[2];
#else
int32x4_t v_blocka[1];
@@ -164,7 +138,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
vreinterpretq_s32_s16(v_these_mv_w)));
// If none of them are inside, then move on
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
horiz_max = vmaxvq_u32(vreinterpretq_u32_s8(v_inside_d));
#else
horiz_max_0 = vmax_u32(vget_low_u32(vreinterpretq_u32_s8(v_inside_d)),
@@ -193,7 +167,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
// Compute the SIMD pointer offsets.
{
-#if defined(__aarch64__) // sizeof(intptr_t) == 8
+#if VPX_ARCH_AARCH64 // sizeof(intptr_t) == 8
// Load the offsets
int64x2_t v_bo10_q = vld1q_s64((const int64_t *)&ss_os[i + 0]);
int64x2_t v_bo32_q = vld1q_s64((const int64_t *)&ss_os[i + 2]);
@@ -214,13 +188,13 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
#endif
}
- fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0],
- in_what_stride, (uint32_t *)&v_sad_d);
+ sad_fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0],
+ in_what_stride, (uint32_t *)&v_sad_d);
// Look up the component cost of the residual motion vector
{
uint32_t cost[4];
- int16_t __attribute__((aligned(16))) rowcol[8];
+ DECLARE_ALIGNED(16, int16_t, rowcol[8]);
vst1q_s16(rowcol, v_diff_mv_w);
// Note: This is a use case for gather instruction
@@ -260,7 +234,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
// Find the minimum value and index horizontally in v_sad_d
{
uint32_t local_best_sad;
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
local_best_sad = vminvq_u32(vreinterpretq_u32_s32(v_sad_d));
#else
uint32x2_t horiz_min_0 =
@@ -282,7 +256,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
uint32x4_t v_mask_d = vandq_u32(v_sel_d, v_idx_d);
v_mask_d = vbslq_u32(v_sel_d, v_mask_d, vdupq_n_u32(0xffffffff));
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
local_best_idx = vminvq_u32(v_mask_d);
#else
horiz_min_0 =
@@ -306,7 +280,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
best_address = new_best_address;
v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int));
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
v_ba_q = vdupq_n_s64((intptr_t)best_address);
#else
v_ba_d = vdupq_n_s32((intptr_t)best_address);
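
With this refactor the kernel no longer evaluates the starting position itself: the caller passes in start_mv_sad together with a slimmer vp9_sad_fn_ptr_t that only carries the SAD functions. A minimal sketch of what the caller now computes up front, mirroring the two removed lines above (variable names are reused from the removed code for illustration and are not the actual vp9_mcomp.c call site):

    // Cost of the starting MV, computed once before entering the search.
    const uint32_t start_mv_sad =
        fn_ptr->sdf(what, what_stride, in_what, in_what_stride) +
        mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit);
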
diff --git a/vp9/encoder/arm/neon/vp9_error_neon.c b/vp9/encoder/arm/neon/vp9_error_neon.c
index 1c7503139..0cf0bf250 100644
--- a/vp9/encoder/arm/neon/vp9_error_neon.c
+++ b/vp9/encoder/arm/neon/vp9_error_neon.c
@@ -12,30 +12,91 @@
#include <assert.h>
#include "./vp9_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
-int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff,
- int block_size) {
- int64x2_t error = vdupq_n_s64(0);
+int64_t vp9_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz) {
+ uint64x2_t err_u64 = vdupq_n_u64(0);
+ int64x2_t ssz_s64 = vdupq_n_s64(0);
- assert(block_size >= 8);
- assert((block_size % 8) == 0);
+ assert(block_size >= 16);
+ assert((block_size % 16) == 0);
do {
- const int16x8_t c = vld1q_s16(coeff);
- const int16x8_t d = vld1q_s16(dqcoeff);
- const int16x8_t diff = vsubq_s16(c, d);
- const int16x4_t diff_lo = vget_low_s16(diff);
- const int16x4_t diff_hi = vget_high_s16(diff);
- // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before
+ uint32x4_t err;
+ int32x4_t ssz0, ssz1;
+
+ const int16x8_t c0 = load_tran_low_to_s16q(coeff);
+ const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
+ const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
+ const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);
+
+ const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0));
+ const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1));
+
+ // diff is 15-bits, the squares 30, so we can store 4 in 32-bits before
// accumulating them in 64-bits.
- const int32x4_t err0 = vmull_s16(diff_lo, diff_lo);
- const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi);
- const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1));
- error = vaddq_s64(error, err2);
- coeff += 8;
- dqcoeff += 8;
- block_size -= 8;
+ err = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0));
+ err = vmlal_u16(err, vget_high_u16(diff0), vget_high_u16(diff0));
+ err = vmlal_u16(err, vget_low_u16(diff1), vget_low_u16(diff1));
+ err = vmlal_u16(err, vget_high_u16(diff1), vget_high_u16(diff1));
+ err_u64 = vpadalq_u32(err_u64, err);
+
+    // We can't do the same here as we're operating on signed integers: only
+    // two squares of the 15-bit coefficients fit in a signed 32-bit lane
+    // before accumulating into 64 bits.
+ ssz0 = vmull_s16(vget_low_s16(c0), vget_low_s16(c0));
+ ssz0 = vmlal_s16(ssz0, vget_high_s16(c0), vget_high_s16(c0));
+ ssz_s64 = vpadalq_s32(ssz_s64, ssz0);
+
+ ssz1 = vmull_s16(vget_low_s16(c1), vget_low_s16(c1));
+ ssz1 = vmlal_s16(ssz1, vget_high_s16(c1), vget_high_s16(c1));
+ ssz_s64 = vpadalq_s32(ssz_s64, ssz1);
+
+ coeff += 16;
+ dqcoeff += 16;
+ block_size -= 16;
+ } while (block_size != 0);
+
+ *ssz = horizontal_add_int64x2(ssz_s64);
+ return (int64_t)horizontal_add_uint64x2(err_u64);
+}
+
+int64_t vp9_block_error_fp_neon(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff, int block_size) {
+ uint64x2_t err_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };
+
+ assert(block_size >= 16);
+ assert((block_size % 16) == 0);
+
+ do {
+ uint32x4_t err0, err1;
+
+ const int16x8_t c0 = load_tran_low_to_s16q(coeff);
+ const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
+ const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
+ const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);
+
+ const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0));
+ const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1));
+
+ // diff is 15-bits, the squares 30, so in theory we can store 4 in 32-bits
+ // before accumulating them in 64-bits. However splitting into 2 mull, mlal
+ // pairs is beneficial since it allows us to use both Neon
+ // multiply-accumulate pipes - on CPUs that have them - rather than having
+ // a single chain of 4 instructions executing serially.
+ err0 = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0));
+ err0 = vmlal_u16(err0, vget_high_u16(diff0), vget_high_u16(diff0));
+ err_u64[0] = vpadalq_u32(err_u64[0], err0);
+
+ err1 = vmull_u16(vget_low_u16(diff1), vget_low_u16(diff1));
+ err1 = vmlal_u16(err1, vget_high_u16(diff1), vget_high_u16(diff1));
+ err_u64[1] = vpadalq_u32(err_u64[1], err1);
+
+ coeff += 16;
+ dqcoeff += 16;
+ block_size -= 16;
} while (block_size != 0);
- return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1);
+ return horizontal_add_uint64x2(vaddq_u64(err_u64[0], err_u64[1]));
}
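
Both kernels reduce to the same scalar definition; the sketch below restates the semantics rather than reproducing the actual C fallback, and vp9_block_error_fp_neon simply omits the ssz term. It also shows why the two loops above are shaped differently: the unsigned error path can fold four 30-bit squares into one 32-bit lane, while the signed ssz path must widen after two.

    // Reference semantics only (tran_low_t comes from vpx_dsp/vpx_dsp_common.h).
    static int64_t block_error_scalar(const tran_low_t *coeff,
                                      const tran_low_t *dqcoeff,
                                      intptr_t block_size, int64_t *ssz) {
      int64_t error = 0, sqcoeff = 0;
      intptr_t i;
      for (i = 0; i < block_size; ++i) {
        const int diff = (int)coeff[i] - (int)dqcoeff[i];
        error += (int64_t)diff * diff;            // coding distortion
        sqcoeff += (int64_t)coeff[i] * coeff[i];  // source energy (ssz)
      }
      *ssz = sqcoeff;
      return error;
    }
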
diff --git a/vp9/encoder/arm/neon/vp9_highbd_error_neon.c b/vp9/encoder/arm/neon/vp9_highbd_error_neon.c
new file mode 100644
index 000000000..d9b183472
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_highbd_error_neon.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+int64_t vp9_highbd_block_error_neon(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz, int bd) {
+ uint64x2_t err_u64 = vdupq_n_u64(0);
+ int64x2_t ssz_s64 = vdupq_n_s64(0);
+
+ const int shift = 2 * (bd - 8);
+ const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+
+ assert(block_size >= 16);
+ assert((block_size % 16) == 0);
+
+ do {
+ const int32x4_t c = load_tran_low_to_s32q(coeff);
+ const int32x4_t d = load_tran_low_to_s32q(dqcoeff);
+
+ const uint32x4_t diff = vreinterpretq_u32_s32(vabdq_s32(c, d));
+
+ err_u64 = vmlal_u32(err_u64, vget_low_u32(diff), vget_low_u32(diff));
+ err_u64 = vmlal_u32(err_u64, vget_high_u32(diff), vget_high_u32(diff));
+
+ ssz_s64 = vmlal_s32(ssz_s64, vget_low_s32(c), vget_low_s32(c));
+ ssz_s64 = vmlal_s32(ssz_s64, vget_high_s32(c), vget_high_s32(c));
+
+ coeff += 4;
+ dqcoeff += 4;
+ block_size -= 4;
+ } while (block_size != 0);
+
+ *ssz = (horizontal_add_int64x2(ssz_s64) + rounding) >> shift;
+ return ((int64_t)horizontal_add_uint64x2(err_u64) + rounding) >> shift;
+}
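
The final shift scales the bd-bit sums back to the 8-bit domain so results stay comparable with the low-bit-depth kernel: both sums are divided by 2^(2 * (bd - 8)) with round-to-nearest. A worked case (arithmetic only, not library code):

    // bd = 12: shift = 2 * (12 - 8) = 8, rounding = 1 << 7 = 128
    // an accumulated squared error of 1000 is reported as (1000 + 128) >> 8 = 4
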
diff --git a/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c b/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c
new file mode 100644
index 000000000..c3aef3c86
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c
@@ -0,0 +1,872 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+#include "vp9/encoder/vp9_temporal_filter_constants.h"
+
+// Compute (a-b)**2 for 8 pixels with size 16-bit
+static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b,
+ uint32_t *dst) {
+ const uint16x8_t a_reg = vld1q_u16(a);
+ const uint16x8_t b_reg = vld1q_u16(b);
+
+ uint16x8_t dist = vabdq_u16(a_reg, b_reg);
+ uint32x4_t dist_first = vmull_u16(vget_low_u16(dist), vget_low_u16(dist));
+ uint32x4_t dist_second = vmull_u16(vget_high_u16(dist), vget_high_u16(dist));
+
+ vst1q_u32(dst, dist_first);
+ vst1q_u32(dst + 4, dist_second);
+}
+
+// Sum up three neighboring distortions for the pixels
+static INLINE void highbd_get_sum_4(const uint32_t *dist, uint32x4_t *sum) {
+ uint32x4_t dist_reg, dist_left, dist_right;
+
+ dist_reg = vld1q_u32(dist);
+ dist_left = vld1q_u32(dist - 1);
+ dist_right = vld1q_u32(dist + 1);
+
+ *sum = vaddq_u32(dist_reg, dist_left);
+ *sum = vaddq_u32(*sum, dist_right);
+}
+
+static INLINE void highbd_get_sum_8(const uint32_t *dist, uint32x4_t *sum_first,
+ uint32x4_t *sum_second) {
+ highbd_get_sum_4(dist, sum_first);
+ highbd_get_sum_4(dist + 4, sum_second);
+}
+
+// Average the value based on the number of values summed (9 for pixels away
+// from the border, 4 for pixels in corners, and 6 for other edge values),
+// plus however many values from the other plane(s) (y or u/v) are added in.
+//
+// Add in the rounding factor and shift, clamp to 16, invert and shift, then
+// multiply by the weight.
+static INLINE void highbd_average_4(uint32x4_t *output, const uint32x4_t sum,
+ const uint32x4_t *mul_constants,
+ const int strength, const int rounding,
+ const int weight) {
+ const int64x2_t strength_s64 = vdupq_n_s64(-strength - 32);
+ const uint64x2_t rounding_u64 = vdupq_n_u64((uint64_t)rounding << 32);
+ const uint32x4_t weight_u32 = vdupq_n_u32(weight);
+ const uint32x4_t sixteen = vdupq_n_u32(16);
+ uint32x4_t sum2;
+
+ // modifier * 3 / index;
+ uint64x2_t sum_lo =
+ vmlal_u32(rounding_u64, vget_low_u32(sum), vget_low_u32(*mul_constants));
+ uint64x2_t sum_hi = vmlal_u32(rounding_u64, vget_high_u32(sum),
+ vget_high_u32(*mul_constants));
+
+ // we cannot use vshrn_n_u64 as strength is not known at compile time.
+ sum_lo = vshlq_u64(sum_lo, strength_s64);
+ sum_hi = vshlq_u64(sum_hi, strength_s64);
+
+ sum2 = vcombine_u32(vmovn_u64(sum_lo), vmovn_u64(sum_hi));
+
+ // Multiply with the weight
+ sum2 = vminq_u32(sum2, sixteen);
+ sum2 = vsubq_u32(sixteen, sum2);
+ *output = vmulq_u32(sum2, weight_u32);
+}
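+
+// Scalar view of the averaging above, assuming each mul_constants lane holds
+// roughly (3 << 32) / index (the "modifier * 3 / index" note):
+//   out = weight * (16 - min(16, (sum * 3 / index + rounding) >> strength))
+// so a larger summed distortion yields a smaller per-pixel filter weight.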
+
+static INLINE void highbd_average_8(uint32x4_t *output_0, uint32x4_t *output_1,
+ const uint32x4_t sum_0_u32,
+ const uint32x4_t sum_1_u32,
+ const uint32x4_t *mul_constants_0,
+ const uint32x4_t *mul_constants_1,
+ const int strength, const int rounding,
+ const int weight) {
+ highbd_average_4(output_0, sum_0_u32, mul_constants_0, strength, rounding,
+ weight);
+ highbd_average_4(output_1, sum_1_u32, mul_constants_1, strength, rounding,
+ weight);
+}
+
+// Add 'sum_u32' to 'count'. Multiply by 'pred' and add to 'accumulator.'
+static INLINE void highbd_accumulate_and_store_8(
+ const uint32x4_t sum_first_u32, const uint32x4_t sum_second_u32,
+ const uint16_t *pred, uint16_t *count, uint32_t *accumulator) {
+ const uint16x8_t sum_u16 =
+ vcombine_u16(vqmovn_u32(sum_first_u32), vqmovn_u32(sum_second_u32));
+ uint16x8_t pred_u16 = vld1q_u16(pred);
+ uint16x8_t count_u16 = vld1q_u16(count);
+ uint32x4_t pred_0_u32, pred_1_u32;
+ uint32x4_t accum_0_u32, accum_1_u32;
+
+ count_u16 = vqaddq_u16(count_u16, sum_u16);
+ vst1q_u16(count, count_u16);
+
+ accum_0_u32 = vld1q_u32(accumulator);
+ accum_1_u32 = vld1q_u32(accumulator + 4);
+
+ pred_0_u32 = vmovl_u16(vget_low_u16(pred_u16));
+ pred_1_u32 = vmovl_u16(vget_high_u16(pred_u16));
+
+ // Don't use sum_u16 as that produces different results to the C version
+ accum_0_u32 = vmlaq_u32(accum_0_u32, sum_first_u32, pred_0_u32);
+ accum_1_u32 = vmlaq_u32(accum_1_u32, sum_second_u32, pred_1_u32);
+
+ vst1q_u32(accumulator, accum_0_u32);
+ vst1q_u32(accumulator + 4, accum_1_u32);
+}
+
+static INLINE void highbd_read_dist_4(const uint32_t *dist,
+ uint32x4_t *dist_reg) {
+ *dist_reg = vld1q_u32(dist);
+}
+
+static INLINE void highbd_read_dist_8(const uint32_t *dist,
+ uint32x4_t *reg_first,
+ uint32x4_t *reg_second) {
+ highbd_read_dist_4(dist, reg_first);
+ highbd_read_dist_4(dist + 4, reg_second);
+}
+
+static INLINE void highbd_read_chroma_dist_row_8(
+ int ss_x, const uint32_t *u_dist, const uint32_t *v_dist,
+ uint32x4_t *u_first, uint32x4_t *u_second, uint32x4_t *v_first,
+ uint32x4_t *v_second) {
+ if (!ss_x) {
+ // If there is no chroma subsampling in the horizontal direction, then we
+ // need to load 8 entries from chroma.
+ highbd_read_dist_8(u_dist, u_first, u_second);
+ highbd_read_dist_8(v_dist, v_first, v_second);
+ } else { // ss_x == 1
+    // Otherwise, load only 4 entries from each chroma plane and duplicate
+    // them, since each chroma value covers two luma columns.
+ uint32x4_t u_reg, v_reg;
+ uint32x4x2_t pair;
+
+ highbd_read_dist_4(u_dist, &u_reg);
+
+ pair = vzipq_u32(u_reg, u_reg);
+ *u_first = pair.val[0];
+ *u_second = pair.val[1];
+
+ highbd_read_dist_4(v_dist, &v_reg);
+
+ pair = vzipq_u32(v_reg, v_reg);
+ *v_first = pair.val[0];
+ *v_second = pair.val[1];
+ }
+}
+
+static void highbd_apply_temporal_filter_luma_8(
+ const uint16_t *y_pre, int y_pre_stride, unsigned int block_width,
+ unsigned int block_height, int ss_x, int ss_y, int strength,
+ int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+ const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist,
+ const uint32_t *const *neighbors_first,
+ const uint32_t *const *neighbors_second, int top_weight,
+ int bottom_weight) {
+ const int rounding = (1 << strength) >> 1;
+ int weight = top_weight;
+
+ uint32x4_t mul_first, mul_second;
+
+ uint32x4_t sum_row_1_first, sum_row_1_second;
+ uint32x4_t sum_row_2_first, sum_row_2_second;
+ uint32x4_t sum_row_3_first, sum_row_3_second;
+
+ uint32x4_t u_first, u_second;
+ uint32x4_t v_first, v_second;
+
+ uint32x4_t sum_row_first;
+ uint32x4_t sum_row_second;
+
+ // Loop variables
+ unsigned int h;
+
+ assert(strength >= 4 && strength <= 14 &&
+ "invalid adjusted temporal filter strength");
+ assert(block_width == 8);
+
+ (void)block_width;
+
+ // First row
+ mul_first = vld1q_u32(neighbors_first[0]);
+ mul_second = vld1q_u32(neighbors_second[0]);
+
+ // Add luma values
+ highbd_get_sum_8(y_dist, &sum_row_2_first, &sum_row_2_second);
+ highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+  // We don't need to saturate here because the maximum value is
+  // UINT12_MAX**2 * 9 ~= 2**24 * 9 < 2**28 < INT32_MAX.
+ sum_row_first = vaddq_u32(sum_row_2_first, sum_row_3_first);
+ sum_row_second = vaddq_u32(sum_row_2_second, sum_row_3_second);
+
+ // Add chroma values
+ highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+ &v_first, &v_second);
+
+ // Max value here is 2 ** 24 * (9 + 2), so no saturation is needed
+ sum_row_first = vaddq_u32(sum_row_first, u_first);
+ sum_row_second = vaddq_u32(sum_row_second, u_second);
+
+ sum_row_first = vaddq_u32(sum_row_first, v_first);
+ sum_row_second = vaddq_u32(sum_row_second, v_second);
+
+ // Get modifier and store result
+ highbd_average_8(&sum_row_first, &sum_row_second, sum_row_first,
+ sum_row_second, &mul_first, &mul_second, strength, rounding,
+ weight);
+
+ highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+ y_accum);
+
+ y_pre += y_pre_stride;
+ y_count += y_pre_stride;
+ y_accum += y_pre_stride;
+ y_dist += DIST_STRIDE;
+
+ u_dist += DIST_STRIDE;
+ v_dist += DIST_STRIDE;
+
+ // Then all the rows except the last one
+ mul_first = vld1q_u32(neighbors_first[1]);
+ mul_second = vld1q_u32(neighbors_second[1]);
+
+ for (h = 1; h < block_height - 1; ++h) {
+ // Move the weight to bottom half
+ if (!use_whole_blk && h == block_height / 2) {
+ weight = bottom_weight;
+ }
+ // Shift the rows up
+ sum_row_1_first = sum_row_2_first;
+ sum_row_1_second = sum_row_2_second;
+ sum_row_2_first = sum_row_3_first;
+ sum_row_2_second = sum_row_3_second;
+
+ // Add luma values to the modifier
+ sum_row_first = vaddq_u32(sum_row_1_first, sum_row_2_first);
+ sum_row_second = vaddq_u32(sum_row_1_second, sum_row_2_second);
+
+ highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+ sum_row_first = vaddq_u32(sum_row_first, sum_row_3_first);
+ sum_row_second = vaddq_u32(sum_row_second, sum_row_3_second);
+
+ // Add chroma values to the modifier
+ if (ss_y == 0 || h % 2 == 0) {
+ // Only calculate the new chroma distortion if we are at a pixel that
+ // corresponds to a new chroma row
+ highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+ &v_first, &v_second);
+
+ u_dist += DIST_STRIDE;
+ v_dist += DIST_STRIDE;
+ }
+
+ sum_row_first = vaddq_u32(sum_row_first, u_first);
+ sum_row_second = vaddq_u32(sum_row_second, u_second);
+ sum_row_first = vaddq_u32(sum_row_first, v_first);
+ sum_row_second = vaddq_u32(sum_row_second, v_second);
+
+ // Get modifier and store result
+ highbd_average_8(&sum_row_first, &sum_row_second, sum_row_first,
+ sum_row_second, &mul_first, &mul_second, strength,
+ rounding, weight);
+ highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+ y_accum);
+
+ y_pre += y_pre_stride;
+ y_count += y_pre_stride;
+ y_accum += y_pre_stride;
+ y_dist += DIST_STRIDE;
+ }
+
+ // The last row
+ mul_first = vld1q_u32(neighbors_first[0]);
+ mul_second = vld1q_u32(neighbors_second[0]);
+
+ // Shift the rows up
+ sum_row_1_first = sum_row_2_first;
+ sum_row_1_second = sum_row_2_second;
+ sum_row_2_first = sum_row_3_first;
+ sum_row_2_second = sum_row_3_second;
+
+ // Add luma values to the modifier
+ sum_row_first = vaddq_u32(sum_row_1_first, sum_row_2_first);
+ sum_row_second = vaddq_u32(sum_row_1_second, sum_row_2_second);
+
+ // Add chroma values to the modifier
+ if (ss_y == 0) {
+ // Only calculate the new chroma distortion if we are at a pixel that
+ // corresponds to a new chroma row
+ highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+ &v_first, &v_second);
+ }
+
+ sum_row_first = vaddq_u32(sum_row_first, u_first);
+ sum_row_second = vaddq_u32(sum_row_second, u_second);
+ sum_row_first = vaddq_u32(sum_row_first, v_first);
+ sum_row_second = vaddq_u32(sum_row_second, v_second);
+
+ // Get modifier and store result
+ highbd_average_8(&sum_row_first, &sum_row_second, sum_row_first,
+ sum_row_second, &mul_first, &mul_second, strength, rounding,
+ weight);
+ highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+ y_accum);
+}
+
+// Perform temporal filter for the luma component.
+static void highbd_apply_temporal_filter_luma(
+ const uint16_t *y_pre, int y_pre_stride, unsigned int block_width,
+ unsigned int block_height, int ss_x, int ss_y, int strength,
+ const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+ const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) {
+ unsigned int blk_col = 0, uv_blk_col = 0;
+ const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x;
+ const unsigned int mid_width = block_width >> 1,
+ last_width = block_width - blk_col_step;
+ int top_weight = blk_fw[0],
+ bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+ const uint32_t *const *neighbors_first;
+ const uint32_t *const *neighbors_second;
+
+ // Left
+ neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS;
+ neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
+ highbd_apply_temporal_filter_luma_8(
+ y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+ strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_first, neighbors_second, top_weight, bottom_weight);
+
+ blk_col += blk_col_step;
+ uv_blk_col += uv_blk_col_step;
+
+ // Middle First
+ neighbors_first = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
+ for (; blk_col < mid_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ highbd_apply_temporal_filter_luma_8(
+ y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+ strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_first, neighbors_second, top_weight, bottom_weight);
+ }
+
+ if (!use_whole_blk) {
+ top_weight = blk_fw[1];
+ bottom_weight = blk_fw[3];
+ }
+
+ // Middle Second
+ for (; blk_col < last_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ highbd_apply_temporal_filter_luma_8(
+ y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+ strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_first, neighbors_second, top_weight, bottom_weight);
+ }
+
+ // Right
+ neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS;
+ highbd_apply_temporal_filter_luma_8(
+ y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+ strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_first, neighbors_second, top_weight, bottom_weight);
+}
+
+// Add a row of luma distortion that corresponds to 8 chroma mods. If we are
+// subsampling in the x direction, then we have 16 luma values, else we have 8.
+static INLINE void highbd_add_luma_dist_to_8_chroma_mod(
+ const uint32_t *y_dist, int ss_x, int ss_y, uint32x4_t *u_mod_fst,
+ uint32x4_t *u_mod_snd, uint32x4_t *v_mod_fst, uint32x4_t *v_mod_snd) {
+ uint32x4_t y_reg_fst, y_reg_snd;
+ if (!ss_x) {
+ highbd_read_dist_8(y_dist, &y_reg_fst, &y_reg_snd);
+ if (ss_y == 1) {
+ uint32x4_t y_tmp_fst, y_tmp_snd;
+ highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
+ y_reg_fst = vaddq_u32(y_reg_fst, y_tmp_fst);
+ y_reg_snd = vaddq_u32(y_reg_snd, y_tmp_snd);
+ }
+ } else {
+ // Temporary
+ uint32x4_t y_fst, y_snd;
+ uint64x2_t y_fst64, y_snd64;
+
+ // First 8
+ highbd_read_dist_8(y_dist, &y_fst, &y_snd);
+ if (ss_y == 1) {
+ uint32x4_t y_tmp_fst, y_tmp_snd;
+ highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
+
+ y_fst = vaddq_u32(y_fst, y_tmp_fst);
+ y_snd = vaddq_u32(y_snd, y_tmp_snd);
+ }
+
+ y_fst64 = vpaddlq_u32(y_fst);
+ y_snd64 = vpaddlq_u32(y_snd);
+ y_reg_fst = vcombine_u32(vqmovn_u64(y_fst64), vqmovn_u64(y_snd64));
+
+ // Second 8
+ highbd_read_dist_8(y_dist + 8, &y_fst, &y_snd);
+ if (ss_y == 1) {
+ uint32x4_t y_tmp_fst, y_tmp_snd;
+ highbd_read_dist_8(y_dist + 8 + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
+
+ y_fst = vaddq_u32(y_fst, y_tmp_fst);
+ y_snd = vaddq_u32(y_snd, y_tmp_snd);
+ }
+
+ y_fst64 = vpaddlq_u32(y_fst);
+ y_snd64 = vpaddlq_u32(y_snd);
+ y_reg_snd = vcombine_u32(vqmovn_u64(y_fst64), vqmovn_u64(y_snd64));
+ }
+
+ *u_mod_fst = vaddq_u32(*u_mod_fst, y_reg_fst);
+ *u_mod_snd = vaddq_u32(*u_mod_snd, y_reg_snd);
+ *v_mod_fst = vaddq_u32(*v_mod_fst, y_reg_fst);
+ *v_mod_snd = vaddq_u32(*v_mod_snd, y_reg_snd);
+}
+
+// Apply temporal filter to the chroma components. This performs temporal
+// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use
+// blk_fw as an array of size 4 for the weights for each of the 4 subblocks,
+// else use top_weight for top half, and bottom weight for bottom half.
+static void highbd_apply_temporal_filter_chroma_8(
+ const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride,
+ unsigned int uv_block_width, unsigned int uv_block_height, int ss_x,
+ int ss_y, int strength, uint32_t *u_accum, uint16_t *u_count,
+ uint32_t *v_accum, uint16_t *v_count, const uint32_t *y_dist,
+ const uint32_t *u_dist, const uint32_t *v_dist,
+ const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd,
+ int top_weight, int bottom_weight, const int *blk_fw) {
+ const int rounding = (1 << strength) >> 1;
+ int weight = top_weight;
+
+ uint32x4_t mul_fst, mul_snd;
+
+ uint32x4_t u_sum_row_1_fst, u_sum_row_2_fst, u_sum_row_3_fst;
+ uint32x4_t v_sum_row_1_fst, v_sum_row_2_fst, v_sum_row_3_fst;
+ uint32x4_t u_sum_row_1_snd, u_sum_row_2_snd, u_sum_row_3_snd;
+ uint32x4_t v_sum_row_1_snd, v_sum_row_2_snd, v_sum_row_3_snd;
+
+ uint32x4_t u_sum_row_fst, v_sum_row_fst;
+ uint32x4_t u_sum_row_snd, v_sum_row_snd;
+
+ // Loop variable
+ unsigned int h;
+
+ (void)uv_block_width;
+
+ // First row
+ mul_fst = vld1q_u32(neighbors_fst[0]);
+ mul_snd = vld1q_u32(neighbors_snd[0]);
+
+ // Add chroma values
+ highbd_get_sum_8(u_dist, &u_sum_row_2_fst, &u_sum_row_2_snd);
+ highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd);
+
+ u_sum_row_fst = vaddq_u32(u_sum_row_2_fst, u_sum_row_3_fst);
+ u_sum_row_snd = vaddq_u32(u_sum_row_2_snd, u_sum_row_3_snd);
+
+ highbd_get_sum_8(v_dist, &v_sum_row_2_fst, &v_sum_row_2_snd);
+ highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd);
+
+ v_sum_row_fst = vaddq_u32(v_sum_row_2_fst, v_sum_row_3_fst);
+ v_sum_row_snd = vaddq_u32(v_sum_row_2_snd, v_sum_row_3_snd);
+
+ // Add luma values
+ highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
+ &u_sum_row_snd, &v_sum_row_fst,
+ &v_sum_row_snd);
+
+ // Get modifier and store result
+ if (blk_fw) {
+ highbd_average_4(&u_sum_row_fst, u_sum_row_fst, &mul_fst, strength,
+ rounding, blk_fw[0]);
+ highbd_average_4(&u_sum_row_snd, u_sum_row_snd, &mul_snd, strength,
+ rounding, blk_fw[1]);
+
+ highbd_average_4(&v_sum_row_fst, v_sum_row_fst, &mul_fst, strength,
+ rounding, blk_fw[0]);
+ highbd_average_4(&v_sum_row_snd, v_sum_row_snd, &mul_snd, strength,
+ rounding, blk_fw[1]);
+
+ } else {
+ highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, u_sum_row_fst,
+ u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+ weight);
+ highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, v_sum_row_fst,
+ v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+ weight);
+ }
+ highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
+ u_accum);
+ highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
+ v_accum);
+
+ u_pre += uv_pre_stride;
+ u_dist += DIST_STRIDE;
+ v_pre += uv_pre_stride;
+ v_dist += DIST_STRIDE;
+ u_count += uv_pre_stride;
+ u_accum += uv_pre_stride;
+ v_count += uv_pre_stride;
+ v_accum += uv_pre_stride;
+
+ y_dist += DIST_STRIDE * (1 + ss_y);
+
+ // Then all the rows except the last one
+ mul_fst = vld1q_u32(neighbors_fst[1]);
+ mul_snd = vld1q_u32(neighbors_snd[1]);
+
+ for (h = 1; h < uv_block_height - 1; ++h) {
+ // Move the weight pointer to the bottom half of the blocks
+ if (h == uv_block_height / 2) {
+ if (blk_fw) {
+ blk_fw += 2;
+ } else {
+ weight = bottom_weight;
+ }
+ }
+
+ // Shift the rows up
+ u_sum_row_1_fst = u_sum_row_2_fst;
+ u_sum_row_2_fst = u_sum_row_3_fst;
+ u_sum_row_1_snd = u_sum_row_2_snd;
+ u_sum_row_2_snd = u_sum_row_3_snd;
+
+ v_sum_row_1_fst = v_sum_row_2_fst;
+ v_sum_row_2_fst = v_sum_row_3_fst;
+ v_sum_row_1_snd = v_sum_row_2_snd;
+ v_sum_row_2_snd = v_sum_row_3_snd;
+
+ // Add chroma values
+ u_sum_row_fst = vaddq_u32(u_sum_row_1_fst, u_sum_row_2_fst);
+ u_sum_row_snd = vaddq_u32(u_sum_row_1_snd, u_sum_row_2_snd);
+ highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd);
+ u_sum_row_fst = vaddq_u32(u_sum_row_fst, u_sum_row_3_fst);
+ u_sum_row_snd = vaddq_u32(u_sum_row_snd, u_sum_row_3_snd);
+
+ v_sum_row_fst = vaddq_u32(v_sum_row_1_fst, v_sum_row_2_fst);
+ v_sum_row_snd = vaddq_u32(v_sum_row_1_snd, v_sum_row_2_snd);
+ highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd);
+ v_sum_row_fst = vaddq_u32(v_sum_row_fst, v_sum_row_3_fst);
+ v_sum_row_snd = vaddq_u32(v_sum_row_snd, v_sum_row_3_snd);
+
+ // Add luma values
+ highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
+ &u_sum_row_snd, &v_sum_row_fst,
+ &v_sum_row_snd);
+
+ // Get modifier and store result
+ if (blk_fw) {
+ highbd_average_4(&u_sum_row_fst, u_sum_row_fst, &mul_fst, strength,
+ rounding, blk_fw[0]);
+ highbd_average_4(&u_sum_row_snd, u_sum_row_snd, &mul_snd, strength,
+ rounding, blk_fw[1]);
+
+ highbd_average_4(&v_sum_row_fst, v_sum_row_fst, &mul_fst, strength,
+ rounding, blk_fw[0]);
+ highbd_average_4(&v_sum_row_snd, v_sum_row_snd, &mul_snd, strength,
+ rounding, blk_fw[1]);
+
+ } else {
+ highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, u_sum_row_fst,
+ u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+ weight);
+ highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, v_sum_row_fst,
+ v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+ weight);
+ }
+
+ highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
+ u_accum);
+ highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
+ v_accum);
+
+ u_pre += uv_pre_stride;
+ u_dist += DIST_STRIDE;
+ v_pre += uv_pre_stride;
+ v_dist += DIST_STRIDE;
+ u_count += uv_pre_stride;
+ u_accum += uv_pre_stride;
+ v_count += uv_pre_stride;
+ v_accum += uv_pre_stride;
+
+ y_dist += DIST_STRIDE * (1 + ss_y);
+ }
+
+ // The last row
+ mul_fst = vld1q_u32(neighbors_fst[0]);
+ mul_snd = vld1q_u32(neighbors_snd[0]);
+
+ // Shift the rows up
+ u_sum_row_1_fst = u_sum_row_2_fst;
+ u_sum_row_2_fst = u_sum_row_3_fst;
+ u_sum_row_1_snd = u_sum_row_2_snd;
+ u_sum_row_2_snd = u_sum_row_3_snd;
+
+ v_sum_row_1_fst = v_sum_row_2_fst;
+ v_sum_row_2_fst = v_sum_row_3_fst;
+ v_sum_row_1_snd = v_sum_row_2_snd;
+ v_sum_row_2_snd = v_sum_row_3_snd;
+
+ // Add chroma values
+ u_sum_row_fst = vaddq_u32(u_sum_row_1_fst, u_sum_row_2_fst);
+ v_sum_row_fst = vaddq_u32(v_sum_row_1_fst, v_sum_row_2_fst);
+ u_sum_row_snd = vaddq_u32(u_sum_row_1_snd, u_sum_row_2_snd);
+ v_sum_row_snd = vaddq_u32(v_sum_row_1_snd, v_sum_row_2_snd);
+
+ // Add luma values
+ highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
+ &u_sum_row_snd, &v_sum_row_fst,
+ &v_sum_row_snd);
+
+ // Get modifier and store result
+ if (blk_fw) {
+ highbd_average_4(&u_sum_row_fst, u_sum_row_fst, &mul_fst, strength,
+ rounding, blk_fw[0]);
+ highbd_average_4(&u_sum_row_snd, u_sum_row_snd, &mul_snd, strength,
+ rounding, blk_fw[1]);
+
+ highbd_average_4(&v_sum_row_fst, v_sum_row_fst, &mul_fst, strength,
+ rounding, blk_fw[0]);
+ highbd_average_4(&v_sum_row_snd, v_sum_row_snd, &mul_snd, strength,
+ rounding, blk_fw[1]);
+
+ } else {
+ highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, u_sum_row_fst,
+ u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+ weight);
+ highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, v_sum_row_fst,
+ v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+ weight);
+ }
+
+ highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
+ u_accum);
+ highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
+ v_accum);
+}
+
+// Perform temporal filter for the chroma components.
+static void highbd_apply_temporal_filter_chroma(
+ const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride,
+ unsigned int block_width, unsigned int block_height, int ss_x, int ss_y,
+ int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum,
+ uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+ const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) {
+ const unsigned int uv_width = block_width >> ss_x,
+ uv_height = block_height >> ss_y;
+
+ unsigned int blk_col = 0, uv_blk_col = 0;
+ const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x;
+ const unsigned int uv_mid_width = uv_width >> 1,
+ uv_last_width = uv_width - uv_blk_col_step;
+ int top_weight = blk_fw[0],
+ bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+ const uint32_t *const *neighbors_fst;
+ const uint32_t *const *neighbors_snd;
+
+ if (uv_width == 8) {
+ // Special Case: We are subsampling in x direction on a 16x16 block. Since
+ // we are operating on a row of 8 chroma pixels, we can't use the usual
+ // left-middle-right pattern.
+ assert(ss_x);
+
+ if (ss_y) {
+ neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
+ neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
+ } else {
+ neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
+ neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
+ }
+
+ if (use_whole_blk) {
+ highbd_apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
+ } else {
+ highbd_apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_fst, neighbors_snd, 0, 0, blk_fw);
+ }
+
+ return;
+ }
+
+ // Left
+ if (ss_x && ss_y) {
+ neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
+ neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+ } else if (ss_x || ss_y) {
+ neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
+ neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+ } else {
+ neighbors_fst = HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS;
+ neighbors_snd = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
+ }
+
+ highbd_apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst,
+ neighbors_snd, top_weight, bottom_weight, NULL);
+
+ blk_col += blk_col_step;
+ uv_blk_col += uv_blk_col_step;
+
+ // Middle First
+ if (ss_x && ss_y) {
+ neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+ } else if (ss_x || ss_y) {
+ neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+ } else {
+ neighbors_fst = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
+ }
+
+ for (; uv_blk_col < uv_mid_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ highbd_apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
+ }
+
+ if (!use_whole_blk) {
+ top_weight = blk_fw[1];
+ bottom_weight = blk_fw[3];
+ }
+
+ // Middle Second
+ for (; uv_blk_col < uv_last_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ highbd_apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
+ }
+
+ // Right
+ if (ss_x && ss_y) {
+ neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
+ } else if (ss_x || ss_y) {
+ neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
+ } else {
+ neighbors_snd = HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS;
+ }
+
+ highbd_apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst,
+ neighbors_snd, top_weight, bottom_weight, NULL);
+}
+
+void vp9_highbd_apply_temporal_filter_neon(
+ const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
+ int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
+ int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
+ int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+ int ss_x, int ss_y, int strength, const int *const blk_fw,
+ int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum,
+ uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) {
+ const unsigned int chroma_height = block_height >> ss_y,
+ chroma_width = block_width >> ss_x;
+
+ DECLARE_ALIGNED(16, uint32_t, y_dist[BH * DIST_STRIDE]) = { 0 };
+ DECLARE_ALIGNED(16, uint32_t, u_dist[BH * DIST_STRIDE]) = { 0 };
+ DECLARE_ALIGNED(16, uint32_t, v_dist[BH * DIST_STRIDE]) = { 0 };
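+ // Note: these dist buffers are uint32_t (the 8-bit path in
+ // vp9_temporal_filter_neon.c uses uint16_t) because the squared difference of
+ // two 12-bit samples can be as large as 4095 * 4095 = 16769025, which does
+ // not fit in 16 bits.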
+
+ uint32_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1,
+ *v_dist_ptr = v_dist + 1;
+ const uint16_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src;
+ const uint16_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre;
+
+ // Loop variables
+ unsigned int row, blk_col;
+
+ assert(block_width <= BW && "block width too large");
+ assert(block_height <= BH && "block height too large");
+ assert(block_width % 16 == 0 && "block width must be multiple of 16");
+ assert(block_height % 2 == 0 && "block height must be even");
+ assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) &&
+ "invalid chroma subsampling");
+ assert(strength >= 4 && strength <= 14 &&
+ "invalid adjusted temporal filter strength");
+ assert(blk_fw[0] >= 0 && "filter weight must be non-negative");
+ assert(
+ (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
+ "subblock filter weights must be non-negative");
+ assert(blk_fw[0] <= 2 && "filter weight must not exceed 2");
+ assert(
+ (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
+ "subblock filter weights must not exceed 2");
+
+ // Precompute the difference squared
+ for (row = 0; row < block_height; row++) {
+ for (blk_col = 0; blk_col < block_width; blk_col += 8) {
+ highbd_store_dist_8(y_src_ptr + blk_col, y_pre_ptr + blk_col,
+ y_dist_ptr + blk_col);
+ }
+ y_src_ptr += y_src_stride;
+ y_pre_ptr += y_pre_stride;
+ y_dist_ptr += DIST_STRIDE;
+ }
+
+ for (row = 0; row < chroma_height; row++) {
+ for (blk_col = 0; blk_col < chroma_width; blk_col += 8) {
+ highbd_store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col,
+ u_dist_ptr + blk_col);
+ highbd_store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col,
+ v_dist_ptr + blk_col);
+ }
+
+ u_src_ptr += uv_src_stride;
+ u_pre_ptr += uv_pre_stride;
+ u_dist_ptr += DIST_STRIDE;
+ v_src_ptr += uv_src_stride;
+ v_pre_ptr += uv_pre_stride;
+ v_dist_ptr += DIST_STRIDE;
+ }
+
+ y_dist_ptr = y_dist + 1;
+ u_dist_ptr = u_dist + 1;
+ v_dist_ptr = v_dist + 1;
+
+ highbd_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width,
+ block_height, ss_x, ss_y, strength, blk_fw,
+ use_whole_blk, y_accum, y_count, y_dist_ptr,
+ u_dist_ptr, v_dist_ptr);
+
+ highbd_apply_temporal_filter_chroma(
+ u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
+ strength, blk_fw, use_whole_blk, u_accum, u_count, v_accum, v_count,
+ y_dist_ptr, u_dist_ptr, v_dist_ptr);
+}
diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c
index c2b55fcba..96d061436 100644
--- a/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -11,11 +11,13 @@
#include <arm_neon.h>
#include <assert.h>
#include <math.h>
+#include <stdint.h>
#include "./vpx_config.h"
#include "vpx_mem/vpx_mem.h"
#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_scan.h"
#include "vp9/common/vp9_seg_common.h"
#include "vp9/encoder/vp9_encoder.h"
@@ -50,7 +52,7 @@ static VPX_FORCE_INLINE int16x8_t get_max_lane_eob(const int16_t *iscan_ptr,
}
static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
-#ifdef __aarch64__
+#if VPX_ARCH_AARCH64
return (uint16_t)vmaxvq_s16(v_eobmax);
#else
const int16x4_t v_eobmax_3210 =
@@ -65,23 +67,21 @@ static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
-#endif // __aarch64__
+#endif // VPX_ARCH_AARCH64
}
-static VPX_FORCE_INLINE void load_fp_values(const int16_t *round_ptr,
- const int16_t *quant_ptr,
- const int16_t *dequant_ptr,
- int16x8_t *round, int16x8_t *quant,
- int16x8_t *dequant) {
- *round = vld1q_s16(round_ptr);
- *quant = vld1q_s16(quant_ptr);
+static VPX_FORCE_INLINE void load_fp_values(
+ const struct macroblock_plane *mb_plane, const int16_t *dequant_ptr,
+ int16x8_t *round, int16x8_t *quant, int16x8_t *dequant) {
+ *round = vld1q_s16(mb_plane->round_fp);
+ *quant = vld1q_s16(mb_plane->quant_fp);
*dequant = vld1q_s16(dequant_ptr);
}
static VPX_FORCE_INLINE void update_fp_values(int16x8_t *v_round,
int16x8_t *v_quant,
int16x8_t *v_dequant) {
-#ifdef __aarch64__
+#if VPX_ARCH_AARCH64
*v_round = vdupq_laneq_s16(*v_round, 1);
*v_quant = vdupq_laneq_s16(*v_quant, 1);
*v_dequant = vdupq_laneq_s16(*v_dequant, 1);
@@ -117,27 +117,26 @@ static VPX_FORCE_INLINE void quantize_fp_8(
*v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask);
}
-void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
- const int16_t *round_ptr, const int16_t *quant_ptr,
+void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const struct macroblock_plane *mb_plane,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
+ const struct ScanOrder *const scan_order) {
// Quantization pass: All coefficients with index >= zero_flag are
// skippable. Note: zero_flag can be zero.
int i;
int16x8_t v_eobmax = vdupq_n_s16(-1);
int16x8_t v_round, v_quant, v_dequant;
- (void)scan;
+ const int16_t *iscan = scan_order->iscan;
- load_fp_values(round_ptr, quant_ptr, dequant_ptr, &v_round, &v_quant,
- &v_dequant);
+ load_fp_values(mb_plane, dequant_ptr, &v_round, &v_quant, &v_dequant);
// process dc and the first seven ac coeffs
quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr, iscan, qcoeff_ptr,
dqcoeff_ptr, &v_eobmax);
// now process the rest of the ac coeffs
update_fp_values(&v_round, &v_quant, &v_dequant);
- for (i = 8; i < count; i += 8) {
+ for (i = 8; i < n_coeffs; i += 8) {
quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr + i, iscan + i,
qcoeff_ptr + i, dqcoeff_ptr + i, &v_eobmax);
}
@@ -186,23 +185,22 @@ static VPX_FORCE_INLINE void quantize_fp_32x32_8(
*v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask);
}
-void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
- const int16_t *round_ptr,
- const int16_t *quant_ptr,
+void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const struct macroblock_plane *mb_plane,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
+ const struct ScanOrder *const scan_order) {
int16x8_t eob_max = vdupq_n_s16(-1);
// ROUND_POWER_OF_TWO(round_ptr[], 1)
- int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1);
- int16x8_t quant = vld1q_s16(quant_ptr);
+ int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round_fp), 1);
+ int16x8_t quant = vld1q_s16(mb_plane->quant_fp);
int16x8_t dequant = vld1q_s16(dequant_ptr);
// dequant >> 2 is used similar to zbin as a threshold.
int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2);
int i;
+ const int16_t *iscan = scan_order->iscan;
- (void)scan;
- (void)count;
+ (void)n_coeffs;
// Process dc and the first seven ac coeffs.
quantize_fp_32x32_8(&round, &quant, &dequant, &dequant_thresh, coeff_ptr,
@@ -258,23 +256,21 @@ highbd_quantize_fp_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
}
void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *round_ptr,
- const int16_t *quant_ptr,
+ const struct macroblock_plane *mb_plane,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
+ const struct ScanOrder *const scan_order) {
const int16x4_t v_zero = vdup_n_s16(0);
- const int16x4_t v_quant = vld1_s16(quant_ptr);
+ const int16x4_t v_quant = vld1_s16(mb_plane->quant_fp);
const int16x4_t v_dequant = vld1_s16(dequant_ptr);
- const int16x4_t v_round = vld1_s16(round_ptr);
+ const int16x4_t v_round = vld1_s16(mb_plane->round_fp);
int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero);
int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15);
int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero);
uint16x4_t v_mask_lo, v_mask_hi;
int16x8_t v_eobmax = vdupq_n_s16(-1);
-
- (void)scan;
+ const int16_t *iscan = scan_order->iscan;
// DC and first 3 AC
v_mask_lo = highbd_quantize_fp_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
@@ -349,22 +345,21 @@ highbd_quantize_fp_32x32_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
}
void vp9_highbd_quantize_fp_32x32_neon(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr,
- const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
- const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan,
- const int16_t *iscan) {
- const int16x4_t v_quant = vld1_s16(quant_ptr);
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const struct ScanOrder *const scan_order) {
+ const int16x4_t v_quant = vld1_s16(mb_plane->quant_fp);
const int16x4_t v_dequant = vld1_s16(dequant_ptr);
const int16x4_t v_zero = vdup_n_s16(0);
const int16x4_t v_round =
- vqrdmulh_n_s16(vld1_s16(round_ptr), (int16_t)(1 << 14));
+ vqrdmulh_n_s16(vld1_s16(mb_plane->round_fp), (int16_t)(1 << 14));
int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero);
int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15);
int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero);
uint16x4_t v_mask_lo, v_mask_hi;
int16x8_t v_eobmax = vdupq_n_s16(-1);
-
- (void)scan;
+ const int16_t *iscan = scan_order->iscan;
// DC and first 3 AC
v_mask_lo =
diff --git a/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c b/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c
new file mode 100644
index 000000000..a651a15d9
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c
@@ -0,0 +1,849 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+#include "vp9/encoder/vp9_temporal_filter_constants.h"
+
+// Read in 8 pixels from a and b as 8-bit unsigned integers, compute the
+// difference squared, and store as unsigned 16-bit integer to dst.
+static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b,
+ uint16_t *dst) {
+ const uint8x8_t a_reg = vld1_u8(a);
+ const uint8x8_t b_reg = vld1_u8(b);
+
+ uint16x8_t dist_first = vabdl_u8(a_reg, b_reg);
+ dist_first = vmulq_u16(dist_first, dist_first);
+
+ vst1q_u16(dst, dist_first);
+}
+
+static INLINE void store_dist_16(const uint8_t *a, const uint8_t *b,
+ uint16_t *dst) {
+ const uint8x16_t a_reg = vld1q_u8(a);
+ const uint8x16_t b_reg = vld1q_u8(b);
+
+ uint16x8_t dist_first = vabdl_u8(vget_low_u8(a_reg), vget_low_u8(b_reg));
+ uint16x8_t dist_second = vabdl_u8(vget_high_u8(a_reg), vget_high_u8(b_reg));
+ dist_first = vmulq_u16(dist_first, dist_first);
+ dist_second = vmulq_u16(dist_second, dist_second);
+
+ vst1q_u16(dst, dist_first);
+ vst1q_u16(dst + 8, dist_second);
+}
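+
+// Note that the squared differences above cannot overflow the uint16_t lanes:
+// the absolute difference of two 8-bit pixels is at most 255, and
+// 255 * 255 = 65025 < 65535.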
+
+static INLINE void read_dist_8(const uint16_t *dist, uint16x8_t *dist_reg) {
+ *dist_reg = vld1q_u16(dist);
+}
+
+static INLINE void read_dist_16(const uint16_t *dist, uint16x8_t *reg_first,
+ uint16x8_t *reg_second) {
+ read_dist_8(dist, reg_first);
+ read_dist_8(dist + 8, reg_second);
+}
+
+// Average the value based on the number of values summed (9 for pixels away
+// from the border, 4 for pixels in corners, and 6 for other edge values).
+//
+// Add in the rounding factor, shift down by the strength, clamp to 16, invert
+// (16 - value), and multiply by the weight.
+static INLINE uint16x8_t average_8(uint16x8_t sum,
+ const uint16x8_t *mul_constants,
+ const int strength, const int rounding,
+ const uint16x8_t *weight) {
+ const uint32x4_t rounding_u32 = vdupq_n_u32(rounding << 16);
+ const uint16x8_t weight_u16 = *weight;
+ const uint16x8_t sixteen = vdupq_n_u16(16);
+ const int32x4_t strength_u32 = vdupq_n_s32(-strength - 16);
+
+ // modifier * 3 / index;
+ uint32x4_t sum_hi =
+ vmull_u16(vget_low_u16(sum), vget_low_u16(*mul_constants));
+ uint32x4_t sum_lo =
+ vmull_u16(vget_high_u16(sum), vget_high_u16(*mul_constants));
+
+ sum_lo = vqaddq_u32(sum_lo, rounding_u32);
+ sum_hi = vqaddq_u32(sum_hi, rounding_u32);
+
+ // we cannot use vshrn_n_u32 as strength is not known at compile time.
+ sum_lo = vshlq_u32(sum_lo, strength_u32);
+ sum_hi = vshlq_u32(sum_hi, strength_u32);
+
+ sum = vcombine_u16(vmovn_u32(sum_hi), vmovn_u32(sum_lo));
+
+ // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
+ // >> 16 (also NEIGHBOR_CONSTANT_4 - 1), which is 49151 / 0xbfff / -16385,
+ // so the comparison must treat the values as unsigned 16-bit. vminq_u16
+ // handles this directly (the x86 version needs an SSE4.1 unsigned 16-bit min).
+ sum = vminq_u16(sum, sixteen);
+ sum = vsubq_u16(sixteen, sum);
+ return vmulq_u16(sum, weight_u16);
+}
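+
+// For reference, each 16-bit lane of average_8() computes (ignoring the
+// saturating adds) the scalar expression:
+//   m = (sum * mul_constant + (rounding << 16)) >> (strength + 16);
+//   result = (16 - min(m, 16)) * weight;
+// where mul_constant is roughly the fixed-point constant (3 << 16) / index,
+// with index being the number of summed values.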
+
+// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.'
+static void accumulate_and_store_8(const uint16x8_t sum_u16,
+ const uint8_t *pred, uint16_t *count,
+ uint32_t *accumulator) {
+ uint16x8_t pred_u16 = vmovl_u8(vld1_u8(pred));
+ uint16x8_t count_u16 = vld1q_u16(count);
+ uint32x4_t accum_0_u32, accum_1_u32;
+
+ count_u16 = vqaddq_u16(count_u16, sum_u16);
+ vst1q_u16(count, count_u16);
+
+ accum_0_u32 = vld1q_u32(accumulator);
+ accum_1_u32 = vld1q_u32(accumulator + 4);
+
+ accum_0_u32 =
+ vmlal_u16(accum_0_u32, vget_low_u16(sum_u16), vget_low_u16(pred_u16));
+ accum_1_u32 =
+ vmlal_u16(accum_1_u32, vget_high_u16(sum_u16), vget_high_u16(pred_u16));
+
+ vst1q_u32(accumulator, accum_0_u32);
+ vst1q_u32(accumulator + 4, accum_1_u32);
+}
+
+static INLINE void accumulate_and_store_16(const uint16x8_t sum_0_u16,
+ const uint16x8_t sum_1_u16,
+ const uint8_t *pred, uint16_t *count,
+ uint32_t *accumulator) {
+ uint8x16_t pred_u8 = vld1q_u8(pred);
+ uint16x8_t pred_0_u16 = vmovl_u8(vget_low_u8(pred_u8));
+ uint16x8_t pred_1_u16 = vmovl_u8(vget_high_u8(pred_u8));
+ uint16x8_t count_0_u16 = vld1q_u16(count);
+ uint16x8_t count_1_u16 = vld1q_u16(count + 8);
+ uint32x4_t accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32;
+
+ count_0_u16 = vqaddq_u16(count_0_u16, sum_0_u16);
+ vst1q_u16(count, count_0_u16);
+ count_1_u16 = vqaddq_u16(count_1_u16, sum_1_u16);
+ vst1q_u16(count + 8, count_1_u16);
+
+ accum_0_u32 = vld1q_u32(accumulator);
+ accum_1_u32 = vld1q_u32(accumulator + 4);
+ accum_2_u32 = vld1q_u32(accumulator + 8);
+ accum_3_u32 = vld1q_u32(accumulator + 12);
+
+ accum_0_u32 =
+ vmlal_u16(accum_0_u32, vget_low_u16(sum_0_u16), vget_low_u16(pred_0_u16));
+ accum_1_u32 = vmlal_u16(accum_1_u32, vget_high_u16(sum_0_u16),
+ vget_high_u16(pred_0_u16));
+ accum_2_u32 =
+ vmlal_u16(accum_2_u32, vget_low_u16(sum_1_u16), vget_low_u16(pred_1_u16));
+ accum_3_u32 = vmlal_u16(accum_3_u32, vget_high_u16(sum_1_u16),
+ vget_high_u16(pred_1_u16));
+
+ vst1q_u32(accumulator, accum_0_u32);
+ vst1q_u32(accumulator + 4, accum_1_u32);
+ vst1q_u32(accumulator + 8, accum_2_u32);
+ vst1q_u32(accumulator + 12, accum_3_u32);
+}
+
+// Read in 8 pixels from y_dist. For each index i, compute y_dist[i-1] +
+// y_dist[i] + y_dist[i+1] and store in sum as 16-bit unsigned int.
+static INLINE void get_sum_8(const uint16_t *y_dist, uint16x8_t *sum) {
+ uint16x8_t dist_reg, dist_left, dist_right;
+
+ dist_reg = vld1q_u16(y_dist);
+ dist_left = vld1q_u16(y_dist - 1);
+ dist_right = vld1q_u16(y_dist + 1);
+
+ *sum = vqaddq_u16(dist_reg, dist_left);
+ *sum = vqaddq_u16(*sum, dist_right);
+}
+
+// Read in 16 pixels from y_dist. For each index i, compute y_dist[i-1] +
+// y_dist[i] + y_dist[i+1]. Store the result for the first 8 pixels in sum_first
+// the rest in sum_second.
+static INLINE void get_sum_16(const uint16_t *y_dist, uint16x8_t *sum_first,
+ uint16x8_t *sum_second) {
+ get_sum_8(y_dist, sum_first);
+ get_sum_8(y_dist + 8, sum_second);
+}
+
+// Read in a row of chroma values that corresponds to a row of 16 luma values.
+static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist,
+ const uint16_t *v_dist,
+ uint16x8_t *u_first,
+ uint16x8_t *u_second,
+ uint16x8_t *v_first,
+ uint16x8_t *v_second) {
+ if (!ss_x) {
+ // If there is no chroma subsampling in the horizontal direction, then we
+ // need to load 16 entries from chroma.
+ read_dist_16(u_dist, u_first, u_second);
+ read_dist_16(v_dist, v_first, v_second);
+ } else { // ss_x == 1
+ // Otherwise, we only need to load 8 entries
+ uint16x8_t u_reg, v_reg;
+ uint16x8x2_t pair;
+
+ read_dist_8(u_dist, &u_reg);
+
+ pair = vzipq_u16(u_reg, u_reg);
+ *u_first = pair.val[0];
+ *u_second = pair.val[1];
+
+ read_dist_8(v_dist, &v_reg);
+
+ pair = vzipq_u16(v_reg, v_reg);
+ *v_first = pair.val[0];
+ *v_second = pair.val[1];
+ }
+}
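+
+// In the ss_x == 1 path above, vzipq_u16(reg, reg) interleaves a vector with
+// itself, duplicating each chroma dist value so it lines up with the two luma
+// columns it covers: val[0] = { d0, d0, d1, d1, ... }, val[1] = { d4, d4, ... }.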
+
+// Add a row of luma distortion to 8 corresponding chroma mods.
+static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist,
+ int ss_x, int ss_y,
+ uint16x8_t *u_mod,
+ uint16x8_t *v_mod) {
+ uint16x8_t y_reg;
+ if (!ss_x) {
+ read_dist_8(y_dist, &y_reg);
+ if (ss_y == 1) {
+ uint16x8_t y_tmp;
+ read_dist_8(y_dist + DIST_STRIDE, &y_tmp);
+
+ y_reg = vqaddq_u16(y_reg, y_tmp);
+ }
+ } else {
+ uint16x8_t y_first, y_second;
+ uint32x4_t y_first32, y_second32;
+
+ read_dist_16(y_dist, &y_first, &y_second);
+ if (ss_y == 1) {
+ uint16x8_t y_tmp_0, y_tmp_1;
+ read_dist_16(y_dist + DIST_STRIDE, &y_tmp_0, &y_tmp_1);
+
+ y_first = vqaddq_u16(y_first, y_tmp_0);
+ y_second = vqaddq_u16(y_second, y_tmp_1);
+ }
+
+ y_first32 = vpaddlq_u16(y_first);
+ y_second32 = vpaddlq_u16(y_second);
+
+ y_reg = vcombine_u16(vqmovn_u32(y_first32), vqmovn_u32(y_second32));
+ }
+
+ *u_mod = vqaddq_u16(*u_mod, y_reg);
+ *v_mod = vqaddq_u16(*v_mod, y_reg);
+}
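+
+// In effect, each chroma modifier receives the sum of its co-located luma
+// distortions: a single value with no subsampling, a horizontal pair when
+// ss_x == 1 (the vpaddlq_u16 above), plus the row below when ss_y == 1, i.e.
+// up to a 2x2 luma neighborhood per chroma sample.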
+
+// Apply temporal filter to the luma components. This performs temporal
+// filtering on a luma block of 16 X block_height. Use blk_fw as an array of
+// size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL,
+// else use top_weight for the top half and bottom_weight for the bottom half.
+static void apply_temporal_filter_luma_16(
+ const uint8_t *y_pre, int y_pre_stride, unsigned int block_width,
+ unsigned int block_height, int ss_x, int ss_y, int strength,
+ int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+ const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist,
+ const int16_t *const *neighbors_first,
+ const int16_t *const *neighbors_second, int top_weight, int bottom_weight,
+ const int *blk_fw) {
+ const int rounding = (1 << strength) >> 1;
+ uint16x8_t weight_first, weight_second;
+
+ uint16x8_t mul_first, mul_second;
+
+ uint16x8_t sum_row_1_first, sum_row_1_second;
+ uint16x8_t sum_row_2_first, sum_row_2_second;
+ uint16x8_t sum_row_3_first, sum_row_3_second;
+
+ uint16x8_t u_first, u_second;
+ uint16x8_t v_first, v_second;
+
+ uint16x8_t sum_row_first;
+ uint16x8_t sum_row_second;
+
+ // Loop variables
+ unsigned int h;
+
+ assert(strength >= 0);
+ assert(strength <= 6);
+
+ assert(block_width == 16);
+ (void)block_width;
+
+ // Initialize the weights
+ if (blk_fw) {
+ weight_first = vdupq_n_u16(blk_fw[0]);
+ weight_second = vdupq_n_u16(blk_fw[1]);
+ } else {
+ weight_first = vdupq_n_u16(top_weight);
+ weight_second = weight_first;
+ }
+
+ // First row
+ mul_first = vld1q_u16((const uint16_t *)neighbors_first[0]);
+ mul_second = vld1q_u16((const uint16_t *)neighbors_second[0]);
+
+ // Add luma values
+ get_sum_16(y_dist, &sum_row_2_first, &sum_row_2_second);
+ get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+ sum_row_first = vqaddq_u16(sum_row_2_first, sum_row_3_first);
+ sum_row_second = vqaddq_u16(sum_row_2_second, sum_row_3_second);
+
+ // Add chroma values
+ read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first,
+ &v_second);
+
+ sum_row_first = vqaddq_u16(sum_row_first, u_first);
+ sum_row_second = vqaddq_u16(sum_row_second, u_second);
+
+ sum_row_first = vqaddq_u16(sum_row_first, v_first);
+ sum_row_second = vqaddq_u16(sum_row_second, v_second);
+
+ // Get modifier and store result
+ sum_row_first =
+ average_8(sum_row_first, &mul_first, strength, rounding, &weight_first);
+
+ sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding,
+ &weight_second);
+
+ accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
+ y_accum);
+
+ y_pre += y_pre_stride;
+ y_count += y_pre_stride;
+ y_accum += y_pre_stride;
+ y_dist += DIST_STRIDE;
+
+ u_dist += DIST_STRIDE;
+ v_dist += DIST_STRIDE;
+
+ // Then all the rows except the last one
+ mul_first = vld1q_u16((const uint16_t *)neighbors_first[1]);
+ mul_second = vld1q_u16((const uint16_t *)neighbors_second[1]);
+
+ for (h = 1; h < block_height - 1; ++h) {
+ // Move the weight to the bottom half
+ if (!use_whole_blk && h == block_height / 2) {
+ if (blk_fw) {
+ weight_first = vdupq_n_u16(blk_fw[2]);
+ weight_second = vdupq_n_u16(blk_fw[3]);
+ } else {
+ weight_first = vdupq_n_u16(bottom_weight);
+ weight_second = weight_first;
+ }
+ }
+ // Shift the rows up
+ sum_row_1_first = sum_row_2_first;
+ sum_row_1_second = sum_row_2_second;
+ sum_row_2_first = sum_row_3_first;
+ sum_row_2_second = sum_row_3_second;
+
+ // Add luma values to the modifier
+ sum_row_first = vqaddq_u16(sum_row_1_first, sum_row_2_first);
+ sum_row_second = vqaddq_u16(sum_row_1_second, sum_row_2_second);
+
+ get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+ sum_row_first = vqaddq_u16(sum_row_first, sum_row_3_first);
+ sum_row_second = vqaddq_u16(sum_row_second, sum_row_3_second);
+
+ // Add chroma values to the modifier
+ if (ss_y == 0 || h % 2 == 0) {
+ // Only calculate the new chroma distortion if we are at a pixel that
+ // corresponds to a new chroma row
+ read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second,
+ &v_first, &v_second);
+ u_dist += DIST_STRIDE;
+ v_dist += DIST_STRIDE;
+ }
+
+ sum_row_first = vqaddq_u16(sum_row_first, u_first);
+ sum_row_second = vqaddq_u16(sum_row_second, u_second);
+ sum_row_first = vqaddq_u16(sum_row_first, v_first);
+ sum_row_second = vqaddq_u16(sum_row_second, v_second);
+
+ // Get modifier and store result
+ sum_row_first =
+ average_8(sum_row_first, &mul_first, strength, rounding, &weight_first);
+ sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding,
+ &weight_second);
+ accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
+ y_accum);
+ y_pre += y_pre_stride;
+ y_count += y_pre_stride;
+ y_accum += y_pre_stride;
+ y_dist += DIST_STRIDE;
+ }
+
+ // The last row
+ mul_first = vld1q_u16((const uint16_t *)neighbors_first[0]);
+ mul_second = vld1q_u16((const uint16_t *)neighbors_second[0]);
+
+ // Shift the rows up
+ sum_row_1_first = sum_row_2_first;
+ sum_row_1_second = sum_row_2_second;
+ sum_row_2_first = sum_row_3_first;
+ sum_row_2_second = sum_row_3_second;
+
+ // Add luma values to the modifier
+ sum_row_first = vqaddq_u16(sum_row_1_first, sum_row_2_first);
+ sum_row_second = vqaddq_u16(sum_row_1_second, sum_row_2_second);
+
+ // Add chroma values to the modifier
+ if (ss_y == 0) {
+ // Only calculate the new chroma distortion if we are at a pixel that
+ // corresponds to a new chroma row
+ read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first,
+ &v_second);
+ }
+
+ sum_row_first = vqaddq_u16(sum_row_first, u_first);
+ sum_row_second = vqaddq_u16(sum_row_second, u_second);
+ sum_row_first = vqaddq_u16(sum_row_first, v_first);
+ sum_row_second = vqaddq_u16(sum_row_second, v_second);
+
+ // Get modifier and store result
+ sum_row_first =
+ average_8(sum_row_first, &mul_first, strength, rounding, &weight_first);
+ sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding,
+ &weight_second);
+ accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
+ y_accum);
+}
+
+// Perform temporal filter for the luma component.
+static void apply_temporal_filter_luma(
+ const uint8_t *y_pre, int y_pre_stride, unsigned int block_width,
+ unsigned int block_height, int ss_x, int ss_y, int strength,
+ const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+ const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) {
+ unsigned int blk_col = 0, uv_blk_col = 0;
+ const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x;
+ const unsigned int mid_width = block_width >> 1,
+ last_width = block_width - blk_col_step;
+ int top_weight = blk_fw[0],
+ bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+ const int16_t *const *neighbors_first;
+ const int16_t *const *neighbors_second;
+
+ if (block_width == 16) {
+ // Special Case: The block width is 16 and we are operating on a row of 16
+ // luma pixels. In this case, we can't use the usual left-middle-right
+ // pattern. We also don't support splitting now.
+ neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
+ neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
+ if (use_whole_blk) {
+ apply_temporal_filter_luma_16(
+ y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+ use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+ neighbors_second, top_weight, bottom_weight, NULL);
+ } else {
+ apply_temporal_filter_luma_16(
+ y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+ use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+ neighbors_second, 0, 0, blk_fw);
+ }
+
+ return;
+ }
+
+ // Left
+ neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
+ neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS;
+ apply_temporal_filter_luma_16(
+ y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+ use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+ neighbors_second, top_weight, bottom_weight, NULL);
+
+ blk_col += blk_col_step;
+ uv_blk_col += uv_blk_col_step;
+
+ // Middle First
+ neighbors_first = LUMA_MIDDLE_COLUMN_NEIGHBORS;
+ for (; blk_col < mid_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ apply_temporal_filter_luma_16(
+ y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+ use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+ neighbors_second, top_weight, bottom_weight, NULL);
+ }
+
+ if (!use_whole_blk) {
+ top_weight = blk_fw[1];
+ bottom_weight = blk_fw[3];
+ }
+
+ // Middle Second
+ for (; blk_col < last_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ apply_temporal_filter_luma_16(
+ y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+ use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+ neighbors_second, top_weight, bottom_weight, NULL);
+ }
+
+ // Right
+ neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
+ apply_temporal_filter_luma_16(
+ y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+ use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+ neighbors_second, top_weight, bottom_weight, NULL);
+}
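+
+// The left / middle / right split above exists because pixels in the leftmost
+// and rightmost 16-wide columns have fewer horizontal neighbors, so they use
+// different precomputed neighbor-count constants than interior columns.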
+
+// Apply temporal filter to the chroma components. This performs temporal
+// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use
+// blk_fw as an array of size 4 for the weights for each of the 4 subblocks,
+// else use top_weight for the top half and bottom_weight for the bottom half.
+static void apply_temporal_filter_chroma_8(
+ const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride,
+ unsigned int uv_block_height, int ss_x, int ss_y, int strength,
+ uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+ const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist,
+ const int16_t *const *neighbors, int top_weight, int bottom_weight,
+ const int *blk_fw) {
+ const int rounding = (1 << strength) >> 1;
+
+ uint16x8_t weight;
+
+ uint16x8_t mul;
+
+ uint16x8_t u_sum_row_1, u_sum_row_2, u_sum_row_3;
+ uint16x8_t v_sum_row_1, v_sum_row_2, v_sum_row_3;
+
+ uint16x8_t u_sum_row, v_sum_row;
+
+ // Loop variable
+ unsigned int h;
+
+ // Initialize weight
+ if (blk_fw) {
+ weight = vcombine_u16(vdup_n_u16(blk_fw[0]), vdup_n_u16(blk_fw[1]));
+ } else {
+ weight = vdupq_n_u16(top_weight);
+ }
+
+ // First row
+ mul = vld1q_u16((const uint16_t *)neighbors[0]);
+
+ // Add chroma values
+ get_sum_8(u_dist, &u_sum_row_2);
+ get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3);
+
+ u_sum_row = vqaddq_u16(u_sum_row_2, u_sum_row_3);
+
+ get_sum_8(v_dist, &v_sum_row_2);
+ get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3);
+
+ v_sum_row = vqaddq_u16(v_sum_row_2, v_sum_row_3);
+
+ // Add luma values
+ add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
+
+ // Get modifier and store result
+ u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight);
+ v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight);
+
+ accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
+ accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
+
+ u_pre += uv_pre_stride;
+ u_dist += DIST_STRIDE;
+ v_pre += uv_pre_stride;
+ v_dist += DIST_STRIDE;
+ u_count += uv_pre_stride;
+ u_accum += uv_pre_stride;
+ v_count += uv_pre_stride;
+ v_accum += uv_pre_stride;
+
+ y_dist += DIST_STRIDE * (1 + ss_y);
+
+ // Then all the rows except the last one
+ mul = vld1q_u16((const uint16_t *)neighbors[1]);
+
+ for (h = 1; h < uv_block_height - 1; ++h) {
+ // Move the weight to the bottom half of the block
+ if (h == uv_block_height / 2) {
+ if (blk_fw) {
+ weight = vcombine_u16(vdup_n_u16(blk_fw[2]), vdup_n_u16(blk_fw[3]));
+ } else {
+ weight = vdupq_n_u16(bottom_weight);
+ }
+ }
+
+ // Shift the rows up
+ u_sum_row_1 = u_sum_row_2;
+ u_sum_row_2 = u_sum_row_3;
+
+ v_sum_row_1 = v_sum_row_2;
+ v_sum_row_2 = v_sum_row_3;
+
+ // Add chroma values
+ u_sum_row = vqaddq_u16(u_sum_row_1, u_sum_row_2);
+ get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3);
+ u_sum_row = vqaddq_u16(u_sum_row, u_sum_row_3);
+
+ v_sum_row = vqaddq_u16(v_sum_row_1, v_sum_row_2);
+ get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3);
+ v_sum_row = vqaddq_u16(v_sum_row, v_sum_row_3);
+
+ // Add luma values
+ add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
+
+ // Get modifier and store result
+ u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight);
+ v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight);
+
+ accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
+ accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
+
+ u_pre += uv_pre_stride;
+ u_dist += DIST_STRIDE;
+ v_pre += uv_pre_stride;
+ v_dist += DIST_STRIDE;
+ u_count += uv_pre_stride;
+ u_accum += uv_pre_stride;
+ v_count += uv_pre_stride;
+ v_accum += uv_pre_stride;
+
+ y_dist += DIST_STRIDE * (1 + ss_y);
+ }
+
+ // The last row
+ mul = vld1q_u16((const uint16_t *)neighbors[0]);
+
+ // Shift the rows up
+ u_sum_row_1 = u_sum_row_2;
+ u_sum_row_2 = u_sum_row_3;
+
+ v_sum_row_1 = v_sum_row_2;
+ v_sum_row_2 = v_sum_row_3;
+
+ // Add chroma values
+ u_sum_row = vqaddq_u16(u_sum_row_1, u_sum_row_2);
+ v_sum_row = vqaddq_u16(v_sum_row_1, v_sum_row_2);
+
+ // Add luma values
+ add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
+
+ // Get modifier and store result
+ u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight);
+ v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight);
+
+ accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
+ accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
+}
+
+// Perform temporal filter for the chroma components.
+static void apply_temporal_filter_chroma(
+ const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride,
+ unsigned int block_width, unsigned int block_height, int ss_x, int ss_y,
+ int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum,
+ uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+ const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) {
+ const unsigned int uv_width = block_width >> ss_x,
+ uv_height = block_height >> ss_y;
+
+ unsigned int blk_col = 0, uv_blk_col = 0;
+ const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x;
+ const unsigned int uv_mid_width = uv_width >> 1,
+ uv_last_width = uv_width - uv_blk_col_step;
+ int top_weight = blk_fw[0],
+ bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+ const int16_t *const *neighbors;
+
+ if (uv_width == 8) {
+ // Special Case: We are subsampling in the x direction on a 16x16 block. Since
+ // we are operating on a row of 8 chroma pixels, we can't use the usual
+ // left-middle-right pattern.
+ assert(ss_x);
+
+ if (ss_y) {
+ neighbors = CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS;
+ } else {
+ neighbors = CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS;
+ }
+
+ if (use_whole_blk) {
+ apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height,
+ ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+ v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+ bottom_weight, NULL);
+ } else {
+ apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height,
+ ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+ v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, 0, 0, blk_fw);
+ }
+
+ return;
+ }
+
+ // Left
+ if (ss_x && ss_y) {
+ neighbors = CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
+ } else if (ss_x || ss_y) {
+ neighbors = CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
+ } else {
+ neighbors = CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS;
+ }
+
+ apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+ ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+ v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+ bottom_weight, NULL);
+
+ blk_col += blk_col_step;
+ uv_blk_col += uv_blk_col_step;
+
+ // Middle First
+ if (ss_x && ss_y) {
+ neighbors = CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+ } else if (ss_x || ss_y) {
+ neighbors = CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+ } else {
+ neighbors = CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
+ }
+
+ for (; uv_blk_col < uv_mid_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+ ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+ v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+ bottom_weight, NULL);
+ }
+
+ if (!use_whole_blk) {
+ top_weight = blk_fw[1];
+ bottom_weight = blk_fw[3];
+ }
+
+ // Middle Second
+ for (; uv_blk_col < uv_last_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+ ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+ v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+ bottom_weight, NULL);
+ }
+
+ // Right
+ if (ss_x && ss_y) {
+ neighbors = CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
+ } else if (ss_x || ss_y) {
+ neighbors = CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
+ } else {
+ neighbors = CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS;
+ }
+
+ apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+ ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+ v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+ bottom_weight, NULL);
+}
+
+void vp9_apply_temporal_filter_neon(
+ const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+ int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+ int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+ int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+ int ss_x, int ss_y, int strength, const int *const blk_fw,
+ int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum,
+ uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) {
+ const unsigned int chroma_height = block_height >> ss_y,
+ chroma_width = block_width >> ss_x;
+
+ DECLARE_ALIGNED(16, uint16_t, y_dist[BH * DIST_STRIDE]) = { 0 };
+ DECLARE_ALIGNED(16, uint16_t, u_dist[BH * DIST_STRIDE]) = { 0 };
+ DECLARE_ALIGNED(16, uint16_t, v_dist[BH * DIST_STRIDE]) = { 0 };
+ const int *blk_fw_ptr = blk_fw;
+
+ uint16_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1,
+ *v_dist_ptr = v_dist + 1;
+ const uint8_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src;
+ const uint8_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre;
+
+ // Loop variables
+ unsigned int row, blk_col;
+
+ assert(block_width <= BW && "block width too large");
+ assert(block_height <= BH && "block height too large");
+ assert(block_width % 16 == 0 && "block width must be multiple of 16");
+ assert(block_height % 2 == 0 && "block height must be even");
+ assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) &&
+ "invalid chroma subsampling");
+ assert(strength >= 0 && strength <= 6 && "invalid temporal filter strength");
+ assert(blk_fw[0] >= 0 && "filter weight must be non-negative");
+ assert(
+ (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
+ "subblock filter weights must be non-negative");
+ assert(blk_fw[0] <= 2 && "filter weight must not exceed 2");
+ assert(
+ (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
+ "subblock filter weights must not exceed 2");
+
+ // Precompute the difference squared
+ for (row = 0; row < block_height; row++) {
+ for (blk_col = 0; blk_col < block_width; blk_col += 16) {
+ store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col,
+ y_dist_ptr + blk_col);
+ }
+ y_src_ptr += y_src_stride;
+ y_pre_ptr += y_pre_stride;
+ y_dist_ptr += DIST_STRIDE;
+ }
+
+ for (row = 0; row < chroma_height; row++) {
+ for (blk_col = 0; blk_col < chroma_width; blk_col += 8) {
+ store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col,
+ u_dist_ptr + blk_col);
+ store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col,
+ v_dist_ptr + blk_col);
+ }
+
+ u_src_ptr += uv_src_stride;
+ u_pre_ptr += uv_pre_stride;
+ u_dist_ptr += DIST_STRIDE;
+ v_src_ptr += uv_src_stride;
+ v_pre_ptr += uv_pre_stride;
+ v_dist_ptr += DIST_STRIDE;
+ }
+
+ y_dist_ptr = y_dist + 1;
+ u_dist_ptr = u_dist + 1;
+ v_dist_ptr = v_dist + 1;
+
+ apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, block_height,
+ ss_x, ss_y, strength, blk_fw_ptr, use_whole_blk,
+ y_accum, y_count, y_dist_ptr, u_dist_ptr,
+ v_dist_ptr);
+
+ apply_temporal_filter_chroma(u_pre, v_pre, uv_pre_stride, block_width,
+ block_height, ss_x, ss_y, strength, blk_fw_ptr,
+ use_whole_blk, u_accum, u_count, v_accum,
+ v_count, y_dist_ptr, u_dist_ptr, v_dist_ptr);
+}
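+
+// This function only accumulates weighted contributions; the caller
+// (vp9_temporal_filter.c) is expected to form the filtered pixel as
+// approximately accumulator / count, with rounding, per pixel.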
diff --git a/vp9/encoder/vp9_aq_complexity.c b/vp9/encoder/vp9_aq_complexity.c
index bd3812036..ef3423f8e 100644
--- a/vp9/encoder/vp9_aq_complexity.c
+++ b/vp9/encoder/vp9_aq_complexity.c
@@ -87,7 +87,7 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
&cpi->rc, cm->frame_type, cm->base_qindex,
aq_c_q_adj_factor[aq_strength][segment], cm->bit_depth);
- // For AQ complexity mode, we dont allow Q0 in a segment if the base
+ // For AQ complexity mode, we don't allow Q0 in a segment if the base
// Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment
// Q delta is sometimes applied without going back around the rd loop.
// This could lead to an illegal combination of partition size and q.
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index a84c8b524..ca56d14aa 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -169,8 +169,8 @@ static void pack_mb_tokens(vpx_writer *w, TOKENEXTRA **tp,
vpx_write_bit(w, p->extra & 1);
} else { // t >= TWO_TOKEN && t < EOB_TOKEN
const struct vp9_token *const a = &vp9_coef_encodings[t];
- const int v = a->value;
- const int n = a->len;
+ int v = a->value;
+ int n = a->len;
const int e = p->extra;
vpx_write(w, 1, context_tree[2]);
vp9_write_tree(w, vp9_coef_con_tree,
@@ -179,8 +179,8 @@ static void pack_mb_tokens(vpx_writer *w, TOKENEXTRA **tp,
if (t >= CATEGORY1_TOKEN) {
const vp9_extra_bit *const b = &extra_bits[t];
const unsigned char *pb = b->prob;
- int v = e >> 1;
- int n = b->len; // number of bits in v, assumed nonzero
+ v = e >> 1;
+ n = b->len; // number of bits in v, assumed nonzero
do {
const int bb = (v >> --n) & 1;
vpx_write(w, bb, *pb++);
@@ -599,7 +599,6 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi,
for (t = 0; t < entropy_nodes_update; ++t) {
vpx_prob newp = new_coef_probs[i][j][k][l][t];
vpx_prob *oldp = old_coef_probs[i][j][k][l] + t;
- const vpx_prob upd = DIFF_UPDATE_PROB;
int64_t s;
int u = 0;
if (t == PIVOT_NODE)
@@ -968,13 +967,13 @@ static void encode_tiles_buffer_alloc(VP9_COMP *const cpi) {
int i;
const size_t worker_data_size =
cpi->num_workers * sizeof(*cpi->vp9_bitstream_worker_data);
- CHECK_MEM_ERROR(cm, cpi->vp9_bitstream_worker_data,
+ CHECK_MEM_ERROR(&cm->error, cpi->vp9_bitstream_worker_data,
vpx_memalign(16, worker_data_size));
memset(cpi->vp9_bitstream_worker_data, 0, worker_data_size);
for (i = 1; i < cpi->num_workers; ++i) {
cpi->vp9_bitstream_worker_data[i].dest_size =
cpi->oxcf.width * cpi->oxcf.height;
- CHECK_MEM_ERROR(cm, cpi->vp9_bitstream_worker_data[i].dest,
+ CHECK_MEM_ERROR(&cm->error, cpi->vp9_bitstream_worker_data[i].dest,
vpx_malloc(cpi->vp9_bitstream_worker_data[i].dest_size));
}
}
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 20294b4b9..7fa00cd19 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -13,6 +13,7 @@
#include "vpx_util/vpx_thread.h"
+#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_entropymv.h"
#include "vp9/common/vp9_entropy.h"
@@ -24,7 +25,7 @@ typedef struct {
unsigned int sse;
int sum;
unsigned int var;
-} diff;
+} Diff;
struct macroblock_plane {
DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]);
@@ -33,8 +34,8 @@ struct macroblock_plane {
uint16_t *eobs;
struct buf_2d src;
- // Quantizer setings
- DECLARE_ALIGNED(16, int16_t, round_fp[8]);
+ // Quantizer settings
+ int16_t *round_fp;
int16_t *quant_fp;
int16_t *quant;
int16_t *quant_shift;
@@ -78,16 +79,16 @@ struct macroblock {
int skip_recode;
int skip_optimize;
int q_index;
- int block_qcoeff_opt;
+ double log_block_src_var;
int block_tx_domain;
// The equivalent error at the current rdmult of one whole bit (not one
// bitcost unit).
int errorperbit;
- // The equivalend SAD error of one (whole) bit at the current quantizer
+ // The equivalent SAD error of one (whole) bit at the current quantizer
// for large blocks.
int sadperbit16;
- // The equivalend SAD error of one (whole) bit at the current quantizer
+ // The equivalent SAD error of one (whole) bit at the current quantizer
// for sub-8x8 blocks.
int sadperbit4;
int rddiv;
@@ -127,7 +128,7 @@ struct macroblock {
// from extending outside the UMV borders
MvLimits mv_limits;
- // Notes transform blocks where no coefficents are coded.
+ // Notes transform blocks where no coefficients are coded.
// Set during mode selection. Read during block encoding.
uint8_t zcoeff_blk[TX_SIZES][256];
diff --git a/vp9/encoder/vp9_context_tree.c b/vp9/encoder/vp9_context_tree.c
index b74b9027c..42073f756 100644
--- a/vp9/encoder/vp9_context_tree.c
+++ b/vp9/encoder/vp9_context_tree.c
@@ -25,16 +25,17 @@ static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk,
int i, k;
ctx->num_4x4_blk = num_blk;
- CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, vpx_calloc(num_blk, sizeof(uint8_t)));
+ CHECK_MEM_ERROR(&cm->error, ctx->zcoeff_blk,
+ vpx_calloc(num_blk, sizeof(uint8_t)));
for (i = 0; i < MAX_MB_PLANE; ++i) {
for (k = 0; k < 3; ++k) {
- CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
+ CHECK_MEM_ERROR(&cm->error, ctx->coeff[i][k],
vpx_memalign(32, num_pix * sizeof(*ctx->coeff[i][k])));
- CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k],
+ CHECK_MEM_ERROR(&cm->error, ctx->qcoeff[i][k],
vpx_memalign(32, num_pix * sizeof(*ctx->qcoeff[i][k])));
- CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k],
+ CHECK_MEM_ERROR(&cm->error, ctx->dqcoeff[i][k],
vpx_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i][k])));
- CHECK_MEM_ERROR(cm, ctx->eobs[i][k],
+ CHECK_MEM_ERROR(&cm->error, ctx->eobs[i][k],
vpx_memalign(32, num_blk * sizeof(*ctx->eobs[i][k])));
ctx->coeff_pbuf[i][k] = ctx->coeff[i][k];
ctx->qcoeff_pbuf[i][k] = ctx->qcoeff[i][k];
@@ -100,10 +101,10 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) {
int nodes;
vpx_free(td->leaf_tree);
- CHECK_MEM_ERROR(cm, td->leaf_tree,
+ CHECK_MEM_ERROR(&cm->error, td->leaf_tree,
vpx_calloc(leaf_nodes, sizeof(*td->leaf_tree)));
vpx_free(td->pc_tree);
- CHECK_MEM_ERROR(cm, td->pc_tree,
+ CHECK_MEM_ERROR(&cm->error, td->pc_tree,
vpx_calloc(tree_nodes, sizeof(*td->pc_tree)));
this_pc = &td->pc_tree[0];
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 77d72396a..e5dffa90a 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -319,7 +319,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
filter_mbd->plane[2].dst.stride =
denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride;
- set_ref_ptrs(cm, filter_mbd, saved_frame, NONE);
+ set_ref_ptrs(cm, filter_mbd, saved_frame, NO_REF_FRAME);
vp9_build_inter_predictors_sby(filter_mbd, mi_row, mi_col, bs);
// Restore everything to its original state
@@ -387,7 +387,7 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index], consec_zeromv);
// No need to keep checking 8x8 blocks if any of the sub-blocks
// has small consec_zeromv (since threshold for no_skin based on
- // zero/small motion in skin detection is high, i.e, > 4).
+ // zero/small motion in skin detection is high, i.e., > 4).
if (consec_zeromv < 4) {
i = ymis;
break;
@@ -634,11 +634,11 @@ int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser,
denoiser->num_ref_frames = use_svc ? SVC_REF_FRAMES : NONSVC_REF_FRAMES;
init_num_ref_frames = use_svc ? MAX_REF_FRAMES : NONSVC_REF_FRAMES;
denoiser->num_layers = num_layers;
- CHECK_MEM_ERROR(cm, denoiser->running_avg_y,
+ CHECK_MEM_ERROR(&cm->error, denoiser->running_avg_y,
vpx_calloc(denoiser->num_ref_frames * num_layers,
sizeof(denoiser->running_avg_y[0])));
CHECK_MEM_ERROR(
- cm, denoiser->mc_running_avg_y,
+ &cm->error, denoiser->mc_running_avg_y,
vpx_calloc(num_layers, sizeof(denoiser->mc_running_avg_y[0])));
for (layer = 0; layer < num_layers; ++layer) {
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 1483ac069..7ff5f00ed 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -349,17 +349,17 @@ typedef struct {
int32_t sum_error;
int log2_count;
int variance;
-} var;
+} Var;
typedef struct {
- var none;
- var horz[2];
- var vert[2];
+ Var none;
+ Var horz[2];
+ Var vert[2];
} partition_variance;
typedef struct {
partition_variance part_variances;
- var split[4];
+ Var split[4];
} v4x4;
typedef struct {
@@ -384,7 +384,7 @@ typedef struct {
typedef struct {
partition_variance *part_variances;
- var *split[4];
+ Var *split[4];
} variance_node;
typedef enum {
@@ -436,13 +436,13 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
}
// Set variance values given sum square error, sum error, count.
-static void fill_variance(uint32_t s2, int32_t s, int c, var *v) {
+static void fill_variance(uint32_t s2, int32_t s, int c, Var *v) {
v->sum_square_error = s2;
v->sum_error = s;
v->log2_count = c;
}
-static void get_variance(var *v) {
+static void get_variance(Var *v) {
v->variance =
(int)(256 * (v->sum_square_error -
(uint32_t)(((int64_t)v->sum_error * v->sum_error) >>
@@ -450,7 +450,7 @@ static void get_variance(var *v) {
v->log2_count);
}
-static void sum_2_variances(const var *a, const var *b, var *r) {
+static void sum_2_variances(const Var *a, const Var *b, Var *r) {
assert(a->log2_count == b->log2_count);
fill_variance(a->sum_square_error + b->sum_square_error,
a->sum_error + b->sum_error, a->log2_count + 1, r);
@@ -1301,6 +1301,13 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
(frame_is_intra_only(cm) ||
(is_one_pass_svc(cpi) &&
cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame));
+
+ if (!is_key_frame) {
+ if (cm->frame_refs[LAST_FRAME - 1].sf.x_scale_fp == REF_INVALID_SCALE ||
+ cm->frame_refs[LAST_FRAME - 1].sf.y_scale_fp == REF_INVALID_SCALE)
+ is_key_frame = 1;
+ }
+
// Always use 4x4 partition for key frame.
const int use_4x4_partition = frame_is_intra_only(cm);
const int low_res = (cm->width <= 352 && cm->height <= 288);
@@ -1437,7 +1444,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
&cm->frame_refs[LAST_FRAME - 1].sf);
mi->ref_frame[0] = LAST_FRAME;
}
- mi->ref_frame[1] = NONE;
+ mi->ref_frame[1] = NO_REF_FRAME;
mi->sb_type = BLOCK_64X64;
mi->mv[0].as_int = 0;
mi->interp_filter = BILINEAR;
@@ -1545,7 +1552,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
}
if (low_res && threshold_4x4avg < INT64_MAX)
- CHECK_MEM_ERROR(cm, vt2, vpx_calloc(16, sizeof(*vt2)));
+ CHECK_MEM_ERROR(&cm->error, vt2, vpx_calloc(16, sizeof(*vt2)));
// Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
// for splits.
for (i = 0; i < 4; i++) {
@@ -1706,7 +1713,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
const int y16_idx = ((j >> 1) << 1);
// For inter frames: if variance4x4downsample[] == 1 for this 16x16
// block, then the variance is based on 4x4 down-sampling, so use vt2
- // in set_vt_partioning(), otherwise use vt.
+ // in set_vt_partitioning(), otherwise use vt.
v16x16 *vtemp = (!is_key_frame && variance4x4downsample[i2 + j] == 1)
? &vt2[i2 + j]
: &vt.split[i].split[j];
@@ -1863,8 +1870,8 @@ static void update_state(VP9_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx,
vp9_update_mv_count(td);
if (cm->interp_filter == SWITCHABLE) {
- const int ctx = get_pred_context_switchable_interp(xd);
- ++td->counts->switchable_interp[ctx][xdmi->interp_filter];
+ const int ctx_interp = get_pred_context_switchable_interp(xd);
+ ++td->counts->switchable_interp[ctx_interp][xdmi->interp_filter];
}
}
@@ -1924,7 +1931,7 @@ static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode,
mi->skip = 1;
mi->uv_mode = DC_PRED;
mi->ref_frame[0] = LAST_FRAME;
- mi->ref_frame[1] = NONE;
+ mi->ref_frame[1] = NO_REF_FRAME;
mi->mv[0].as_int = 0;
mi->interp_filter = filter_ref;
@@ -1980,6 +1987,9 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
int64_t best_rd = INT64_MAX;
vpx_clear_system_state();
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, rd_pick_sb_modes_time);
+#endif
// Use the lower precision, but faster, 32x32 fdct for mode selection.
x->use_lp32x32fdct = 1;
@@ -2018,20 +2028,20 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
// Save rdmult before it might be changed, so it can be restored later.
orig_rdmult = x->rdmult;
- if ((cpi->sf.tx_domain_thresh > 0.0) || (cpi->sf.quant_opt_thresh > 0.0)) {
+ if ((cpi->sf.tx_domain_thresh > 0.0) ||
+ (cpi->sf.trellis_opt_tx_rd.thresh > 0.0)) {
double logvar = vp9_log_block_var(cpi, x, bsize);
- // Check block complexity as part of descision on using pixel or transform
+ // Check block complexity as part of decision on using pixel or transform
// domain distortion in rd tests.
x->block_tx_domain = cpi->sf.allow_txfm_domain_distortion &&
(logvar >= cpi->sf.tx_domain_thresh);
- // Check block complexity as part of descision on using quantized
- // coefficient optimisation inside the rd loop.
- x->block_qcoeff_opt =
- cpi->sf.allow_quant_coeff_opt && (logvar <= cpi->sf.quant_opt_thresh);
+ // Store block complexity to decide on using quantized coefficient
+ // optimization inside the rd loop.
+ x->log_block_src_var = logvar;
} else {
x->block_tx_domain = cpi->sf.allow_txfm_domain_distortion;
- x->block_qcoeff_opt = cpi->sf.allow_quant_coeff_opt;
+ x->log_block_src_var = 0.0;
}
set_segment_index(cpi, x, mi_row, mi_col, bsize, 0);
@@ -2047,15 +2057,27 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd);
} else {
if (bsize >= BLOCK_8X8) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, vp9_rd_pick_inter_mode_sb_time);
+#endif
if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP))
vp9_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, rd_cost, bsize,
ctx, best_rd);
else
vp9_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost,
bsize, ctx, best_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, vp9_rd_pick_inter_mode_sb_time);
+#endif
} else {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, vp9_rd_pick_inter_mode_sub8x8_time);
+#endif
vp9_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col, rd_cost,
bsize, ctx, best_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, vp9_rd_pick_inter_mode_sub8x8_time);
+#endif
}
}
@@ -2078,6 +2100,9 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
ctx->rate = rd_cost->rate;
ctx->dist = rd_cost->dist;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, rd_pick_sb_modes_time);
+#endif
}
#endif // !CONFIG_REALTIME_ONLY
@@ -2414,16 +2439,16 @@ static void set_source_var_based_partition(VP9_COMP *cpi,
(row8x8_remaining >= MI_BLOCK_SIZE)) {
int i, j;
int index;
- diff d32[4];
+ Diff d32[4];
const int offset = (mi_row >> 1) * cm->mb_cols + (mi_col >> 1);
int is_larger_better = 0;
int use32x32 = 0;
unsigned int thr = cpi->source_var_thresh;
- memset(d32, 0, 4 * sizeof(diff));
+ memset(d32, 0, sizeof(d32));
for (i = 0; i < 4; i++) {
- diff *d16[4];
+ Diff *d16[4];
for (j = 0; j < 4; j++) {
int b_mi_row = coord_lookup[i * 4 + j].row;
@@ -2730,10 +2755,10 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
mi_row + (mi_step >> 1) < cm->mi_rows) {
RD_COST tmp_rdc;
- PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
+ PICK_MODE_CONTEXT *hctx = &pc_tree->horizontal[0];
vp9_rd_cost_init(&tmp_rdc);
- update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
- encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
+ update_state(cpi, td, hctx, mi_row, mi_col, subsize, 0);
+ encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, hctx);
pc_tree->horizontal[1].skip_ref_frame_mask = 0;
rd_pick_sb_modes(cpi, tile_data, x, mi_row + (mi_step >> 1), mi_col,
&tmp_rdc, subsize, &pc_tree->horizontal[1], INT_MAX,
@@ -2754,10 +2779,10 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
mi_col + (mi_step >> 1) < cm->mi_cols) {
RD_COST tmp_rdc;
- PICK_MODE_CONTEXT *ctx = &pc_tree->vertical[0];
+ PICK_MODE_CONTEXT *vctx = &pc_tree->vertical[0];
vp9_rd_cost_init(&tmp_rdc);
- update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
- encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
+ update_state(cpi, td, vctx, mi_row, mi_col, subsize, 0);
+ encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, vctx);
pc_tree->vertical[bsize > BLOCK_8X8].skip_ref_frame_mask = 0;
rd_pick_sb_modes(
cpi, tile_data, x, mi_row, mi_col + (mi_step >> 1), &tmp_rdc,
@@ -2829,8 +2854,6 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
int x_idx = (i & 1) * (mi_step >> 1);
int y_idx = (i >> 1) * (mi_step >> 1);
RD_COST tmp_rdc;
- ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
- PARTITION_CONTEXT sl[8], sa[8];
if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
continue;
@@ -3036,14 +3059,12 @@ static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd, int mi_row,
min_size = BLOCK_64X64;
max_size = BLOCK_4X4;
- if (prev_mi) {
- for (idy = 0; idy < mi_height; ++idy) {
- for (idx = 0; idx < mi_width; ++idx) {
- mi = prev_mi[idy * cm->mi_stride + idx];
- bs = mi ? mi->sb_type : bsize;
- min_size = VPXMIN(min_size, bs);
- max_size = VPXMAX(max_size, bs);
- }
+ for (idy = 0; idy < mi_height; ++idy) {
+ for (idx = 0; idx < mi_width; ++idx) {
+ mi = prev_mi[idy * cm->mi_stride + idx];
+ bs = mi ? mi->sb_type : bsize;
+ min_size = VPXMIN(min_size, bs);
+ max_size = VPXMAX(max_size, bs);
}
}
@@ -3189,7 +3210,7 @@ static int ml_pruning_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd,
left_par = 1;
}
- if (prev_mi) {
+ if (prev_mi[0]) {
context_size = prev_mi[0]->sb_type;
if (context_size < bsize)
last_par = 2;
@@ -3422,18 +3443,23 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x,
MV ref_mv_full = { ref_mv.row >> 3, ref_mv.col >> 3 };
MV best_mv = { 0, 0 };
int cost_list[5];
+ struct buf_2d backup_pre[MAX_MB_PLANE] = { { 0, 0 } };
- if (scaled_ref_frame)
+ if (scaled_ref_frame) {
yv12 = scaled_ref_frame;
- else
+    // As reported in b/311294795, the reference buffer pointers need to be
+    // saved and restored after the search. Otherwise, they cause problems
+    // when the reference frame scaling happens.
+ for (int i = 0; i < MAX_MB_PLANE; i++) backup_pre[i] = xd->plane[i].pre[0];
+ } else {
yv12 = get_ref_frame_buffer(cpi, ref);
+ }
assert(yv12 != NULL);
if (!yv12) return;
- vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
- &cm->frame_refs[ref - 1].sf);
+ vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, NULL);
mi->ref_frame[0] = ref;
- mi->ref_frame[1] = NONE;
+ mi->ref_frame[1] = NO_REF_FRAME;
mi->sb_type = bsize;
vp9_set_mv_search_range(&x->mv_limits, &ref_mv);
vp9_full_pixel_search(cpi, x, bsize, &ref_mv_full, step_param, search_method,
@@ -3444,6 +3470,11 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x,
x->mv_limits = tmp_mv_limits;
mi->mv[0].as_mv = best_mv;
+ // Restore reference buffer pointer.
+ if (scaled_ref_frame) {
+ for (int i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_pre[i];
+ }
+
set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
xd->plane[0].dst.buf = pred_buf;
xd->plane[0].dst.stride = 64;
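
The hunks above save the per-plane pre[0] buffers before pointing them at the scaled reference and restore them once the search returns. A self-contained sketch of that save/point/restore pattern, using simplified stand-in types rather than the real buf_2d/MACROBLOCKD structures:

/* Illustrative sketch only: the backup/restore pattern applied above. */
#define EXAMPLE_PLANES 3

struct example_buf {
  unsigned char *buf;
  int stride;
};

static void example_search_with_backup(struct example_buf pre[EXAMPLE_PLANES],
                                       const struct example_buf *scaled,
                                       void (*run_search)(void)) {
  struct example_buf backup[EXAMPLE_PLANES];
  int i;
  /* Point the prediction planes at the scaled reference, remembering the
   * original pointers. */
  for (i = 0; i < EXAMPLE_PLANES; ++i) {
    backup[i] = pre[i];
    pre[i] = scaled[i];
  }
  run_search();
  /* Restore, so that later stages (e.g. reference frame scaling) still see
   * the original buffers. */
  for (i = 0; i < EXAMPLE_PLANES; ++i) pre[i] = backup[i];
}
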
@@ -3454,15 +3485,15 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x,
// Features used: QP; spatial block size contexts; variance of prediction
// residue after simple_motion_search.
#define FEATURES 12
-static void ml_predict_var_rd_paritioning(const VP9_COMP *const cpi,
- MACROBLOCK *const x,
- PC_TREE *const pc_tree,
- BLOCK_SIZE bsize, int mi_row,
- int mi_col, int *none, int *split) {
+static void ml_predict_var_rd_partitioning(const VP9_COMP *const cpi,
+ MACROBLOCK *const x,
+ PC_TREE *const pc_tree,
+ BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int *none, int *split) {
const VP9_COMMON *const cm = &cpi->common;
const NN_CONFIG *nn_config = NULL;
+ const MACROBLOCKD *const xd = &x->e_mbd;
#if CONFIG_VP9_HIGHBITDEPTH
- MACROBLOCKD *xd = &x->e_mbd;
DECLARE_ALIGNED(16, uint8_t, pred_buffer[64 * 64 * 2]);
uint8_t *const pred_buf = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
? (CONVERT_TO_BYTEPTR(pred_buffer))
@@ -3545,7 +3576,6 @@ static void ml_predict_var_rd_paritioning(const VP9_COMP *const cpi,
const unsigned int var =
cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse);
const float factor = (var == 0) ? 1.0f : (1.0f / (float)var);
- const MACROBLOCKD *const xd = &x->e_mbd;
const int has_above = !!xd->above_mi;
const int has_left = !!xd->left_mi;
const BLOCK_SIZE above_bsize = has_above ? xd->above_mi->sb_type : bsize;
@@ -3695,7 +3725,6 @@ static int get_rdmult_delta(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
int row, col;
int dr = 0;
- int count = 0;
double r0, rk, beta;
TplDepFrame *tpl_frame;
@@ -3719,8 +3748,6 @@ static int get_rdmult_delta(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
intra_cost += this_stats->intra_cost;
mc_dep_cost += this_stats->mc_dep_cost;
-
- ++count;
}
}
@@ -3777,7 +3804,7 @@ static void assign_motion_vector_info(const int block_width_4x4,
const int col_4x4 = col_start_4x4 + j;
const int unit_index = row_4x4 * num_unit_cols + col_4x4;
if (row_4x4 >= num_unit_rows || col_4x4 >= num_unit_cols) continue;
- if (source_ref_frame[1] == NONE) {
+ if (source_ref_frame[1] == NO_REF_FRAME) {
assert(source_mv[1]->row == 0 && source_mv[1]->col == 0);
}
motion_vector_info[unit_index].ref_frame[0] = source_ref_frame[0];
@@ -4080,8 +4107,8 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
mi_row + num_8x8_blocks_high_lookup[bsize] <= cm->mi_rows &&
mi_col + num_8x8_blocks_wide_lookup[bsize] <= cm->mi_cols;
if (do_rd_ml_partition_var_pruning) {
- ml_predict_var_rd_paritioning(cpi, x, pc_tree, bsize, mi_row, mi_col,
- &partition_none_allowed, &do_split);
+ ml_predict_var_rd_partitioning(cpi, x, pc_tree, bsize, mi_row, mi_col,
+ &partition_none_allowed, &do_split);
} else {
vp9_zero(pc_tree->mv);
}
@@ -4330,9 +4357,9 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + mi_step < cm->mi_rows &&
bsize > BLOCK_8X8) {
- PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
- update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
- encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
+ PICK_MODE_CONTEXT *hctx = &pc_tree->horizontal[0];
+ update_state(cpi, td, hctx, mi_row, mi_col, subsize, 0);
+ encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, hctx);
if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
partition_none_allowed)
pc_tree->horizontal[1].pred_interp_filter = pred_interp_filter;
@@ -4407,12 +4434,31 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
}
+ if (bsize == BLOCK_64X64 && best_rdc.rdcost == INT64_MAX) {
+ vp9_rd_cost_reset(&this_rdc);
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, BLOCK_64X64,
+ ctx, INT_MAX, INT64_MAX);
+ ctx->rdcost = this_rdc.rdcost;
+ vp9_rd_cost_update(partition_mul, x->rddiv, &this_rdc);
+ if (this_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = this_rdc;
+ should_encode_sb = 1;
+ pc_tree->partitioning = PARTITION_NONE;
+ }
+ }
+
*rd_cost = best_rdc;
if (should_encode_sb && pc_tree->index != 3) {
int output_enabled = (bsize == BLOCK_64X64);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, encode_sb_time);
+#endif
encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize,
pc_tree);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, encode_sb_time);
+#endif
#if CONFIG_RATE_CTRL
if (oxcf->use_simple_encode_api) {
// Store partition, motion vector of the superblock.
@@ -4539,8 +4585,15 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td,
&x->min_partition_size, &x->max_partition_size);
}
td->pc_root->none.rdcost = 0;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, rd_pick_partition_time);
+#endif
rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64,
&dummy_rdc, dummy_rdc, td->pc_root);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, rd_pick_partition_time);
+#endif
}
(*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row,
sb_col_in_tile, num_sb_cols);
@@ -4672,6 +4725,8 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
set_segment_index(cpi, x, mi_row, mi_col, bsize, 0);
+ x->skip_recode = 0;
+
mi = xd->mi[0];
mi->sb_type = bsize;
@@ -4795,9 +4850,9 @@ static void pred_pixel_ready_reset(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
#define FEATURES 6
#define LABELS 2
-static int ml_predict_var_paritioning(VP9_COMP *cpi, MACROBLOCK *x,
- BLOCK_SIZE bsize, int mi_row,
- int mi_col) {
+static int ml_predict_var_partitioning(VP9_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mi_row,
+ int mi_col) {
VP9_COMMON *const cm = &cpi->common;
const NN_CONFIG *nn_config = NULL;
@@ -4929,7 +4984,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td,
if (partition_none_allowed || do_split) do_rect = 0;
if (partition_none_allowed && do_split) {
const int ml_predicted_partition =
- ml_predict_var_paritioning(cpi, x, bsize, mi_row, mi_col);
+ ml_predict_var_partitioning(cpi, x, bsize, mi_row, mi_col);
if (ml_predicted_partition == PARTITION_NONE) do_split = 0;
if (ml_predicted_partition == PARTITION_SPLIT) partition_none_allowed = 0;
}
@@ -5418,7 +5473,7 @@ static void get_estimated_pred(VP9_COMP *cpi, const TileInfo *const tile,
&cm->frame_refs[LAST_FRAME - 1].sf);
mi->ref_frame[0] = LAST_FRAME;
}
- mi->ref_frame[1] = NONE;
+ mi->ref_frame[1] = NO_REF_FRAME;
mi->sb_type = BLOCK_64X64;
mi->mv[0].as_int = 0;
mi->interp_filter = BILINEAR;
@@ -5608,7 +5663,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td,
if ((cpi->oxcf.rc_mode == VPX_VBR && cpi->rc.high_source_sad &&
cpi->oxcf.speed < 6 && !frame_is_intra_only(cm) &&
(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) {
- // Use lower max_partition_size for low resoultions.
+ // Use lower max_partition_size for low resolutions.
if (cm->width <= 352 && cm->height <= 288)
x->max_partition_size = BLOCK_32X32;
else
@@ -5650,12 +5705,12 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td,
}
// end RTC play code
-static INLINE uint32_t variance(const diff *const d) {
+static INLINE uint32_t variance(const Diff *const d) {
return d->sse - (uint32_t)(((int64_t)d->sum * d->sum) >> 8);
}
#if CONFIG_VP9_HIGHBITDEPTH
-static INLINE uint32_t variance_highbd(diff *const d) {
+static INLINE uint32_t variance_highbd(Diff *const d) {
const int64_t var = (int64_t)d->sse - (((int64_t)d->sum * d->sum) >> 8);
return (var >= 0) ? (uint32_t)var : 0;
}
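
For reference, the >> 8 in variance() above divides sum * sum by 256, the sample count of a 16x16 block, so the function returns the sum of squared deviations about the mean (256 times the per-sample variance). An equivalent plain formulation:

/* Equivalent formulation of variance() above for a 16x16 block (256 samples):
 * result = SSE - sum^2 / 256, i.e. the sum of squared deviations. */
static unsigned int example_var16x16(long long sse, long long sum) {
  return (unsigned int)(sse - (sum * sum) / 256);
}
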
@@ -5675,7 +5730,7 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) {
? (cm->MBs * VAR_HIST_LARGE_CUT_OFF / 100)
: (cm->MBs * VAR_HIST_SMALL_CUT_OFF / 100);
DECLARE_ALIGNED(16, int, hist[VAR_HIST_BINS]);
- diff *var16 = cpi->source_diff_var;
+ Diff *var16 = cpi->source_diff_var;
int sum = 0;
int i, j;
@@ -5758,8 +5813,8 @@ static void source_var_based_partition_search_method(VP9_COMP *cpi) {
if (cm->last_width != cm->width || cm->last_height != cm->height) {
if (cpi->source_diff_var) vpx_free(cpi->source_diff_var);
- CHECK_MEM_ERROR(cm, cpi->source_diff_var,
- vpx_calloc(cm->MBs, sizeof(diff)));
+ CHECK_MEM_ERROR(&cm->error, cpi->source_diff_var,
+                    vpx_calloc(cm->MBs, sizeof(*cpi->source_diff_var)));
}
if (!cpi->frames_till_next_var_check)
@@ -5798,7 +5853,7 @@ void vp9_init_tile_data(VP9_COMP *cpi) {
if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) {
if (cpi->tile_data != NULL) vpx_free(cpi->tile_data);
CHECK_MEM_ERROR(
- cm, cpi->tile_data,
+ &cm->error, cpi->tile_data,
vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data)));
cpi->allocated_tiles = tile_cols * tile_rows;
@@ -5807,20 +5862,15 @@ void vp9_init_tile_data(VP9_COMP *cpi) {
TileDataEnc *tile_data =
&cpi->tile_data[tile_row * tile_cols + tile_col];
int i, j;
+ const MV zero_mv = { 0, 0 };
for (i = 0; i < BLOCK_SIZES; ++i) {
for (j = 0; j < MAX_MODES; ++j) {
tile_data->thresh_freq_fact[i][j] = RD_THRESH_INIT_FACT;
-#if CONFIG_RATE_CTRL
- if (cpi->oxcf.use_simple_encode_api) {
- tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT;
- }
-#endif // CONFIG_RATE_CTRL
-#if CONFIG_CONSISTENT_RECODE
tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT;
-#endif // CONFIG_CONSISTENT_RECODE
tile_data->mode_map[i][j] = j;
}
}
+ tile_data->firstpass_top_mv = zero_mv;
#if CONFIG_MULTITHREAD
tile_data->row_base_thresh_freq_fact = NULL;
#endif
@@ -6037,9 +6087,7 @@ static void encode_frame_internal(VP9_COMP *cpi) {
x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4;
#endif // CONFIG_VP9_HIGHBITDEPTH
x->inv_txfm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
-#if CONFIG_CONSISTENT_RECODE
x->optimize = sf->optimize_coefficients == 1 && cpi->oxcf.pass != 1;
-#endif
if (xd->lossless) x->optimize = 0;
x->sharpness = cpi->oxcf.sharpness;
x->adjust_rdmult_by_segment = (cpi->oxcf.aq_mode == VARIANCE_AQ);
@@ -6108,6 +6156,15 @@ static void encode_frame_internal(VP9_COMP *cpi) {
cpi->rd.r0 = (double)intra_cost_base / mc_dep_cost_base;
}
+ for (MV_REFERENCE_FRAME ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME;
+ ++ref_frame) {
+ if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) {
+ if (cm->frame_refs[ref_frame - 1].sf.x_scale_fp == REF_INVALID_SCALE ||
+ cm->frame_refs[ref_frame - 1].sf.y_scale_fp == REF_INVALID_SCALE)
+ cpi->ref_frame_flags &= ~ref_frame_to_flag(ref_frame);
+ }
+ }
+
// Frame segmentation
if (cpi->oxcf.aq_mode == PERCEPTUAL_AQ) build_kmeans_segmentation(cpi);
@@ -6166,7 +6223,6 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) {
int mi_row, mi_col;
int sum_delta = 0;
- int map_index = 0;
int qdelta_index;
int segment_id;
@@ -6176,7 +6232,6 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) {
segment_id = mi_8x8[0]->segment_id;
qdelta_index = get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
sum_delta += qdelta_index;
- map_index++;
}
mi_8x8_ptr += cm->mi_stride;
}
@@ -6184,13 +6239,11 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) {
return sum_delta / (cm->mi_rows * cm->mi_cols);
}
-#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
static void restore_encode_params(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
- const int tile_cols = 1 << cm->log2_tile_cols;
- const int tile_rows = 1 << cm->log2_tile_rows;
- int tile_col, tile_row;
+ int tile_idx;
int i, j;
+ TileDataEnc *tile_data;
RD_OPT *rd_opt = &cpi->rd;
for (i = 0; i < MAX_REF_FRAMES; i++) {
for (j = 0; j < REFERENCE_MODES; j++)
@@ -6201,35 +6254,19 @@ static void restore_encode_params(VP9_COMP *cpi) {
rd_opt->filter_threshes[i][j] = rd_opt->filter_threshes_prev[i][j];
}
- if (cpi->tile_data != NULL) {
- for (tile_row = 0; tile_row < tile_rows; ++tile_row)
- for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
- TileDataEnc *tile_data =
- &cpi->tile_data[tile_row * tile_cols + tile_col];
- for (i = 0; i < BLOCK_SIZES; ++i) {
- for (j = 0; j < MAX_MODES; ++j) {
- tile_data->thresh_freq_fact[i][j] =
- tile_data->thresh_freq_fact_prev[i][j];
- }
- }
- }
+ for (tile_idx = 0; tile_idx < cpi->allocated_tiles; tile_idx++) {
+ assert(cpi->tile_data);
+ tile_data = &cpi->tile_data[tile_idx];
+ vp9_copy(tile_data->thresh_freq_fact, tile_data->thresh_freq_fact_prev);
}
cm->interp_filter = cpi->sf.default_interp_filter;
}
-#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
void vp9_encode_frame(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
-#if CONFIG_RATE_CTRL
- if (cpi->oxcf.use_simple_encode_api) {
- restore_encode_params(cpi);
- }
-#endif // CONFIG_RATE_CTRL
-#if CONFIG_CONSISTENT_RECODE
restore_encode_params(cpi);
-#endif
#if CONFIG_MISMATCH_DEBUG
mismatch_reset_frame(MAX_MB_PLANE);
@@ -6283,7 +6320,13 @@ void vp9_encode_frame(VP9_COMP *cpi) {
if (cm->interp_filter == SWITCHABLE)
cm->interp_filter = get_interp_filter(filter_thrs, is_alt_ref);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, encode_frame_internal_time);
+#endif
encode_frame_internal(cpi);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, encode_frame_internal_time);
+#endif
for (i = 0; i < REFERENCE_MODES; ++i)
mode_thrs[i] = (mode_thrs[i] + rdc->comp_pred_diff[i] / cm->MBs) / 2;
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index fa222f9dc..eded9f5c4 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -26,6 +26,7 @@
#include "vp9/common/vp9_scan.h"
#include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_rd.h"
#include "vp9/encoder/vp9_tokenize.h"
@@ -78,7 +79,7 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
const int shift = (tx_size == TX_32X32);
const int16_t *const dequant_ptr = pd->dequant;
const uint8_t *const band_translate = get_band_translate(tx_size);
- const scan_order *const so = get_scan(xd, tx_size, plane_type, block);
+ const ScanOrder *const so = get_scan(xd, tx_size, plane_type, block);
const int16_t *const scan = so->scan;
const int16_t *const nb = so->neighbors;
const MODE_INFO *mbmi = xd->mi[0];
@@ -350,7 +351,7 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col,
MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &xd->plane[plane];
- const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
+ const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size];
tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
@@ -366,28 +367,24 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col,
switch (tx_size) {
case TX_32X32:
highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
- vp9_highbd_quantize_fp_32x32(coeff, 1024, p->round_fp, p->quant_fp,
- qcoeff, dqcoeff, pd->dequant, eob,
- scan_order->scan, scan_order->iscan);
+ vp9_highbd_quantize_fp_32x32(coeff, 1024, p, qcoeff, dqcoeff,
+ pd->dequant, eob, scan_order);
break;
case TX_16X16:
vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
- vp9_highbd_quantize_fp(coeff, 256, p->round_fp, p->quant_fp, qcoeff,
- dqcoeff, pd->dequant, eob, scan_order->scan,
- scan_order->iscan);
+ vp9_highbd_quantize_fp(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order);
break;
case TX_8X8:
vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
- vp9_highbd_quantize_fp(coeff, 64, p->round_fp, p->quant_fp, qcoeff,
- dqcoeff, pd->dequant, eob, scan_order->scan,
- scan_order->iscan);
+ vp9_highbd_quantize_fp(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order);
break;
default:
assert(tx_size == TX_4X4);
x->fwd_txfm4x4(src_diff, coeff, diff_stride);
- vp9_highbd_quantize_fp(coeff, 16, p->round_fp, p->quant_fp, qcoeff,
- dqcoeff, pd->dequant, eob, scan_order->scan,
- scan_order->iscan);
+ vp9_highbd_quantize_fp(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order);
break;
}
return;
@@ -397,26 +394,25 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col,
switch (tx_size) {
case TX_32X32:
fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
- vp9_quantize_fp_32x32(coeff, 1024, p->round_fp, p->quant_fp, qcoeff,
- dqcoeff, pd->dequant, eob, scan_order->scan,
- scan_order->iscan);
+ vp9_quantize_fp_32x32(coeff, 1024, p, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order);
break;
case TX_16X16:
vpx_fdct16x16(src_diff, coeff, diff_stride);
- vp9_quantize_fp(coeff, 256, p->round_fp, p->quant_fp, qcoeff, dqcoeff,
- pd->dequant, eob, scan_order->scan, scan_order->iscan);
+ vp9_quantize_fp(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order);
break;
case TX_8X8:
vpx_fdct8x8(src_diff, coeff, diff_stride);
- vp9_quantize_fp(coeff, 64, p->round_fp, p->quant_fp, qcoeff, dqcoeff,
- pd->dequant, eob, scan_order->scan, scan_order->iscan);
+ vp9_quantize_fp(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order);
break;
default:
assert(tx_size == TX_4X4);
x->fwd_txfm4x4(src_diff, coeff, diff_stride);
- vp9_quantize_fp(coeff, 16, p->round_fp, p->quant_fp, qcoeff, dqcoeff,
- pd->dequant, eob, scan_order->scan, scan_order->iscan);
+ vp9_quantize_fp(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order);
break;
}
}
@@ -495,7 +491,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col,
MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &xd->plane[plane];
- const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
+ const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size];
tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
@@ -511,28 +507,24 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col,
switch (tx_size) {
case TX_32X32:
highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
- vpx_highbd_quantize_b_32x32(
- coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff,
- dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan);
+ vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order);
break;
case TX_16X16:
vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
- vpx_highbd_quantize_b(coeff, 256, p->zbin, p->round, p->quant,
- p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
- scan_order->scan, scan_order->iscan);
+ vpx_highbd_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order);
break;
case TX_8X8:
vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
- vpx_highbd_quantize_b(coeff, 64, p->zbin, p->round, p->quant,
- p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
- scan_order->scan, scan_order->iscan);
+ vpx_highbd_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order);
break;
default:
assert(tx_size == TX_4X4);
x->fwd_txfm4x4(src_diff, coeff, diff_stride);
- vpx_highbd_quantize_b(coeff, 16, p->zbin, p->round, p->quant,
- p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
- scan_order->scan, scan_order->iscan);
+ vpx_highbd_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order);
break;
}
return;
@@ -542,28 +534,24 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col,
switch (tx_size) {
case TX_32X32:
fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
- vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant,
- p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
- scan_order->scan, scan_order->iscan);
+ vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order);
break;
case TX_16X16:
vpx_fdct16x16(src_diff, coeff, diff_stride);
- vpx_quantize_b(coeff, 256, p->zbin, p->round, p->quant, p->quant_shift,
- qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
- scan_order->iscan);
+ vpx_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order);
break;
case TX_8X8:
vpx_fdct8x8(src_diff, coeff, diff_stride);
- vpx_quantize_b(coeff, 64, p->zbin, p->round, p->quant, p->quant_shift,
- qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
- scan_order->iscan);
+ vpx_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order);
break;
default:
assert(tx_size == TX_4X4);
x->fwd_txfm4x4(src_diff, coeff, diff_stride);
- vpx_quantize_b(coeff, 16, p->zbin, p->round, p->quant, p->quant_shift,
- qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
- scan_order->iscan);
+ vpx_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order);
break;
}
}
@@ -759,10 +747,23 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col,
MODE_INFO *mi = xd->mi[0];
int plane;
#if CONFIG_MISMATCH_DEBUG
- struct encode_b_args arg = { x, 1, NULL, NULL,
+ struct encode_b_args arg = { x,
+ 1, // enable_trellis_opt
+ 0.0, // trellis_opt_thresh
+ NULL, // &sse_calc_done
+ NULL, // &sse
+ NULL, // above entropy context
+ NULL, // left entropy context
&mi->skip, mi_row, mi_col, output_enabled };
#else
- struct encode_b_args arg = { x, 1, NULL, NULL, &mi->skip };
+ struct encode_b_args arg = { x,
+ 1, // enable_trellis_opt
+ 0.0, // trellis_opt_thresh
+ NULL, // &sse_calc_done
+ NULL, // &sse
+ NULL, // above entropy context
+ NULL, // left entropy context
+ &mi->skip };
(void)mi_row;
(void)mi_col;
(void)output_enabled;
@@ -780,9 +781,9 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col,
const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) : mi->tx_size;
vp9_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane],
ctx.tl[plane]);
- arg.enable_coeff_opt = 1;
+ arg.enable_trellis_opt = 1;
} else {
- arg.enable_coeff_opt = 0;
+ arg.enable_trellis_opt = 0;
}
arg.ta = ctx.ta[plane];
arg.tl = ctx.tl[plane];
@@ -804,7 +805,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block);
tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
- const scan_order *scan_order;
+ const ScanOrder *scan_order;
TX_TYPE tx_type = DCT_DCT;
PREDICTION_MODE mode;
const int bwl = b_width_log2_lookup[plane_bsize];
@@ -814,17 +815,13 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
uint16_t *eob = &p->eobs[block];
const int src_stride = p->src.stride;
const int dst_stride = pd->dst.stride;
+ int enable_trellis_opt = !x->skip_recode;
ENTROPY_CONTEXT *a = NULL;
ENTROPY_CONTEXT *l = NULL;
int entropy_ctx = 0;
dst = &pd->dst.buf[4 * (row * dst_stride + col)];
src = &p->src.buf[4 * (row * src_stride + col)];
src_diff = &p->src_diff[4 * (row * diff_stride + col)];
- if (args->enable_coeff_opt) {
- a = &args->ta[col];
- l = &args->tl[row];
- entropy_ctx = combine_entropy_contexts(*a, *l);
- }
if (tx_size == TX_4X4) {
tx_type = get_tx_type_4x4(get_plane_type(plane), xd, block);
@@ -848,20 +845,42 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
// skip block condition should be handled before this is called.
assert(!x->skip_block);
+ if (!x->skip_recode) {
+ const int tx_size_in_pixels = (1 << tx_size) << 2;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vpx_highbd_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff,
+ diff_stride, src, src_stride, dst, dst_stride,
+ xd->bd);
+ } else {
+ vpx_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff,
+ diff_stride, src, src_stride, dst, dst_stride);
+ }
+#else
+ vpx_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff,
+ diff_stride, src, src_stride, dst, dst_stride);
+#endif
+ enable_trellis_opt = do_trellis_opt(pd, src_diff, diff_stride, row, col,
+ plane_bsize, tx_size, args);
+ }
+
+ if (enable_trellis_opt) {
+ a = &args->ta[col];
+ l = &args->tl[row];
+ entropy_ctx = combine_entropy_contexts(*a, *l);
+ }
+
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
switch (tx_size) {
case TX_32X32:
if (!x->skip_recode) {
- vpx_highbd_subtract_block(32, 32, src_diff, diff_stride, src,
- src_stride, dst, dst_stride, xd->bd);
highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
- vpx_highbd_quantize_b_32x32(
- coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff,
- dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan);
+ vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant,
+ eob, scan_order);
}
- if (args->enable_coeff_opt && !x->skip_recode) {
+ if (enable_trellis_opt) {
*a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
}
if (!x->skip_encode && *eob) {
@@ -870,17 +889,14 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
break;
case TX_16X16:
if (!x->skip_recode) {
- vpx_highbd_subtract_block(16, 16, src_diff, diff_stride, src,
- src_stride, dst, dst_stride, xd->bd);
if (tx_type == DCT_DCT)
vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
else
vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
- vpx_highbd_quantize_b(coeff, 256, p->zbin, p->round, p->quant,
- p->quant_shift, qcoeff, dqcoeff, pd->dequant,
- eob, scan_order->scan, scan_order->iscan);
+ vpx_highbd_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant,
+ eob, scan_order);
}
- if (args->enable_coeff_opt && !x->skip_recode) {
+ if (enable_trellis_opt) {
*a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
}
if (!x->skip_encode && *eob) {
@@ -890,17 +906,14 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
break;
case TX_8X8:
if (!x->skip_recode) {
- vpx_highbd_subtract_block(8, 8, src_diff, diff_stride, src,
- src_stride, dst, dst_stride, xd->bd);
if (tx_type == DCT_DCT)
vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
else
vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
- vpx_highbd_quantize_b(coeff, 64, p->zbin, p->round, p->quant,
- p->quant_shift, qcoeff, dqcoeff, pd->dequant,
- eob, scan_order->scan, scan_order->iscan);
+ vpx_highbd_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order);
}
- if (args->enable_coeff_opt && !x->skip_recode) {
+ if (enable_trellis_opt) {
*a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
}
if (!x->skip_encode && *eob) {
@@ -911,17 +924,14 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
default:
assert(tx_size == TX_4X4);
if (!x->skip_recode) {
- vpx_highbd_subtract_block(4, 4, src_diff, diff_stride, src,
- src_stride, dst, dst_stride, xd->bd);
if (tx_type != DCT_DCT)
vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
else
x->fwd_txfm4x4(src_diff, coeff, diff_stride);
- vpx_highbd_quantize_b(coeff, 16, p->zbin, p->round, p->quant,
- p->quant_shift, qcoeff, dqcoeff, pd->dequant,
- eob, scan_order->scan, scan_order->iscan);
+ vpx_highbd_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order);
}
- if (args->enable_coeff_opt && !x->skip_recode) {
+ if (enable_trellis_opt) {
*a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
}
if (!x->skip_encode && *eob) {
@@ -945,14 +955,11 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
switch (tx_size) {
case TX_32X32:
if (!x->skip_recode) {
- vpx_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst,
- dst_stride);
fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
- vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant,
- p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
- scan_order->scan, scan_order->iscan);
+ vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order);
}
- if (args->enable_coeff_opt && !x->skip_recode) {
+ if (enable_trellis_opt) {
*a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
}
if (!x->skip_encode && *eob)
@@ -960,14 +967,11 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
break;
case TX_16X16:
if (!x->skip_recode) {
- vpx_subtract_block(16, 16, src_diff, diff_stride, src, src_stride, dst,
- dst_stride);
vp9_fht16x16(src_diff, coeff, diff_stride, tx_type);
- vpx_quantize_b(coeff, 256, p->zbin, p->round, p->quant, p->quant_shift,
- qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
- scan_order->iscan);
+ vpx_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order);
}
- if (args->enable_coeff_opt && !x->skip_recode) {
+ if (enable_trellis_opt) {
*a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
}
if (!x->skip_encode && *eob)
@@ -975,14 +979,11 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
break;
case TX_8X8:
if (!x->skip_recode) {
- vpx_subtract_block(8, 8, src_diff, diff_stride, src, src_stride, dst,
- dst_stride);
vp9_fht8x8(src_diff, coeff, diff_stride, tx_type);
- vpx_quantize_b(coeff, 64, p->zbin, p->round, p->quant, p->quant_shift,
- qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
- scan_order->iscan);
+ vpx_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order);
}
- if (args->enable_coeff_opt && !x->skip_recode) {
+ if (enable_trellis_opt) {
*a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
}
if (!x->skip_encode && *eob)
@@ -991,17 +992,14 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
default:
assert(tx_size == TX_4X4);
if (!x->skip_recode) {
- vpx_subtract_block(4, 4, src_diff, diff_stride, src, src_stride, dst,
- dst_stride);
if (tx_type != DCT_DCT)
vp9_fht4x4(src_diff, coeff, diff_stride, tx_type);
else
x->fwd_txfm4x4(src_diff, coeff, diff_stride);
- vpx_quantize_b(coeff, 16, p->zbin, p->round, p->quant, p->quant_shift,
- qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
- scan_order->iscan);
+ vpx_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order);
}
- if (args->enable_coeff_opt && !x->skip_recode) {
+ if (enable_trellis_opt) {
*a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
}
if (!x->skip_encode && *eob) {
@@ -1019,28 +1017,43 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
}
void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane,
- int enable_optimize_b) {
+ int enable_trellis_opt) {
const MACROBLOCKD *const xd = &x->e_mbd;
struct optimize_ctx ctx;
#if CONFIG_MISMATCH_DEBUG
// TODO(angiebird): make mismatch_debug support intra mode
struct encode_b_args arg = {
- x, enable_optimize_b, ctx.ta[plane], ctx.tl[plane], &xd->mi[0]->skip, 0, 0,
- 0
+ x,
+ enable_trellis_opt,
+ 0.0, // trellis_opt_thresh
+ NULL, // &sse_calc_done
+ NULL, // &sse
+ ctx.ta[plane],
+ ctx.tl[plane],
+ &xd->mi[0]->skip,
+ 0, // mi_row
+ 0, // mi_col
+ 0 // output_enabled
};
#else
- struct encode_b_args arg = { x, enable_optimize_b, ctx.ta[plane],
- ctx.tl[plane], &xd->mi[0]->skip };
+ struct encode_b_args arg = { x,
+ enable_trellis_opt,
+ 0.0, // trellis_opt_thresh
+ NULL, // &sse_calc_done
+ NULL, // &sse
+ ctx.ta[plane],
+ ctx.tl[plane],
+ &xd->mi[0]->skip };
#endif
- if (enable_optimize_b && x->optimize &&
+ if (enable_trellis_opt && x->optimize &&
(!x->skip_recode || !x->skip_optimize)) {
const struct macroblockd_plane *const pd = &xd->plane[plane];
const TX_SIZE tx_size =
plane ? get_uv_tx_size(xd->mi[0], pd) : xd->mi[0]->tx_size;
vp9_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]);
} else {
- arg.enable_coeff_opt = 0;
+ arg.enable_trellis_opt = 0;
}
vp9_foreach_transformed_block_in_plane(xd, bsize, plane,
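
The do_trellis_opt() call introduced above decides per transform block whether trellis optimization is worth running; its definition is not part of this file. A rough sketch of one plausible shape for such a gate, an SSE-of-residual check against a per-pixel threshold, follows; the threshold semantics are an assumption, not the actual libvpx logic.

/* Illustrative sketch only: gate trellis optimization on the residual energy
 * of the transform block. */
static int example_trellis_gate(const short *src_diff, int diff_stride,
                                int tx_size_in_pixels, int enable,
                                double thresh_per_pixel) {
  long long sse = 0;
  int r, c;
  if (!enable) return 0;
  if (thresh_per_pixel <= 0.0) return 1; /* no threshold: always optimize */
  for (r = 0; r < tx_size_in_pixels; ++r) {
    for (c = 0; c < tx_size_in_pixels; ++c) {
      const int d = src_diff[r * diff_stride + c];
      sse += (long long)d * d;
    }
  }
  return (double)sse <=
         thresh_per_pixel * tx_size_in_pixels * tx_size_in_pixels;
}
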
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index 1975ee73a..1391446be 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -20,7 +20,10 @@ extern "C" {
struct encode_b_args {
MACROBLOCK *x;
- int enable_coeff_opt;
+ int enable_trellis_opt;
+ double trellis_opt_thresh;
+ int *sse_calc_done;
+ int64_t *sse;
ENTROPY_CONTEXT *ta;
ENTROPY_CONTEXT *tl;
int8_t *skip;
@@ -48,7 +51,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg);
void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane,
- int enable_optimize_b);
+ int enable_trellis_opt);
#ifdef __cplusplus
} // extern "C"
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index e38507754..152d42bc9 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -12,6 +12,7 @@
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
+#include <string.h>
#include "./vp9_rtcd.h"
#include "./vpx_config.h"
@@ -23,6 +24,7 @@
#if CONFIG_INTERNAL_STATS
#include "vpx_dsp/ssim.h"
#endif
+#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/system_state.h"
#include "vpx_ports/vpx_once.h"
@@ -32,18 +34,15 @@
#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
#include "vp9/common/vp9_alloccommon.h"
+#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_idct.h"
-#if CONFIG_NON_GREEDY_MV
-#include "vp9/common/vp9_mvref_common.h"
-#endif
#if CONFIG_VP9_POSTPROC
#include "vp9/common/vp9_postproc.h"
#endif
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
#include "vp9/common/vp9_tile_common.h"
-#include "vp9/common/vp9_scan.h"
#if !CONFIG_REALTIME_ONLY
#include "vp9/encoder/vp9_alt_ref_aq.h"
@@ -81,8 +80,11 @@
#include "vp9/encoder/vp9_speed_features.h"
#include "vp9/encoder/vp9_svc_layercontext.h"
#include "vp9/encoder/vp9_temporal_filter.h"
+#include "vp9/encoder/vp9_tpl_model.h"
#include "vp9/vp9_cx_iface.h"
+#include "vpx/vpx_ext_ratectrl.h"
+
#define AM_SEGMENT_ID_INACTIVE 7
#define AM_SEGMENT_ID_ACTIVE 0
@@ -126,13 +128,6 @@ static int is_spatial_denoise_enabled(VP9_COMP *cpi) {
}
#endif
-#if CONFIG_VP9_HIGHBITDEPTH
-void highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
- TX_SIZE tx_size);
-#endif
-void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
- TX_SIZE tx_size);
-
#if !CONFIG_REALTIME_ONLY
// compute adaptive threshold for skip recoding
static int compute_context_model_thresh(const VP9_COMP *const cpi) {
@@ -148,7 +143,7 @@ static int compute_context_model_thresh(const VP9_COMP *const cpi) {
// frame context probability model is less than a certain threshold.
// The first component is the most critical part to guarantee adaptivity.
// Other parameters are estimated based on normal setting of hd resolution
- // parameters. e.g frame_size = 1920x1080, bitrate = 8000, qindex_factor < 50
+ // parameters. e.g. frame_size = 1920x1080, bitrate = 8000, qindex_factor < 50
const int thresh =
((FRAME_SIZE_FACTOR * frame_size - FRAME_RATE_FACTOR * bitrate) *
qindex_factor) >>
@@ -502,22 +497,22 @@ static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = {
"Too many reference buffers are used."
};
-static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) {
+static INLINE void Scale2Ratio(VPX_SCALING_MODE mode, int *hr, int *hs) {
switch (mode) {
- case NORMAL:
+ case VP8E_NORMAL:
*hr = 1;
*hs = 1;
break;
- case FOURFIVE:
+ case VP8E_FOURFIVE:
*hr = 4;
*hs = 5;
break;
- case THREEFIVE:
+ case VP8E_THREEFIVE:
*hr = 3;
*hs = 5;
break;
default:
- assert(mode == ONETWO);
+ assert(mode == VP8E_ONETWO);
*hr = 1;
*hs = 2;
break;
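
Each (hr, hs) pair returned by Scale2Ratio() is applied as dimension * hr / hs; for example, VP8E_FOURFIVE maps a 1920-wide frame to 1536. A trivial usage sketch:

/* Usage sketch: apply the (hr, hs) ratio from Scale2Ratio() to a dimension. */
static int example_scaled_dim(int dim, int hr, int hs) {
  return dim * hr / hs; /* e.g. 1920 * 4 / 5 == 1536 */
}
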
@@ -690,9 +685,10 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) {
return (i == VP9_LEVELS) ? LEVEL_UNKNOWN : vp9_level_defs[i].level;
}
-int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows,
- unsigned int cols, int delta_q[8], int delta_lf[8],
- int skip[8], int ref_frame[8]) {
+vpx_codec_err_t vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map,
+ unsigned int rows, unsigned int cols,
+ int delta_q[8], int delta_lf[8], int skip[8],
+ int ref_frame[8]) {
VP9_COMMON *cm = &cpi->common;
vpx_roi_map_t *roi = &cpi->roi;
const int range = 63;
@@ -703,13 +699,13 @@ int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows,
// Check number of rows and columns match
if (frame_rows != (int)rows || frame_cols != (int)cols) {
- return -1;
+ return VPX_CODEC_INVALID_PARAM;
}
if (!check_seg_range(delta_q, range) || !check_seg_range(delta_lf, range) ||
!check_seg_range(ref_frame, ref_frame_range) ||
!check_seg_range(skip, skip_range))
- return -1;
+ return VPX_CODEC_INVALID_PARAM;
// Also disable segmentation if no deltas are specified.
if (!map ||
@@ -723,14 +719,15 @@ int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows,
ref_frame[6] == -1 && ref_frame[7] == -1))) {
vp9_disable_segmentation(&cm->seg);
cpi->roi.enabled = 0;
- return 0;
+ return VPX_CODEC_OK;
}
if (roi->roi_map) {
vpx_free(roi->roi_map);
roi->roi_map = NULL;
}
- CHECK_MEM_ERROR(cm, roi->roi_map, vpx_malloc(rows * cols));
+ roi->roi_map = vpx_malloc(rows * cols);
+ if (!roi->roi_map) return VPX_CODEC_MEM_ERROR;
// Copy to ROI structure in the compressor.
memcpy(roi->roi_map, map, rows * cols);
@@ -742,7 +739,7 @@ int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows,
roi->rows = rows;
roi->cols = cols;
- return 0;
+ return VPX_CODEC_OK;
}
int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows,
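
With the return type changed from int to vpx_codec_err_t, callers of vp9_set_roi_map() receive VPX_CODEC_INVALID_PARAM, VPX_CODEC_MEM_ERROR, or VPX_CODEC_OK instead of -1/0. The range validation applied to the eight per-segment deltas looks roughly like the sketch below; this is an illustration, and the real check_seg_range() lives elsewhere in vp9.

/* Illustrative sketch only: returns nonzero when all eight per-segment deltas
 * fit within [-range, range]. */
static int example_seg_range_ok(const int delta[8], int range) {
  int i;
  for (i = 0; i < 8; ++i) {
    if (delta[i] > range || delta[i] < -range) return 0;
  }
  return 1;
}
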
@@ -886,10 +883,11 @@ static int vp9_enc_alloc_mi(VP9_COMMON *cm, int mi_size) {
if (!cm->prev_mip) return 1;
cm->mi_alloc_size = mi_size;
- cm->mi_grid_base = (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO *));
+ cm->mi_grid_base =
+ (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->mi_grid_base));
if (!cm->mi_grid_base) return 1;
cm->prev_mi_grid_base =
- (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO *));
+ (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->prev_mi_grid_base));
if (!cm->prev_mi_grid_base) return 1;
return 0;
@@ -1383,7 +1381,7 @@ static void alloc_context_buffers_ext(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
int mi_size = cm->mi_cols * cm->mi_rows;
- CHECK_MEM_ERROR(cm, cpi->mbmi_ext_base,
+ CHECK_MEM_ERROR(&cm->error, cpi->mbmi_ext_base,
vpx_calloc(mi_size, sizeof(*cpi->mbmi_ext_base)));
}
@@ -1402,14 +1400,14 @@ static void alloc_compressor_data(VP9_COMP *cpi) {
{
unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols);
- CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0],
+ CHECK_MEM_ERROR(&cm->error, cpi->tile_tok[0][0],
vpx_calloc(tokens, sizeof(*cpi->tile_tok[0][0])));
}
sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
vpx_free(cpi->tplist[0][0]);
CHECK_MEM_ERROR(
- cm, cpi->tplist[0][0],
+ &cm->error, cpi->tplist[0][0],
vpx_calloc(sb_rows * 4 * (1 << 6), sizeof(*cpi->tplist[0][0])));
vp9_setup_pc_tree(&cpi->common, &cpi->td);
@@ -1571,13 +1569,15 @@ void vp9_set_rc_buffer_sizes(VP9_COMP *cpi) {
}
#if CONFIG_VP9_HIGHBITDEPTH
-#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \
- cpi->fn_ptr[BT].sdf = SDF; \
- cpi->fn_ptr[BT].sdaf = SDAF; \
- cpi->fn_ptr[BT].vf = VF; \
- cpi->fn_ptr[BT].svf = SVF; \
- cpi->fn_ptr[BT].svaf = SVAF; \
- cpi->fn_ptr[BT].sdx4df = SDX4DF;
+#define HIGHBD_BFP(BT, SDF, SDSF, SDAF, VF, SVF, SVAF, SDX4DF, SDSX4DF) \
+ cpi->fn_ptr[BT].sdf = SDF; \
+ cpi->fn_ptr[BT].sdsf = SDSF; \
+ cpi->fn_ptr[BT].sdaf = SDAF; \
+ cpi->fn_ptr[BT].vf = VF; \
+ cpi->fn_ptr[BT].svf = SVF; \
+ cpi->fn_ptr[BT].svaf = SVAF; \
+ cpi->fn_ptr[BT].sdx4df = SDX4DF; \
+ cpi->fn_ptr[BT].sdsx4df = SDSX4DF;
#define MAKE_BFP_SAD_WRAPPER(fnname) \
static unsigned int fnname##_bits8(const uint8_t *src_ptr, \
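
The new sdsf/sdsx4df entries wire in the vpx_highbd_sad_skip_* kernels. These are understood here as row-subsampled SADs (every other row, result doubled) used to cheapen motion search; a naive scalar sketch of that idea follows, and the actual libvpx kernels may differ in detail.

/* Illustrative sketch only: a row-subsampled SAD. */
static unsigned int example_sad_skip(const unsigned char *src, int src_stride,
                                     const unsigned char *ref, int ref_stride,
                                     int width, int height) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < height; r += 2) { /* visit every other row */
    for (c = 0; c < width; ++c) {
      const int d = src[r * src_stride + c] - ref[r * ref_stride + c];
      sad += (unsigned int)(d >= 0 ? d : -d);
    }
  }
  return sad * 2; /* compensate for the skipped rows */
}
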
@@ -1637,284 +1637,361 @@ void vp9_set_rc_buffer_sizes(VP9_COMP *cpi) {
}
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x16)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x16)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x16_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x16x4d)
+
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x32)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x32)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x32_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x32x4d)
+
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x32)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_64x32)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x32_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_64x32x4d)
+
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x64)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x64)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x64_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x64x4d)
+
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x32)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x32)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x32_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x32x4d)
+
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x64)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_64x64)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x64_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_64x64x4d)
+
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x16)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x16)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x16_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x16x4d)
+
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x8)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x8)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x8_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x8x4d)
+
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x16)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x16)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x16_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x16x4d)
+
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x8)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x8)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x8_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x8x4d)
+
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x4)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x4)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x4_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x4x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x4x4d)
+
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x8)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_4x8)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x8_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_4x8x4d)
+
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x4)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_4x4)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x4_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x4x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_4x4x4d)
static void highbd_set_var_fns(VP9_COMP *const cpi) {
VP9_COMMON *const cm = &cpi->common;
if (cm->use_highbitdepth) {
switch (cm->bit_depth) {
case VPX_BITS_8:
- HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits8,
- vpx_highbd_sad32x16_avg_bits8, vpx_highbd_8_variance32x16,
- vpx_highbd_8_sub_pixel_variance32x16,
- vpx_highbd_8_sub_pixel_avg_variance32x16,
- vpx_highbd_sad32x16x4d_bits8)
-
- HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits8,
- vpx_highbd_sad16x32_avg_bits8, vpx_highbd_8_variance16x32,
- vpx_highbd_8_sub_pixel_variance16x32,
- vpx_highbd_8_sub_pixel_avg_variance16x32,
- vpx_highbd_sad16x32x4d_bits8)
-
- HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits8,
- vpx_highbd_sad64x32_avg_bits8, vpx_highbd_8_variance64x32,
- vpx_highbd_8_sub_pixel_variance64x32,
- vpx_highbd_8_sub_pixel_avg_variance64x32,
- vpx_highbd_sad64x32x4d_bits8)
-
- HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits8,
- vpx_highbd_sad32x64_avg_bits8, vpx_highbd_8_variance32x64,
- vpx_highbd_8_sub_pixel_variance32x64,
- vpx_highbd_8_sub_pixel_avg_variance32x64,
- vpx_highbd_sad32x64x4d_bits8)
-
- HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits8,
- vpx_highbd_sad32x32_avg_bits8, vpx_highbd_8_variance32x32,
- vpx_highbd_8_sub_pixel_variance32x32,
- vpx_highbd_8_sub_pixel_avg_variance32x32,
- vpx_highbd_sad32x32x4d_bits8)
-
- HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits8,
- vpx_highbd_sad64x64_avg_bits8, vpx_highbd_8_variance64x64,
- vpx_highbd_8_sub_pixel_variance64x64,
- vpx_highbd_8_sub_pixel_avg_variance64x64,
- vpx_highbd_sad64x64x4d_bits8)
-
- HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits8,
- vpx_highbd_sad16x16_avg_bits8, vpx_highbd_8_variance16x16,
- vpx_highbd_8_sub_pixel_variance16x16,
- vpx_highbd_8_sub_pixel_avg_variance16x16,
- vpx_highbd_sad16x16x4d_bits8)
-
- HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits8,
- vpx_highbd_sad16x8_avg_bits8, vpx_highbd_8_variance16x8,
- vpx_highbd_8_sub_pixel_variance16x8,
- vpx_highbd_8_sub_pixel_avg_variance16x8,
- vpx_highbd_sad16x8x4d_bits8)
-
- HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits8,
- vpx_highbd_sad8x16_avg_bits8, vpx_highbd_8_variance8x16,
- vpx_highbd_8_sub_pixel_variance8x16,
- vpx_highbd_8_sub_pixel_avg_variance8x16,
- vpx_highbd_sad8x16x4d_bits8)
+ HIGHBD_BFP(
+ BLOCK_32X16, vpx_highbd_sad32x16_bits8,
+ vpx_highbd_sad_skip_32x16_bits8, vpx_highbd_sad32x16_avg_bits8,
+ vpx_highbd_8_variance32x16, vpx_highbd_8_sub_pixel_variance32x16,
+ vpx_highbd_8_sub_pixel_avg_variance32x16,
+ vpx_highbd_sad32x16x4d_bits8, vpx_highbd_sad_skip_32x16x4d_bits8)
+
+ HIGHBD_BFP(
+ BLOCK_16X32, vpx_highbd_sad16x32_bits8,
+ vpx_highbd_sad_skip_16x32_bits8, vpx_highbd_sad16x32_avg_bits8,
+ vpx_highbd_8_variance16x32, vpx_highbd_8_sub_pixel_variance16x32,
+ vpx_highbd_8_sub_pixel_avg_variance16x32,
+ vpx_highbd_sad16x32x4d_bits8, vpx_highbd_sad_skip_16x32x4d_bits8)
+
+ HIGHBD_BFP(
+ BLOCK_64X32, vpx_highbd_sad64x32_bits8,
+ vpx_highbd_sad_skip_64x32_bits8, vpx_highbd_sad64x32_avg_bits8,
+ vpx_highbd_8_variance64x32, vpx_highbd_8_sub_pixel_variance64x32,
+ vpx_highbd_8_sub_pixel_avg_variance64x32,
+ vpx_highbd_sad64x32x4d_bits8, vpx_highbd_sad_skip_64x32x4d_bits8)
+
+ HIGHBD_BFP(
+ BLOCK_32X64, vpx_highbd_sad32x64_bits8,
+ vpx_highbd_sad_skip_32x64_bits8, vpx_highbd_sad32x64_avg_bits8,
+ vpx_highbd_8_variance32x64, vpx_highbd_8_sub_pixel_variance32x64,
+ vpx_highbd_8_sub_pixel_avg_variance32x64,
+ vpx_highbd_sad32x64x4d_bits8, vpx_highbd_sad_skip_32x64x4d_bits8)
HIGHBD_BFP(
- BLOCK_8X8, vpx_highbd_sad8x8_bits8, vpx_highbd_sad8x8_avg_bits8,
- vpx_highbd_8_variance8x8, vpx_highbd_8_sub_pixel_variance8x8,
- vpx_highbd_8_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x4d_bits8)
+ BLOCK_32X32, vpx_highbd_sad32x32_bits8,
+ vpx_highbd_sad_skip_32x32_bits8, vpx_highbd_sad32x32_avg_bits8,
+ vpx_highbd_8_variance32x32, vpx_highbd_8_sub_pixel_variance32x32,
+ vpx_highbd_8_sub_pixel_avg_variance32x32,
+ vpx_highbd_sad32x32x4d_bits8, vpx_highbd_sad_skip_32x32x4d_bits8)
HIGHBD_BFP(
- BLOCK_8X4, vpx_highbd_sad8x4_bits8, vpx_highbd_sad8x4_avg_bits8,
- vpx_highbd_8_variance8x4, vpx_highbd_8_sub_pixel_variance8x4,
- vpx_highbd_8_sub_pixel_avg_variance8x4, vpx_highbd_sad8x4x4d_bits8)
+ BLOCK_64X64, vpx_highbd_sad64x64_bits8,
+ vpx_highbd_sad_skip_64x64_bits8, vpx_highbd_sad64x64_avg_bits8,
+ vpx_highbd_8_variance64x64, vpx_highbd_8_sub_pixel_variance64x64,
+ vpx_highbd_8_sub_pixel_avg_variance64x64,
+ vpx_highbd_sad64x64x4d_bits8, vpx_highbd_sad_skip_64x64x4d_bits8)
HIGHBD_BFP(
- BLOCK_4X8, vpx_highbd_sad4x8_bits8, vpx_highbd_sad4x8_avg_bits8,
- vpx_highbd_8_variance4x8, vpx_highbd_8_sub_pixel_variance4x8,
- vpx_highbd_8_sub_pixel_avg_variance4x8, vpx_highbd_sad4x8x4d_bits8)
+ BLOCK_16X16, vpx_highbd_sad16x16_bits8,
+ vpx_highbd_sad_skip_16x16_bits8, vpx_highbd_sad16x16_avg_bits8,
+ vpx_highbd_8_variance16x16, vpx_highbd_8_sub_pixel_variance16x16,
+ vpx_highbd_8_sub_pixel_avg_variance16x16,
+ vpx_highbd_sad16x16x4d_bits8, vpx_highbd_sad_skip_16x16x4d_bits8)
HIGHBD_BFP(
- BLOCK_4X4, vpx_highbd_sad4x4_bits8, vpx_highbd_sad4x4_avg_bits8,
- vpx_highbd_8_variance4x4, vpx_highbd_8_sub_pixel_variance4x4,
- vpx_highbd_8_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x4d_bits8)
+ BLOCK_16X8, vpx_highbd_sad16x8_bits8,
+ vpx_highbd_sad_skip_16x8_bits8, vpx_highbd_sad16x8_avg_bits8,
+ vpx_highbd_8_variance16x8, vpx_highbd_8_sub_pixel_variance16x8,
+ vpx_highbd_8_sub_pixel_avg_variance16x8,
+ vpx_highbd_sad16x8x4d_bits8, vpx_highbd_sad_skip_16x8x4d_bits8)
+
+ HIGHBD_BFP(
+ BLOCK_8X16, vpx_highbd_sad8x16_bits8,
+ vpx_highbd_sad_skip_8x16_bits8, vpx_highbd_sad8x16_avg_bits8,
+ vpx_highbd_8_variance8x16, vpx_highbd_8_sub_pixel_variance8x16,
+ vpx_highbd_8_sub_pixel_avg_variance8x16,
+ vpx_highbd_sad8x16x4d_bits8, vpx_highbd_sad_skip_8x16x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits8,
+ vpx_highbd_sad_skip_8x8_bits8, vpx_highbd_sad8x8_avg_bits8,
+ vpx_highbd_8_variance8x8, vpx_highbd_8_sub_pixel_variance8x8,
+ vpx_highbd_8_sub_pixel_avg_variance8x8,
+ vpx_highbd_sad8x8x4d_bits8, vpx_highbd_sad_skip_8x8x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits8,
+ vpx_highbd_sad_skip_8x4_bits8, vpx_highbd_sad8x4_avg_bits8,
+ vpx_highbd_8_variance8x4, vpx_highbd_8_sub_pixel_variance8x4,
+ vpx_highbd_8_sub_pixel_avg_variance8x4,
+ vpx_highbd_sad8x4x4d_bits8, vpx_highbd_sad_skip_8x4x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits8,
+ vpx_highbd_sad_skip_4x8_bits8, vpx_highbd_sad4x8_avg_bits8,
+ vpx_highbd_8_variance4x8, vpx_highbd_8_sub_pixel_variance4x8,
+ vpx_highbd_8_sub_pixel_avg_variance4x8,
+ vpx_highbd_sad4x8x4d_bits8, vpx_highbd_sad_skip_4x8x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits8,
+ vpx_highbd_sad_skip_4x4_bits8, vpx_highbd_sad4x4_avg_bits8,
+ vpx_highbd_8_variance4x4, vpx_highbd_8_sub_pixel_variance4x4,
+ vpx_highbd_8_sub_pixel_avg_variance4x4,
+ vpx_highbd_sad4x4x4d_bits8, vpx_highbd_sad_skip_4x4x4d_bits8)
break;
case VPX_BITS_10:
- HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits10,
- vpx_highbd_sad32x16_avg_bits10, vpx_highbd_10_variance32x16,
- vpx_highbd_10_sub_pixel_variance32x16,
- vpx_highbd_10_sub_pixel_avg_variance32x16,
- vpx_highbd_sad32x16x4d_bits10)
-
- HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits10,
- vpx_highbd_sad16x32_avg_bits10, vpx_highbd_10_variance16x32,
- vpx_highbd_10_sub_pixel_variance16x32,
- vpx_highbd_10_sub_pixel_avg_variance16x32,
- vpx_highbd_sad16x32x4d_bits10)
-
- HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits10,
- vpx_highbd_sad64x32_avg_bits10, vpx_highbd_10_variance64x32,
- vpx_highbd_10_sub_pixel_variance64x32,
- vpx_highbd_10_sub_pixel_avg_variance64x32,
- vpx_highbd_sad64x32x4d_bits10)
-
- HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits10,
- vpx_highbd_sad32x64_avg_bits10, vpx_highbd_10_variance32x64,
- vpx_highbd_10_sub_pixel_variance32x64,
- vpx_highbd_10_sub_pixel_avg_variance32x64,
- vpx_highbd_sad32x64x4d_bits10)
-
- HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits10,
- vpx_highbd_sad32x32_avg_bits10, vpx_highbd_10_variance32x32,
- vpx_highbd_10_sub_pixel_variance32x32,
- vpx_highbd_10_sub_pixel_avg_variance32x32,
- vpx_highbd_sad32x32x4d_bits10)
-
- HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits10,
- vpx_highbd_sad64x64_avg_bits10, vpx_highbd_10_variance64x64,
- vpx_highbd_10_sub_pixel_variance64x64,
- vpx_highbd_10_sub_pixel_avg_variance64x64,
- vpx_highbd_sad64x64x4d_bits10)
-
- HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits10,
- vpx_highbd_sad16x16_avg_bits10, vpx_highbd_10_variance16x16,
- vpx_highbd_10_sub_pixel_variance16x16,
- vpx_highbd_10_sub_pixel_avg_variance16x16,
- vpx_highbd_sad16x16x4d_bits10)
-
- HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits10,
- vpx_highbd_sad16x8_avg_bits10, vpx_highbd_10_variance16x8,
- vpx_highbd_10_sub_pixel_variance16x8,
- vpx_highbd_10_sub_pixel_avg_variance16x8,
- vpx_highbd_sad16x8x4d_bits10)
-
- HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits10,
- vpx_highbd_sad8x16_avg_bits10, vpx_highbd_10_variance8x16,
- vpx_highbd_10_sub_pixel_variance8x16,
- vpx_highbd_10_sub_pixel_avg_variance8x16,
- vpx_highbd_sad8x16x4d_bits10)
-
- HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits10,
- vpx_highbd_sad8x8_avg_bits10, vpx_highbd_10_variance8x8,
- vpx_highbd_10_sub_pixel_variance8x8,
- vpx_highbd_10_sub_pixel_avg_variance8x8,
- vpx_highbd_sad8x8x4d_bits10)
-
- HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits10,
- vpx_highbd_sad8x4_avg_bits10, vpx_highbd_10_variance8x4,
- vpx_highbd_10_sub_pixel_variance8x4,
- vpx_highbd_10_sub_pixel_avg_variance8x4,
- vpx_highbd_sad8x4x4d_bits10)
-
- HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits10,
- vpx_highbd_sad4x8_avg_bits10, vpx_highbd_10_variance4x8,
- vpx_highbd_10_sub_pixel_variance4x8,
- vpx_highbd_10_sub_pixel_avg_variance4x8,
- vpx_highbd_sad4x8x4d_bits10)
-
- HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits10,
- vpx_highbd_sad4x4_avg_bits10, vpx_highbd_10_variance4x4,
- vpx_highbd_10_sub_pixel_variance4x4,
- vpx_highbd_10_sub_pixel_avg_variance4x4,
- vpx_highbd_sad4x4x4d_bits10)
+ HIGHBD_BFP(
+ BLOCK_32X16, vpx_highbd_sad32x16_bits10,
+ vpx_highbd_sad_skip_32x16_bits10, vpx_highbd_sad32x16_avg_bits10,
+ vpx_highbd_10_variance32x16, vpx_highbd_10_sub_pixel_variance32x16,
+ vpx_highbd_10_sub_pixel_avg_variance32x16,
+ vpx_highbd_sad32x16x4d_bits10, vpx_highbd_sad_skip_32x16x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_16X32, vpx_highbd_sad16x32_bits10,
+ vpx_highbd_sad_skip_16x32_bits10, vpx_highbd_sad16x32_avg_bits10,
+ vpx_highbd_10_variance16x32, vpx_highbd_10_sub_pixel_variance16x32,
+ vpx_highbd_10_sub_pixel_avg_variance16x32,
+ vpx_highbd_sad16x32x4d_bits10, vpx_highbd_sad_skip_16x32x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_64X32, vpx_highbd_sad64x32_bits10,
+ vpx_highbd_sad_skip_64x32_bits10, vpx_highbd_sad64x32_avg_bits10,
+ vpx_highbd_10_variance64x32, vpx_highbd_10_sub_pixel_variance64x32,
+ vpx_highbd_10_sub_pixel_avg_variance64x32,
+ vpx_highbd_sad64x32x4d_bits10, vpx_highbd_sad_skip_64x32x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_32X64, vpx_highbd_sad32x64_bits10,
+ vpx_highbd_sad_skip_32x64_bits10, vpx_highbd_sad32x64_avg_bits10,
+ vpx_highbd_10_variance32x64, vpx_highbd_10_sub_pixel_variance32x64,
+ vpx_highbd_10_sub_pixel_avg_variance32x64,
+ vpx_highbd_sad32x64x4d_bits10, vpx_highbd_sad_skip_32x64x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_32X32, vpx_highbd_sad32x32_bits10,
+ vpx_highbd_sad_skip_32x32_bits10, vpx_highbd_sad32x32_avg_bits10,
+ vpx_highbd_10_variance32x32, vpx_highbd_10_sub_pixel_variance32x32,
+ vpx_highbd_10_sub_pixel_avg_variance32x32,
+ vpx_highbd_sad32x32x4d_bits10, vpx_highbd_sad_skip_32x32x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_64X64, vpx_highbd_sad64x64_bits10,
+ vpx_highbd_sad_skip_64x64_bits10, vpx_highbd_sad64x64_avg_bits10,
+ vpx_highbd_10_variance64x64, vpx_highbd_10_sub_pixel_variance64x64,
+ vpx_highbd_10_sub_pixel_avg_variance64x64,
+ vpx_highbd_sad64x64x4d_bits10, vpx_highbd_sad_skip_64x64x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_16X16, vpx_highbd_sad16x16_bits10,
+ vpx_highbd_sad_skip_16x16_bits10, vpx_highbd_sad16x16_avg_bits10,
+ vpx_highbd_10_variance16x16, vpx_highbd_10_sub_pixel_variance16x16,
+ vpx_highbd_10_sub_pixel_avg_variance16x16,
+ vpx_highbd_sad16x16x4d_bits10, vpx_highbd_sad_skip_16x16x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_16X8, vpx_highbd_sad16x8_bits10,
+ vpx_highbd_sad_skip_16x8_bits10, vpx_highbd_sad16x8_avg_bits10,
+ vpx_highbd_10_variance16x8, vpx_highbd_10_sub_pixel_variance16x8,
+ vpx_highbd_10_sub_pixel_avg_variance16x8,
+ vpx_highbd_sad16x8x4d_bits10, vpx_highbd_sad_skip_16x8x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_8X16, vpx_highbd_sad8x16_bits10,
+ vpx_highbd_sad_skip_8x16_bits10, vpx_highbd_sad8x16_avg_bits10,
+ vpx_highbd_10_variance8x16, vpx_highbd_10_sub_pixel_variance8x16,
+ vpx_highbd_10_sub_pixel_avg_variance8x16,
+ vpx_highbd_sad8x16x4d_bits10, vpx_highbd_sad_skip_8x16x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_8X8, vpx_highbd_sad8x8_bits10, vpx_highbd_sad_skip_8x8_bits10,
+ vpx_highbd_sad8x8_avg_bits10, vpx_highbd_10_variance8x8,
+ vpx_highbd_10_sub_pixel_variance8x8,
+ vpx_highbd_10_sub_pixel_avg_variance8x8,
+ vpx_highbd_sad8x8x4d_bits10, vpx_highbd_sad_skip_8x8x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_8X4, vpx_highbd_sad8x4_bits10, vpx_highbd_sad_skip_8x4_bits10,
+ vpx_highbd_sad8x4_avg_bits10, vpx_highbd_10_variance8x4,
+ vpx_highbd_10_sub_pixel_variance8x4,
+ vpx_highbd_10_sub_pixel_avg_variance8x4,
+ vpx_highbd_sad8x4x4d_bits10, vpx_highbd_sad_skip_8x4x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_4X8, vpx_highbd_sad4x8_bits10, vpx_highbd_sad_skip_4x8_bits10,
+ vpx_highbd_sad4x8_avg_bits10, vpx_highbd_10_variance4x8,
+ vpx_highbd_10_sub_pixel_variance4x8,
+ vpx_highbd_10_sub_pixel_avg_variance4x8,
+ vpx_highbd_sad4x8x4d_bits10, vpx_highbd_sad_skip_4x8x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_4X4, vpx_highbd_sad4x4_bits10, vpx_highbd_sad_skip_4x4_bits10,
+ vpx_highbd_sad4x4_avg_bits10, vpx_highbd_10_variance4x4,
+ vpx_highbd_10_sub_pixel_variance4x4,
+ vpx_highbd_10_sub_pixel_avg_variance4x4,
+ vpx_highbd_sad4x4x4d_bits10, vpx_highbd_sad_skip_4x4x4d_bits10)
break;
default:
assert(cm->bit_depth == VPX_BITS_12);
- HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits12,
- vpx_highbd_sad32x16_avg_bits12, vpx_highbd_12_variance32x16,
- vpx_highbd_12_sub_pixel_variance32x16,
- vpx_highbd_12_sub_pixel_avg_variance32x16,
- vpx_highbd_sad32x16x4d_bits12)
-
- HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits12,
- vpx_highbd_sad16x32_avg_bits12, vpx_highbd_12_variance16x32,
- vpx_highbd_12_sub_pixel_variance16x32,
- vpx_highbd_12_sub_pixel_avg_variance16x32,
- vpx_highbd_sad16x32x4d_bits12)
-
- HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits12,
- vpx_highbd_sad64x32_avg_bits12, vpx_highbd_12_variance64x32,
- vpx_highbd_12_sub_pixel_variance64x32,
- vpx_highbd_12_sub_pixel_avg_variance64x32,
- vpx_highbd_sad64x32x4d_bits12)
-
- HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits12,
- vpx_highbd_sad32x64_avg_bits12, vpx_highbd_12_variance32x64,
- vpx_highbd_12_sub_pixel_variance32x64,
- vpx_highbd_12_sub_pixel_avg_variance32x64,
- vpx_highbd_sad32x64x4d_bits12)
-
- HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits12,
- vpx_highbd_sad32x32_avg_bits12, vpx_highbd_12_variance32x32,
- vpx_highbd_12_sub_pixel_variance32x32,
- vpx_highbd_12_sub_pixel_avg_variance32x32,
- vpx_highbd_sad32x32x4d_bits12)
-
- HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits12,
- vpx_highbd_sad64x64_avg_bits12, vpx_highbd_12_variance64x64,
- vpx_highbd_12_sub_pixel_variance64x64,
- vpx_highbd_12_sub_pixel_avg_variance64x64,
- vpx_highbd_sad64x64x4d_bits12)
-
- HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits12,
- vpx_highbd_sad16x16_avg_bits12, vpx_highbd_12_variance16x16,
- vpx_highbd_12_sub_pixel_variance16x16,
- vpx_highbd_12_sub_pixel_avg_variance16x16,
- vpx_highbd_sad16x16x4d_bits12)
-
- HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits12,
- vpx_highbd_sad16x8_avg_bits12, vpx_highbd_12_variance16x8,
- vpx_highbd_12_sub_pixel_variance16x8,
- vpx_highbd_12_sub_pixel_avg_variance16x8,
- vpx_highbd_sad16x8x4d_bits12)
-
- HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits12,
- vpx_highbd_sad8x16_avg_bits12, vpx_highbd_12_variance8x16,
- vpx_highbd_12_sub_pixel_variance8x16,
- vpx_highbd_12_sub_pixel_avg_variance8x16,
- vpx_highbd_sad8x16x4d_bits12)
-
- HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits12,
- vpx_highbd_sad8x8_avg_bits12, vpx_highbd_12_variance8x8,
- vpx_highbd_12_sub_pixel_variance8x8,
- vpx_highbd_12_sub_pixel_avg_variance8x8,
- vpx_highbd_sad8x8x4d_bits12)
-
- HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits12,
- vpx_highbd_sad8x4_avg_bits12, vpx_highbd_12_variance8x4,
- vpx_highbd_12_sub_pixel_variance8x4,
- vpx_highbd_12_sub_pixel_avg_variance8x4,
- vpx_highbd_sad8x4x4d_bits12)
-
- HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits12,
- vpx_highbd_sad4x8_avg_bits12, vpx_highbd_12_variance4x8,
- vpx_highbd_12_sub_pixel_variance4x8,
- vpx_highbd_12_sub_pixel_avg_variance4x8,
- vpx_highbd_sad4x8x4d_bits12)
-
- HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits12,
- vpx_highbd_sad4x4_avg_bits12, vpx_highbd_12_variance4x4,
- vpx_highbd_12_sub_pixel_variance4x4,
- vpx_highbd_12_sub_pixel_avg_variance4x4,
- vpx_highbd_sad4x4x4d_bits12)
+ HIGHBD_BFP(
+ BLOCK_32X16, vpx_highbd_sad32x16_bits12,
+ vpx_highbd_sad_skip_32x16_bits12, vpx_highbd_sad32x16_avg_bits12,
+ vpx_highbd_12_variance32x16, vpx_highbd_12_sub_pixel_variance32x16,
+ vpx_highbd_12_sub_pixel_avg_variance32x16,
+ vpx_highbd_sad32x16x4d_bits12, vpx_highbd_sad_skip_32x16x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_16X32, vpx_highbd_sad16x32_bits12,
+ vpx_highbd_sad_skip_16x32_bits12, vpx_highbd_sad16x32_avg_bits12,
+ vpx_highbd_12_variance16x32, vpx_highbd_12_sub_pixel_variance16x32,
+ vpx_highbd_12_sub_pixel_avg_variance16x32,
+ vpx_highbd_sad16x32x4d_bits12, vpx_highbd_sad_skip_16x32x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_64X32, vpx_highbd_sad64x32_bits12,
+ vpx_highbd_sad_skip_64x32_bits12, vpx_highbd_sad64x32_avg_bits12,
+ vpx_highbd_12_variance64x32, vpx_highbd_12_sub_pixel_variance64x32,
+ vpx_highbd_12_sub_pixel_avg_variance64x32,
+ vpx_highbd_sad64x32x4d_bits12, vpx_highbd_sad_skip_64x32x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_32X64, vpx_highbd_sad32x64_bits12,
+ vpx_highbd_sad_skip_32x64_bits12, vpx_highbd_sad32x64_avg_bits12,
+ vpx_highbd_12_variance32x64, vpx_highbd_12_sub_pixel_variance32x64,
+ vpx_highbd_12_sub_pixel_avg_variance32x64,
+ vpx_highbd_sad32x64x4d_bits12, vpx_highbd_sad_skip_32x64x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_32X32, vpx_highbd_sad32x32_bits12,
+ vpx_highbd_sad_skip_32x32_bits12, vpx_highbd_sad32x32_avg_bits12,
+ vpx_highbd_12_variance32x32, vpx_highbd_12_sub_pixel_variance32x32,
+ vpx_highbd_12_sub_pixel_avg_variance32x32,
+ vpx_highbd_sad32x32x4d_bits12, vpx_highbd_sad_skip_32x32x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_64X64, vpx_highbd_sad64x64_bits12,
+ vpx_highbd_sad_skip_64x64_bits12, vpx_highbd_sad64x64_avg_bits12,
+ vpx_highbd_12_variance64x64, vpx_highbd_12_sub_pixel_variance64x64,
+ vpx_highbd_12_sub_pixel_avg_variance64x64,
+ vpx_highbd_sad64x64x4d_bits12, vpx_highbd_sad_skip_64x64x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_16X16, vpx_highbd_sad16x16_bits12,
+ vpx_highbd_sad_skip_16x16_bits12, vpx_highbd_sad16x16_avg_bits12,
+ vpx_highbd_12_variance16x16, vpx_highbd_12_sub_pixel_variance16x16,
+ vpx_highbd_12_sub_pixel_avg_variance16x16,
+ vpx_highbd_sad16x16x4d_bits12, vpx_highbd_sad_skip_16x16x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_16X8, vpx_highbd_sad16x8_bits12,
+ vpx_highbd_sad_skip_16x8_bits12, vpx_highbd_sad16x8_avg_bits12,
+ vpx_highbd_12_variance16x8, vpx_highbd_12_sub_pixel_variance16x8,
+ vpx_highbd_12_sub_pixel_avg_variance16x8,
+ vpx_highbd_sad16x8x4d_bits12, vpx_highbd_sad_skip_16x8x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_8X16, vpx_highbd_sad8x16_bits12,
+ vpx_highbd_sad_skip_8x16_bits12, vpx_highbd_sad8x16_avg_bits12,
+ vpx_highbd_12_variance8x16, vpx_highbd_12_sub_pixel_variance8x16,
+ vpx_highbd_12_sub_pixel_avg_variance8x16,
+ vpx_highbd_sad8x16x4d_bits12, vpx_highbd_sad_skip_8x16x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_8X8, vpx_highbd_sad8x8_bits12, vpx_highbd_sad_skip_8x8_bits12,
+ vpx_highbd_sad8x8_avg_bits12, vpx_highbd_12_variance8x8,
+ vpx_highbd_12_sub_pixel_variance8x8,
+ vpx_highbd_12_sub_pixel_avg_variance8x8,
+ vpx_highbd_sad8x8x4d_bits12, vpx_highbd_sad_skip_8x8x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_8X4, vpx_highbd_sad8x4_bits12, vpx_highbd_sad_skip_8x4_bits12,
+ vpx_highbd_sad8x4_avg_bits12, vpx_highbd_12_variance8x4,
+ vpx_highbd_12_sub_pixel_variance8x4,
+ vpx_highbd_12_sub_pixel_avg_variance8x4,
+ vpx_highbd_sad8x4x4d_bits12, vpx_highbd_sad_skip_8x4x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_4X8, vpx_highbd_sad4x8_bits12, vpx_highbd_sad_skip_4x8_bits12,
+ vpx_highbd_sad4x8_avg_bits12, vpx_highbd_12_variance4x8,
+ vpx_highbd_12_sub_pixel_variance4x8,
+ vpx_highbd_12_sub_pixel_avg_variance4x8,
+ vpx_highbd_sad4x8x4d_bits12, vpx_highbd_sad_skip_4x8x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_4X4, vpx_highbd_sad4x4_bits12, vpx_highbd_sad_skip_4x4_bits12,
+ vpx_highbd_sad4x4_avg_bits12, vpx_highbd_12_variance4x4,
+ vpx_highbd_12_sub_pixel_variance4x4,
+ vpx_highbd_12_sub_pixel_avg_variance4x4,
+ vpx_highbd_sad4x4x4d_bits12, vpx_highbd_sad_skip_4x4x4d_bits12)
break;
}
}
@@ -1926,48 +2003,48 @@ static void realloc_segmentation_maps(VP9_COMP *cpi) {
// Create the encoder segmentation map and set all entries to 0
vpx_free(cpi->segmentation_map);
- CHECK_MEM_ERROR(cm, cpi->segmentation_map,
+ CHECK_MEM_ERROR(&cm->error, cpi->segmentation_map,
vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
// Create a map used for cyclic background refresh.
if (cpi->cyclic_refresh) vp9_cyclic_refresh_free(cpi->cyclic_refresh);
- CHECK_MEM_ERROR(cm, cpi->cyclic_refresh,
+ CHECK_MEM_ERROR(&cm->error, cpi->cyclic_refresh,
vp9_cyclic_refresh_alloc(cm->mi_rows, cm->mi_cols));
// Create a map used to mark inactive areas.
vpx_free(cpi->active_map.map);
- CHECK_MEM_ERROR(cm, cpi->active_map.map,
+ CHECK_MEM_ERROR(&cm->error, cpi->active_map.map,
vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
// And a place holder structure is the coding context
// for use if we want to save and restore it
vpx_free(cpi->coding_context.last_frame_seg_map_copy);
- CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy,
+ CHECK_MEM_ERROR(&cm->error, cpi->coding_context.last_frame_seg_map_copy,
vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
}
static void alloc_copy_partition_data(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
if (cpi->prev_partition == NULL) {
- CHECK_MEM_ERROR(cm, cpi->prev_partition,
+ CHECK_MEM_ERROR(&cm->error, cpi->prev_partition,
(BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows,
sizeof(*cpi->prev_partition)));
}
if (cpi->prev_segment_id == NULL) {
CHECK_MEM_ERROR(
- cm, cpi->prev_segment_id,
+ &cm->error, cpi->prev_segment_id,
(int8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
sizeof(*cpi->prev_segment_id)));
}
if (cpi->prev_variance_low == NULL) {
- CHECK_MEM_ERROR(cm, cpi->prev_variance_low,
+ CHECK_MEM_ERROR(&cm->error, cpi->prev_variance_low,
(uint8_t *)vpx_calloc(
(cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1) * 25,
sizeof(*cpi->prev_variance_low)));
}
if (cpi->copied_frame_cnt == NULL) {
CHECK_MEM_ERROR(
- cm, cpi->copied_frame_cnt,
+ &cm->error, cpi->copied_frame_cnt,
(uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
sizeof(*cpi->copied_frame_cnt)));
}
@@ -2085,13 +2162,13 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
if (last_w != cpi->oxcf.width || last_h != cpi->oxcf.height) {
vpx_free(cpi->consec_zero_mv);
CHECK_MEM_ERROR(
- cm, cpi->consec_zero_mv,
+ &cm->error, cpi->consec_zero_mv,
vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->consec_zero_mv)));
vpx_free(cpi->skin_map);
CHECK_MEM_ERROR(
- cm, cpi->skin_map,
- vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0])));
+ &cm->error, cpi->skin_map,
+ vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->skin_map)));
free_copy_partition_data(cpi);
alloc_copy_partition_data(cpi);
@@ -2132,18 +2209,13 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
vp9_set_row_mt(cpi);
}
-#ifndef M_LOG2_E
-#define M_LOG2_E 0.693147180559945309417
-#endif
-#define log2f(x) (log(x) / (float)M_LOG2_E)
-
/***********************************************************************
* Read before modifying 'cal_nmvjointsadcost' or 'cal_nmvsadcosts' *
***********************************************************************
* The following 2 functions ('cal_nmvjointsadcost' and *
* 'cal_nmvsadcosts') are used to calculate cost lookup tables *
* used by 'vp9_diamond_search_sad'. The C implementation of the *
- * function is generic, but the AVX intrinsics optimised version *
+ * function is generic, but the NEON intrinsics optimised version *
* relies on the following properties of the computed tables: *
* For cal_nmvjointsadcost: *
* - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] *
@@ -2152,7 +2224,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
* (Equal costs for both components) *
* - For all i: mvsadcost[0][i] == mvsadcost[0][-i] *
* (Cost function is even) *
- * If these do not hold, then the AVX optimised version of the *
+ * If these do not hold, then the NEON optimised version of the *
* 'vp9_diamond_search_sad' function cannot be used as it is, in which *
* case you can revert to using the C function instead. *
***********************************************************************/
@@ -2310,7 +2382,7 @@ void vp9_update_compressor_with_img_fmt(VP9_COMP *cpi, vpx_img_fmt_t img_fmt) {
VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
BufferPool *const pool) {
unsigned int i;
- VP9_COMP *volatile const cpi = vpx_memalign(32, sizeof(VP9_COMP));
+ VP9_COMP *volatile const cpi = vpx_memalign(32, sizeof(*cpi));
VP9_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL;
if (!cm) return NULL;
@@ -2328,9 +2400,10 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
cm->free_mi = vp9_enc_free_mi;
cm->setup_mi = vp9_enc_setup_mi;
- CHECK_MEM_ERROR(cm, cm->fc, (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
+ CHECK_MEM_ERROR(&cm->error, cm->fc,
+ (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
CHECK_MEM_ERROR(
- cm, cm->frame_contexts,
+ &cm->error, cm->frame_contexts,
(FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts)));
cpi->compute_frame_low_motion_onepass = 1;
@@ -2357,38 +2430,38 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
realloc_segmentation_maps(cpi);
CHECK_MEM_ERROR(
- cm, cpi->skin_map,
- vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0])));
+ &cm->error, cpi->skin_map,
+ vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->skin_map)));
#if !CONFIG_REALTIME_ONLY
- CHECK_MEM_ERROR(cm, cpi->alt_ref_aq, vp9_alt_ref_aq_create());
+ CHECK_MEM_ERROR(&cm->error, cpi->alt_ref_aq, vp9_alt_ref_aq_create());
#endif
CHECK_MEM_ERROR(
- cm, cpi->consec_zero_mv,
+ &cm->error, cpi->consec_zero_mv,
vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->consec_zero_mv)));
- CHECK_MEM_ERROR(cm, cpi->nmvcosts[0],
+ CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts[0],
vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[0])));
- CHECK_MEM_ERROR(cm, cpi->nmvcosts[1],
+ CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts[1],
vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[1])));
- CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[0],
+ CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts_hp[0],
vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[0])));
- CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[1],
+ CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts_hp[1],
vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[1])));
- CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[0],
+ CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts[0],
vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[0])));
- CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[1],
+ CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts[1],
vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[1])));
- CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[0],
+ CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts_hp[0],
vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[0])));
- CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[1],
+ CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts_hp[1],
vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[1])));
for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]));
i++) {
CHECK_MEM_ERROR(
- cm, cpi->mbgraph_stats[i].mb_stats,
+ &cm->error, cpi->mbgraph_stats[i].mb_stats,
vpx_calloc(cm->MBs * sizeof(*cpi->mbgraph_stats[i].mb_stats), 1));
}
@@ -2432,7 +2505,7 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
}
if (cpi->b_calculate_consistency) {
- CHECK_MEM_ERROR(cm, cpi->ssim_vars,
+ CHECK_MEM_ERROR(&cm->error, cpi->ssim_vars,
vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols,
sizeof(*cpi->ssim_vars) * 4));
cpi->worst_consistency = 100.0;
@@ -2503,11 +2576,11 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
cpi->svc.number_temporal_layers > 1) {
FIRSTPASS_STATS *const stats = oxcf->two_pass_stats_in.buf;
FIRSTPASS_STATS *stats_copy[VPX_SS_MAX_LAYERS] = { 0 };
- int i;
+ int n;
- for (i = 0; i < oxcf->ss_number_layers; ++i) {
+ for (n = 0; n < oxcf->ss_number_layers; ++n) {
FIRSTPASS_STATS *const last_packet_for_layer =
- &stats[packets - oxcf->ss_number_layers + i];
+ &stats[packets - oxcf->ss_number_layers + n];
const int layer_id = (int)last_packet_for_layer->spatial_layer_id;
const int packets_in_layer = (int)last_packet_for_layer->count + 1;
if (layer_id >= 0 && layer_id < oxcf->ss_number_layers) {
@@ -2517,7 +2590,7 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
vpx_free(lc->rc_twopass_stats_in.buf);
lc->rc_twopass_stats_in.sz = packets_in_layer * packet_sz;
- CHECK_MEM_ERROR(cm, lc->rc_twopass_stats_in.buf,
+ CHECK_MEM_ERROR(&cm->error, lc->rc_twopass_stats_in.buf,
vpx_malloc(lc->rc_twopass_stats_in.sz));
lc->twopass.stats_in_start = lc->rc_twopass_stats_in.buf;
lc->twopass.stats_in = lc->twopass.stats_in_start;
@@ -2532,11 +2605,11 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
}
}
- for (i = 0; i < packets; ++i) {
- const int layer_id = (int)stats[i].spatial_layer_id;
+ for (n = 0; n < packets; ++n) {
+ const int layer_id = (int)stats[n].spatial_layer_id;
if (layer_id >= 0 && layer_id < oxcf->ss_number_layers &&
stats_copy[layer_id] != NULL) {
- *stats_copy[layer_id] = stats[i];
+ *stats_copy[layer_id] = stats[n];
++stats_copy[layer_id];
}
}
@@ -2572,7 +2645,7 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
const int h = num_8x8_blocks_high_lookup[bsize];
const int num_cols = (cm->mi_cols + w - 1) / w;
const int num_rows = (cm->mi_rows + h - 1) / h;
- CHECK_MEM_ERROR(cm, cpi->mi_ssim_rdmult_scaling_factors,
+ CHECK_MEM_ERROR(&cm->error, cpi->mi_ssim_rdmult_scaling_factors,
vpx_calloc(num_rows * num_cols,
sizeof(*cpi->mi_ssim_rdmult_scaling_factors)));
}
@@ -2581,67 +2654,76 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
#if CONFIG_NON_GREEDY_MV
cpi->tpl_ready = 0;
#endif // CONFIG_NON_GREEDY_MV
- for (i = 0; i < MAX_ARF_GOP_SIZE; ++i) cpi->tpl_stats[i].tpl_stats_ptr = NULL;
+ for (i = 0; i < MAX_ARF_GOP_SIZE; ++i) {
+ cpi->tpl_stats[i].tpl_stats_ptr = NULL;
+ }
// Allocate memory to store variances for a frame.
- CHECK_MEM_ERROR(cm, cpi->source_diff_var, vpx_calloc(cm->MBs, sizeof(diff)));
+ CHECK_MEM_ERROR(&cm->error, cpi->source_diff_var,
+                  vpx_calloc(cm->MBs, sizeof(*cpi->source_diff_var)));
cpi->source_var_thresh = 0;
cpi->frames_till_next_var_check = 0;
-#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \
- cpi->fn_ptr[BT].sdf = SDF; \
- cpi->fn_ptr[BT].sdaf = SDAF; \
- cpi->fn_ptr[BT].vf = VF; \
- cpi->fn_ptr[BT].svf = SVF; \
- cpi->fn_ptr[BT].svaf = SVAF; \
- cpi->fn_ptr[BT].sdx4df = SDX4DF;
-
- BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg, vpx_variance32x16,
- vpx_sub_pixel_variance32x16, vpx_sub_pixel_avg_variance32x16,
- vpx_sad32x16x4d)
-
- BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg, vpx_variance16x32,
- vpx_sub_pixel_variance16x32, vpx_sub_pixel_avg_variance16x32,
- vpx_sad16x32x4d)
-
- BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg, vpx_variance64x32,
- vpx_sub_pixel_variance64x32, vpx_sub_pixel_avg_variance64x32,
- vpx_sad64x32x4d)
-
- BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg, vpx_variance32x64,
- vpx_sub_pixel_variance32x64, vpx_sub_pixel_avg_variance32x64,
- vpx_sad32x64x4d)
-
- BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg, vpx_variance32x32,
- vpx_sub_pixel_variance32x32, vpx_sub_pixel_avg_variance32x32,
- vpx_sad32x32x4d)
-
- BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg, vpx_variance64x64,
- vpx_sub_pixel_variance64x64, vpx_sub_pixel_avg_variance64x64,
- vpx_sad64x64x4d)
-
- BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg, vpx_variance16x16,
- vpx_sub_pixel_variance16x16, vpx_sub_pixel_avg_variance16x16,
- vpx_sad16x16x4d)
-
- BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg, vpx_variance16x8,
- vpx_sub_pixel_variance16x8, vpx_sub_pixel_avg_variance16x8,
- vpx_sad16x8x4d)
-
- BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg, vpx_variance8x16,
- vpx_sub_pixel_variance8x16, vpx_sub_pixel_avg_variance8x16,
- vpx_sad8x16x4d)
-
- BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg, vpx_variance8x8,
- vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d)
-
- BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg, vpx_variance8x4,
- vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d)
-
- BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg, vpx_variance4x8,
- vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d)
-
- BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg, vpx_variance4x4,
- vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d)
+#define BFP(BT, SDF, SDSF, SDAF, VF, SVF, SVAF, SDX4DF, SDSX4DF) \
+ cpi->fn_ptr[BT].sdf = SDF; \
+ cpi->fn_ptr[BT].sdsf = SDSF; \
+ cpi->fn_ptr[BT].sdaf = SDAF; \
+ cpi->fn_ptr[BT].vf = VF; \
+ cpi->fn_ptr[BT].svf = SVF; \
+ cpi->fn_ptr[BT].svaf = SVAF; \
+ cpi->fn_ptr[BT].sdx4df = SDX4DF; \
+ cpi->fn_ptr[BT].sdsx4df = SDSX4DF;
+
+ BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad_skip_32x16, vpx_sad32x16_avg,
+ vpx_variance32x16, vpx_sub_pixel_variance32x16,
+ vpx_sub_pixel_avg_variance32x16, vpx_sad32x16x4d, vpx_sad_skip_32x16x4d)
+
+ BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad_skip_16x32, vpx_sad16x32_avg,
+ vpx_variance16x32, vpx_sub_pixel_variance16x32,
+ vpx_sub_pixel_avg_variance16x32, vpx_sad16x32x4d, vpx_sad_skip_16x32x4d)
+
+ BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad_skip_64x32, vpx_sad64x32_avg,
+ vpx_variance64x32, vpx_sub_pixel_variance64x32,
+ vpx_sub_pixel_avg_variance64x32, vpx_sad64x32x4d, vpx_sad_skip_64x32x4d)
+
+ BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad_skip_32x64, vpx_sad32x64_avg,
+ vpx_variance32x64, vpx_sub_pixel_variance32x64,
+ vpx_sub_pixel_avg_variance32x64, vpx_sad32x64x4d, vpx_sad_skip_32x64x4d)
+
+ BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad_skip_32x32, vpx_sad32x32_avg,
+ vpx_variance32x32, vpx_sub_pixel_variance32x32,
+ vpx_sub_pixel_avg_variance32x32, vpx_sad32x32x4d, vpx_sad_skip_32x32x4d)
+
+ BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad_skip_64x64, vpx_sad64x64_avg,
+ vpx_variance64x64, vpx_sub_pixel_variance64x64,
+ vpx_sub_pixel_avg_variance64x64, vpx_sad64x64x4d, vpx_sad_skip_64x64x4d)
+
+ BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad_skip_16x16, vpx_sad16x16_avg,
+ vpx_variance16x16, vpx_sub_pixel_variance16x16,
+ vpx_sub_pixel_avg_variance16x16, vpx_sad16x16x4d, vpx_sad_skip_16x16x4d)
+
+ BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad_skip_16x8, vpx_sad16x8_avg,
+ vpx_variance16x8, vpx_sub_pixel_variance16x8,
+ vpx_sub_pixel_avg_variance16x8, vpx_sad16x8x4d, vpx_sad_skip_16x8x4d)
+
+ BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad_skip_8x16, vpx_sad8x16_avg,
+ vpx_variance8x16, vpx_sub_pixel_variance8x16,
+ vpx_sub_pixel_avg_variance8x16, vpx_sad8x16x4d, vpx_sad_skip_8x16x4d)
+
+ BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad_skip_8x8, vpx_sad8x8_avg, vpx_variance8x8,
+ vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d,
+ vpx_sad_skip_8x8x4d)
+
+ BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad_skip_8x4, vpx_sad8x4_avg, vpx_variance8x4,
+ vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d,
+ vpx_sad_skip_8x4x4d)
+
+ BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad_skip_4x8, vpx_sad4x8_avg, vpx_variance4x8,
+ vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d,
+ vpx_sad_skip_4x8x4d)
+
+ BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad_skip_4x4, vpx_sad4x4_avg, vpx_variance4x4,
+ vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d,
+ vpx_sad_skip_4x4x4d)
#if CONFIG_VP9_HIGHBITDEPTH
highbd_set_var_fns(cpi);
@@ -2689,8 +2771,6 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V))
#endif // CONFIG_INTERNAL_STATS
-static void free_tpl_buffer(VP9_COMP *cpi);
-
void vp9_remove_compressor(VP9_COMP *cpi) {
VP9_COMMON *cm;
unsigned int i;
@@ -2784,7 +2864,7 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
#if 0
{
printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000);
- printf("\n_frames recive_data encod_mb_row compress_frame Total\n");
+ printf("\n_frames receive_data encod_mb_row compress_frame Total\n");
printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame,
cpi->time_receive_data / 1000, cpi->time_encode_sb_row / 1000,
cpi->time_compress_data / 1000,
@@ -2804,7 +2884,7 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
vpx_free(cpi->kmeans_data_arr);
}
- free_tpl_buffer(cpi);
+ vp9_free_tpl_buffer(cpi);
vp9_loop_filter_dealloc(&cpi->lf_row_sync);
vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
@@ -2824,6 +2904,10 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
vp9_extrc_delete(&cpi->ext_ratectrl);
+ // Help detect use after free of the error detail string.
+ memset(cm->error.detail, 'A', sizeof(cm->error.detail) - 1);
+ cm->error.detail[sizeof(cm->error.detail) - 1] = '\0';
+
vp9_remove_common(cm);
vp9_free_ref_frame_buffers(cm->buffer_pool);
#if CONFIG_VP9_POSTPROC
@@ -2893,7 +2977,7 @@ void vp9_update_reference(VP9_COMP *cpi, int ref_frame_flags) {
static YV12_BUFFER_CONFIG *get_vp9_ref_frame_buffer(
VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag) {
- MV_REFERENCE_FRAME ref_frame = NONE;
+ MV_REFERENCE_FRAME ref_frame = NO_REF_FRAME;
if (ref_frame_flag == VP9_LAST_FLAG)
ref_frame = LAST_FRAME;
else if (ref_frame_flag == VP9_GOLD_FLAG)
@@ -2901,7 +2985,8 @@ static YV12_BUFFER_CONFIG *get_vp9_ref_frame_buffer(
else if (ref_frame_flag == VP9_ALT_FLAG)
ref_frame = ALTREF_FRAME;
- return ref_frame == NONE ? NULL : get_ref_frame_buffer(cpi, ref_frame);
+ return ref_frame == NO_REF_FRAME ? NULL
+ : get_ref_frame_buffer(cpi, ref_frame);
}
int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag,
@@ -2994,12 +3079,11 @@ void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {
#endif
#if CONFIG_VP9_HIGHBITDEPTH
-static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
- YV12_BUFFER_CONFIG *dst,
- int bd) {
+void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst, int bd) {
#else
-static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
- YV12_BUFFER_CONFIG *dst) {
+void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst) {
#endif // CONFIG_VP9_HIGHBITDEPTH
// TODO(dkovalev): replace YV12_BUFFER_CONFIG with vpx_image_t
int i;
@@ -3044,6 +3128,23 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
const int src_h = src->y_crop_height;
const int dst_w = dst->y_crop_width;
const int dst_h = dst->y_crop_height;
+
+ // The issue b/311394513 reveals a corner case bug.
+ // For bd = 8, vpx_scaled_2d() requires both x_step_q4 and y_step_q4 are less
+ // than or equal to 64. For bd >= 10, vpx_highbd_convolve8() requires both
+ // x_step_q4 and y_step_q4 are less than or equal to 32. If this condition
+ // isn't met, it needs to call vp9_scale_and_extend_frame_nonnormative() that
+ // supports arbitrary scaling.
+ const int x_step_q4 = 16 * src_w / dst_w;
+ const int y_step_q4 = 16 * src_h / dst_h;
+ const int is_arbitrary_scaling =
+ (bd == 8 && (x_step_q4 > 64 || y_step_q4 > 64)) ||
+ (bd >= 10 && (x_step_q4 > 32 || y_step_q4 > 32));
+ if (is_arbitrary_scaling) {
+ vp9_scale_and_extend_frame_nonnormative(src, dst, bd);
+ return;
+ }
+
const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer,
src->v_buffer };
const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride };
@@ -3352,19 +3453,6 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
vpx_extend_frame_inner_borders(cm->frame_to_show);
}
-static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) {
- RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx];
- if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows ||
- new_fb_ptr->mi_cols < cm->mi_cols) {
- vpx_free(new_fb_ptr->mvs);
- CHECK_MEM_ERROR(cm, new_fb_ptr->mvs,
- (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
- sizeof(*new_fb_ptr->mvs)));
- new_fb_ptr->mi_rows = cm->mi_rows;
- new_fb_ptr->mi_cols = cm->mi_cols;
- }
-}
-
void vp9_scale_references(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
MV_REFERENCE_FRAME ref_frame;
@@ -3711,7 +3799,7 @@ static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index,
case 6: l = 150; break;
}
if (!cpi->common.postproc_state.limits) {
- CHECK_MEM_ERROR(cm, cpi->common.postproc_state.limits,
+ CHECK_MEM_ERROR(&cm->error, cpi->common.postproc_state.limits,
vpx_calloc(cpi->un_scaled_source->y_width,
sizeof(*cpi->common.postproc_state.limits)));
}
@@ -3800,6 +3888,7 @@ static void set_frame_size(VP9_COMP *cpi) {
alloc_util_frame_buffers(cpi);
init_motion_estimation(cpi);
+ int has_valid_ref_frame = 0;
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1];
const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
@@ -3818,22 +3907,25 @@ static void set_frame_size(VP9_COMP *cpi) {
buf->y_crop_height, cm->width,
cm->height);
#endif // CONFIG_VP9_HIGHBITDEPTH
+ has_valid_ref_frame |= vp9_is_valid_scale(&ref_buf->sf);
if (vp9_is_scaled(&ref_buf->sf)) vpx_extend_frame_borders(buf);
} else {
ref_buf->buf = NULL;
}
}
+ if (!frame_is_intra_only(cm) && !has_valid_ref_frame) {
+ vpx_internal_error(
+ &cm->error, VPX_CODEC_CORRUPT_FRAME,
+ "Can't find at least one reference frame with valid size");
+ }
set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
}
-#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
static void save_encode_params(VP9_COMP *cpi) {
- VP9_COMMON *const cm = &cpi->common;
- const int tile_cols = 1 << cm->log2_tile_cols;
- const int tile_rows = 1 << cm->log2_tile_rows;
- int tile_col, tile_row;
+ int tile_idx;
int i, j;
+ TileDataEnc *tile_data;
RD_OPT *rd_opt = &cpi->rd;
for (i = 0; i < MAX_REF_FRAMES; i++) {
for (j = 0; j < REFERENCE_MODES; j++)
@@ -3844,21 +3936,12 @@ static void save_encode_params(VP9_COMP *cpi) {
rd_opt->filter_threshes_prev[i][j] = rd_opt->filter_threshes[i][j];
}
- if (cpi->tile_data != NULL) {
- for (tile_row = 0; tile_row < tile_rows; ++tile_row)
- for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
- TileDataEnc *tile_data =
- &cpi->tile_data[tile_row * tile_cols + tile_col];
- for (i = 0; i < BLOCK_SIZES; ++i) {
- for (j = 0; j < MAX_MODES; ++j) {
- tile_data->thresh_freq_fact_prev[i][j] =
- tile_data->thresh_freq_fact[i][j];
- }
- }
- }
+ for (tile_idx = 0; tile_idx < cpi->allocated_tiles; tile_idx++) {
+ assert(cpi->tile_data);
+ tile_data = &cpi->tile_data[tile_idx];
+ vp9_copy(tile_data->thresh_freq_fact_prev, tile_data->thresh_freq_fact);
}
}
-#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
static INLINE void set_raw_source_frame(VP9_COMP *cpi) {
#ifdef ENABLE_KF_DENOISE
@@ -4005,6 +4088,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
cpi->rc.hybrid_intra_scene_change = 0;
cpi->rc.re_encode_maxq_scene_change = 0;
if (cm->show_frame && cpi->oxcf.mode == REALTIME &&
+ !cpi->disable_scene_detection_rtc_ratectrl &&
(cpi->oxcf.rc_mode == VPX_VBR ||
cpi->oxcf.content == VP9E_CONTENT_SCREEN ||
(cpi->oxcf.speed >= 5 && cpi->oxcf.speed < 8)))
@@ -4067,7 +4151,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
svc->spatial_layer_id == svc->number_spatial_layers - 2) {
if (svc->prev_partition_svc == NULL) {
CHECK_MEM_ERROR(
- cm, svc->prev_partition_svc,
+ &cm->error, svc->prev_partition_svc,
(BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows,
sizeof(*svc->prev_partition_svc)));
}
@@ -4419,10 +4503,13 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
const int orig_rc_max_frame_bandwidth = rc->max_frame_bandwidth;
#if CONFIG_RATE_CTRL
- const FRAME_UPDATE_TYPE update_type =
- cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index];
- const ENCODE_FRAME_TYPE frame_type = get_encode_frame_type(update_type);
- RATE_QSTEP_MODEL *rq_model = &cpi->rq_model[frame_type];
+ RATE_QSTEP_MODEL *rq_model;
+ {
+ const FRAME_UPDATE_TYPE update_type =
+ cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index];
+ const ENCODE_FRAME_TYPE frame_type = get_encode_frame_type(update_type);
+ rq_model = &cpi->rq_model[frame_type];
+ }
init_rq_history(rq_history);
#endif // CONFIG_RATE_CTRL
@@ -4438,6 +4525,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
(cpi->twopass.gf_group.index == 1)
: 0;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ printf("\n Encoding a frame: \n");
+#endif
do {
vpx_clear_system_state();
@@ -4525,7 +4615,8 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
}
#endif // CONFIG_RATE_CTRL
if (cpi->ext_ratectrl.ready && !ext_rc_recode &&
- (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0) {
+ (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 &&
+ cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) {
vpx_codec_err_t codec_status;
const GF_GROUP *gf_group = &cpi->twopass.gf_group;
vpx_rc_encodeframe_decision_t encode_frame_decision;
@@ -4825,6 +4916,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF)
if (loop) restore_coding_context(cpi);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ if (loop) printf("\n Recoding:");
+#endif
} while (loop);
rc->max_frame_bandwidth = orig_rc_max_frame_bandwidth;
@@ -4922,13 +5016,14 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required(
scale_and_extend_frame(unscaled, scaled, (int)cm->bit_depth,
filter_type, phase_scaler);
else
- scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth);
+ vp9_scale_and_extend_frame_nonnormative(unscaled, scaled,
+ (int)cm->bit_depth);
#else
if (use_normative_scaler && unscaled->y_width <= (scaled->y_width << 1) &&
unscaled->y_height <= (scaled->y_height << 1))
vp9_scale_and_extend_frame(unscaled, scaled, filter_type, phase_scaler);
else
- scale_and_extend_frame_nonnormative(unscaled, scaled);
+ vp9_scale_and_extend_frame_nonnormative(unscaled, scaled);
#endif // CONFIG_VP9_HIGHBITDEPTH
return scaled;
} else {
@@ -4980,8 +5075,8 @@ static int setup_interp_filter_search_mask(VP9_COMP *cpi) {
#ifdef ENABLE_KF_DENOISE
// Baseline kernel weights for denoise
-static uint8_t dn_kernal_3[9] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 };
-static uint8_t dn_kernal_5[25] = { 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 4,
+static uint8_t dn_kernel_3[9] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 };
+static uint8_t dn_kernel_5[25] = { 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 4,
2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1 };
static INLINE void add_denoise_point(int centre_val, int data_val, int thresh,
@@ -4998,17 +5093,17 @@ static void spatial_denoise_point(uint8_t *src_ptr, const int stride,
int sum_weight = 0;
int sum_val = 0;
int thresh = strength;
- int kernal_size = 5;
+ int kernel_size = 5;
int half_k_size = 2;
int i, j;
int max_diff = 0;
uint8_t *tmp_ptr;
- uint8_t *kernal_ptr;
+ uint8_t *kernel_ptr;
// Find the maximum deviation from the source point in the locale.
tmp_ptr = src_ptr - (stride * (half_k_size + 1)) - (half_k_size + 1);
- for (i = 0; i < kernal_size + 2; ++i) {
- for (j = 0; j < kernal_size + 2; ++j) {
+ for (i = 0; i < kernel_size + 2; ++i) {
+ for (j = 0; j < kernel_size + 2; ++j) {
max_diff = VPXMAX(max_diff, abs((int)*src_ptr - (int)tmp_ptr[j]));
}
tmp_ptr += stride;
@@ -5016,19 +5111,19 @@ static void spatial_denoise_point(uint8_t *src_ptr, const int stride,
// Select the kernel size.
if (max_diff > (strength + (strength >> 1))) {
- kernal_size = 3;
+ kernel_size = 3;
half_k_size = 1;
thresh = thresh >> 1;
}
- kernal_ptr = (kernal_size == 3) ? dn_kernal_3 : dn_kernal_5;
+ kernel_ptr = (kernel_size == 3) ? dn_kernel_3 : dn_kernel_5;
// Apply the kernel
tmp_ptr = src_ptr - (stride * half_k_size) - half_k_size;
- for (i = 0; i < kernal_size; ++i) {
- for (j = 0; j < kernal_size; ++j) {
- add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernal_ptr,
+ for (i = 0; i < kernel_size; ++i) {
+ for (j = 0; j < kernel_size; ++j) {
+ add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernel_ptr,
&sum_val, &sum_weight);
- ++kernal_ptr;
+ ++kernel_ptr;
}
tmp_ptr += stride;
}
@@ -5043,17 +5138,17 @@ static void highbd_spatial_denoise_point(uint16_t *src_ptr, const int stride,
int sum_weight = 0;
int sum_val = 0;
int thresh = strength;
- int kernal_size = 5;
+ int kernel_size = 5;
int half_k_size = 2;
int i, j;
int max_diff = 0;
uint16_t *tmp_ptr;
- uint8_t *kernal_ptr;
+ uint8_t *kernel_ptr;
// Find the maximum deviation from the source point in the locale.
tmp_ptr = src_ptr - (stride * (half_k_size + 1)) - (half_k_size + 1);
- for (i = 0; i < kernal_size + 2; ++i) {
- for (j = 0; j < kernal_size + 2; ++j) {
+ for (i = 0; i < kernel_size + 2; ++i) {
+ for (j = 0; j < kernel_size + 2; ++j) {
max_diff = VPXMAX(max_diff, abs((int)src_ptr - (int)tmp_ptr[j]));
}
tmp_ptr += stride;
@@ -5061,19 +5156,19 @@ static void highbd_spatial_denoise_point(uint16_t *src_ptr, const int stride,
// Select the kernel size.
if (max_diff > (strength + (strength >> 1))) {
- kernal_size = 3;
+ kernel_size = 3;
half_k_size = 1;
thresh = thresh >> 1;
}
- kernal_ptr = (kernal_size == 3) ? dn_kernal_3 : dn_kernal_5;
+ kernel_ptr = (kernel_size == 3) ? dn_kernel_3 : dn_kernel_5;
// Apply the kernel
tmp_ptr = src_ptr - (stride * half_k_size) - half_k_size;
- for (i = 0; i < kernal_size; ++i) {
- for (j = 0; j < kernal_size; ++j) {
- add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernal_ptr,
+ for (i = 0; i < kernel_size; ++i) {
+ for (j = 0; j < kernel_size; ++j) {
+ add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernel_ptr,
&sum_val, &sum_weight);
- ++kernal_ptr;
+ ++kernel_ptr;
}
tmp_ptr += stride;
}
@@ -5260,7 +5355,7 @@ static void init_mb_wiener_var_buffer(VP9_COMP *cpi) {
cpi->mb_wiener_variance = NULL;
CHECK_MEM_ERROR(
- cm, cpi->mb_wiener_variance,
+ &cm->error, cpi->mb_wiener_variance,
vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->mb_wiener_variance)));
cpi->mb_wiener_var_rows = cm->mb_rows;
cpi->mb_wiener_var_cols = cm->mb_cols;
@@ -5319,16 +5414,16 @@ static void set_mb_wiener_variance(VP9_COMP *cpi) {
vpx_highbd_subtract_block(block_size, block_size, src_diff, block_size,
mb_buffer, buf_stride, zero_pred, block_size,
xd->bd);
- highbd_wht_fwd_txfm(src_diff, block_size, coeff, tx_size);
+ vp9_highbd_wht_fwd_txfm(src_diff, block_size, coeff, tx_size);
} else {
vpx_subtract_block(block_size, block_size, src_diff, block_size,
mb_buffer, buf_stride, zero_pred, block_size);
- wht_fwd_txfm(src_diff, block_size, coeff, tx_size);
+ vp9_wht_fwd_txfm(src_diff, block_size, coeff, tx_size);
}
#else
vpx_subtract_block(block_size, block_size, src_diff, block_size,
mb_buffer, buf_stride, zero_pred, block_size);
- wht_fwd_txfm(src_diff, block_size, coeff, tx_size);
+ vp9_wht_fwd_txfm(src_diff, block_size, coeff, tx_size);
#endif // CONFIG_VP9_HIGHBITDEPTH
coeff[0] = 0;
@@ -5447,26 +5542,7 @@ static void encode_frame_to_data_rate(
struct segmentation *const seg = &cm->seg;
TX_SIZE t;
- // SVC: skip encoding of enhancement layer if the layer target bandwidth = 0.
- // No need to set svc.skip_enhancement_layer if whole superframe will be
- // dropped.
- if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 &&
- cpi->oxcf.target_bandwidth == 0 &&
- !(cpi->svc.framedrop_mode != LAYER_DROP &&
- (cpi->svc.framedrop_mode != CONSTRAINED_FROM_ABOVE_DROP ||
- cpi->svc
- .force_drop_constrained_from_above[cpi->svc.number_spatial_layers -
- 1]) &&
- cpi->svc.drop_spatial_layer[0])) {
- cpi->svc.skip_enhancement_layer = 1;
- vp9_rc_postencode_update_drop_frame(cpi);
- cpi->ext_refresh_frame_flags_pending = 0;
- cpi->last_frame_dropped = 1;
- cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 1;
- cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id] = 1;
- vp9_inc_frame_in_layer(cpi);
- return;
- }
+ if (vp9_svc_check_skip_enhancement_layer(cpi)) return;
set_ext_overrides(cpi);
vpx_clear_system_state();
@@ -5484,6 +5560,11 @@ static void encode_frame_to_data_rate(
set_ref_sign_bias(cpi);
}
+ // On the very first frame set the deadline_mode_previous_frame to
+ // the current mode.
+ if (cpi->common.current_video_frame == 0)
+ cpi->deadline_mode_previous_frame = cpi->oxcf.mode;
+
// Set default state for segment based loop filter update flags.
cm->lf.mode_ref_delta_update = 0;
@@ -5531,16 +5612,11 @@ static void encode_frame_to_data_rate(
memset(cpi->mode_chosen_counts, 0,
MAX_MODES * sizeof(*cpi->mode_chosen_counts));
#endif
-#if CONFIG_CONSISTENT_RECODE
// Backup to ensure consistency between recodes
save_encode_params(cpi);
-#elif CONFIG_RATE_CTRL
- if (cpi->oxcf.use_simple_encode_api) {
- save_encode_params(cpi);
- }
-#endif
if (cpi->ext_ratectrl.ready &&
- (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0) {
+ (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0 &&
+ cpi->ext_ratectrl.funcs.get_frame_rdmult != NULL) {
vpx_codec_err_t codec_status;
const GF_GROUP *gf_group = &cpi->twopass.gf_group;
FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index];
@@ -5572,8 +5648,14 @@ static void encode_frame_to_data_rate(
#if !CONFIG_REALTIME_ONLY
#if CONFIG_RATE_CTRL
encode_with_recode_loop(cpi, size, dest, &encode_frame_result->rq_history);
-#else // CONFIG_RATE_CTRL
+#else // CONFIG_RATE_CTRL
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, encode_with_recode_loop_time);
+#endif
encode_with_recode_loop(cpi, size, dest);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, encode_with_recode_loop_time);
+#endif
#endif // CONFIG_RATE_CTRL
#endif // !CONFIG_REALTIME_ONLY
}
@@ -5632,15 +5714,28 @@ static void encode_frame_to_data_rate(
cm->frame_to_show->render_width = cm->render_width;
cm->frame_to_show->render_height = cm->render_height;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, loopfilter_frame_time);
+#endif
// Pick the loop filter level for the frame.
loopfilter_frame(cpi, cm);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, loopfilter_frame_time);
+#endif
if (cpi->rc.use_post_encode_drop) save_coding_context(cpi);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, vp9_pack_bitstream_time);
+#endif
// build the bitstream
vp9_pack_bitstream(cpi, dest, size);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, vp9_pack_bitstream_time);
+#endif
- if (cpi->ext_ratectrl.ready) {
+ if (cpi->ext_ratectrl.ready &&
+ cpi->ext_ratectrl.funcs.update_encodeframe_result != NULL) {
const RefCntBuffer *coded_frame_buf =
get_ref_cnt_buffer(cm, cm->new_fb_idx);
vpx_codec_err_t codec_status = vp9_extrc_update_encodeframe_result(
@@ -6228,1391 +6323,6 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) {
}
}
-typedef struct GF_PICTURE {
- YV12_BUFFER_CONFIG *frame;
- int ref_frame[3];
- FRAME_UPDATE_TYPE update_type;
-} GF_PICTURE;
-
-static void init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture,
- const GF_GROUP *gf_group, int *tpl_group_frames) {
- VP9_COMMON *cm = &cpi->common;
- int frame_idx = 0;
- int i;
- int gld_index = -1;
- int alt_index = -1;
- int lst_index = -1;
- int arf_index_stack[MAX_ARF_LAYERS];
- int arf_stack_size = 0;
- int extend_frame_count = 0;
- int pframe_qindex = cpi->tpl_stats[2].base_qindex;
- int frame_gop_offset = 0;
-
- RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs;
- int8_t recon_frame_index[REFS_PER_FRAME + MAX_ARF_LAYERS];
-
- memset(recon_frame_index, -1, sizeof(recon_frame_index));
- stack_init(arf_index_stack, MAX_ARF_LAYERS);
-
- // TODO(jingning): To be used later for gf frame type parsing.
- (void)gf_group;
-
- for (i = 0; i < FRAME_BUFFERS; ++i) {
- if (frame_bufs[i].ref_count == 0) {
- alloc_frame_mvs(cm, i);
- if (vpx_realloc_frame_buffer(&frame_bufs[i].buf, cm->width, cm->height,
- cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_VP9_HIGHBITDEPTH
- cm->use_highbitdepth,
-#endif
- VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
- NULL, NULL, NULL))
- vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
- "Failed to allocate frame buffer");
-
- recon_frame_index[frame_idx] = i;
- ++frame_idx;
-
- if (frame_idx >= REFS_PER_FRAME + cpi->oxcf.enable_auto_arf) break;
- }
- }
-
- for (i = 0; i < REFS_PER_FRAME + 1; ++i) {
- assert(recon_frame_index[i] >= 0);
- cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf;
- }
-
- *tpl_group_frames = 0;
-
- // Initialize Golden reference frame.
- gf_picture[0].frame = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
- for (i = 0; i < 3; ++i) gf_picture[0].ref_frame[i] = -1;
- gf_picture[0].update_type = gf_group->update_type[0];
- gld_index = 0;
- ++*tpl_group_frames;
-
- // Initialize base layer ARF frame
- gf_picture[1].frame = cpi->Source;
- gf_picture[1].ref_frame[0] = gld_index;
- gf_picture[1].ref_frame[1] = lst_index;
- gf_picture[1].ref_frame[2] = alt_index;
- gf_picture[1].update_type = gf_group->update_type[1];
- alt_index = 1;
- ++*tpl_group_frames;
-
- // Initialize P frames
- for (frame_idx = 2; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) {
- struct lookahead_entry *buf;
- frame_gop_offset = gf_group->frame_gop_index[frame_idx];
- buf = vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1);
-
- if (buf == NULL) break;
-
- gf_picture[frame_idx].frame = &buf->img;
- gf_picture[frame_idx].ref_frame[0] = gld_index;
- gf_picture[frame_idx].ref_frame[1] = lst_index;
- gf_picture[frame_idx].ref_frame[2] = alt_index;
- gf_picture[frame_idx].update_type = gf_group->update_type[frame_idx];
-
- switch (gf_group->update_type[frame_idx]) {
- case ARF_UPDATE:
- stack_push(arf_index_stack, alt_index, arf_stack_size);
- ++arf_stack_size;
- alt_index = frame_idx;
- break;
- case LF_UPDATE: lst_index = frame_idx; break;
- case OVERLAY_UPDATE:
- gld_index = frame_idx;
- alt_index = stack_pop(arf_index_stack, arf_stack_size);
- --arf_stack_size;
- break;
- case USE_BUF_FRAME:
- lst_index = alt_index;
- alt_index = stack_pop(arf_index_stack, arf_stack_size);
- --arf_stack_size;
- break;
- default: break;
- }
-
- ++*tpl_group_frames;
-
- // The length of group of pictures is baseline_gf_interval, plus the
- // beginning golden frame from last GOP, plus the last overlay frame in
- // the same GOP.
- if (frame_idx == gf_group->gf_group_size) break;
- }
-
- alt_index = -1;
- ++frame_idx;
- ++frame_gop_offset;
-
- // Extend two frames outside the current gf group.
- for (; frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; ++frame_idx) {
- struct lookahead_entry *buf =
- vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1);
-
- if (buf == NULL) break;
-
- cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex;
-
- gf_picture[frame_idx].frame = &buf->img;
- gf_picture[frame_idx].ref_frame[0] = gld_index;
- gf_picture[frame_idx].ref_frame[1] = lst_index;
- gf_picture[frame_idx].ref_frame[2] = alt_index;
- gf_picture[frame_idx].update_type = LF_UPDATE;
- lst_index = frame_idx;
- ++*tpl_group_frames;
- ++extend_frame_count;
- ++frame_gop_offset;
- }
-}
-
-static void init_tpl_stats(VP9_COMP *cpi) {
- int frame_idx;
- for (frame_idx = 0; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) {
- TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
- memset(tpl_frame->tpl_stats_ptr, 0,
- tpl_frame->height * tpl_frame->width *
- sizeof(*tpl_frame->tpl_stats_ptr));
- tpl_frame->is_valid = 0;
- }
-}
-
-#if CONFIG_NON_GREEDY_MV
-static uint32_t full_pixel_motion_search(VP9_COMP *cpi, ThreadData *td,
- MotionField *motion_field,
- int frame_idx, uint8_t *cur_frame_buf,
- uint8_t *ref_frame_buf, int stride,
- BLOCK_SIZE bsize, int mi_row,
- int mi_col, MV *mv) {
- MACROBLOCK *const x = &td->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
- MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
- int step_param;
- uint32_t bestsme = UINT_MAX;
- const MvLimits tmp_mv_limits = x->mv_limits;
- // lambda is used to adjust the importance of motion vector consistency.
- // TODO(angiebird): Figure out lambda's proper value.
- const int lambda = cpi->tpl_stats[frame_idx].lambda;
- int_mv nb_full_mvs[NB_MVS_NUM];
- int nb_full_mv_num;
-
- MV best_ref_mv1 = { 0, 0 };
- MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
-
- best_ref_mv1_full.col = best_ref_mv1.col >> 3;
- best_ref_mv1_full.row = best_ref_mv1.row >> 3;
-
- // Setup frame pointers
- x->plane[0].src.buf = cur_frame_buf;
- x->plane[0].src.stride = stride;
- xd->plane[0].pre[0].buf = ref_frame_buf;
- xd->plane[0].pre[0].stride = stride;
-
- step_param = mv_sf->reduce_first_step_size;
- step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2);
-
- vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
-
- nb_full_mv_num =
- vp9_prepare_nb_full_mvs(motion_field, mi_row, mi_col, nb_full_mvs);
- vp9_full_pixel_diamond_new(cpi, x, bsize, &best_ref_mv1_full, step_param,
- lambda, 1, nb_full_mvs, nb_full_mv_num, mv);
-
- /* restore UMV window */
- x->mv_limits = tmp_mv_limits;
-
- return bestsme;
-}
-
-static uint32_t sub_pixel_motion_search(VP9_COMP *cpi, ThreadData *td,
- uint8_t *cur_frame_buf,
- uint8_t *ref_frame_buf, int stride,
- BLOCK_SIZE bsize, MV *mv) {
- MACROBLOCK *const x = &td->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
- MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
- uint32_t bestsme = UINT_MAX;
- uint32_t distortion;
- uint32_t sse;
- int cost_list[5];
-
- MV best_ref_mv1 = { 0, 0 };
-
- // Setup frame pointers
- x->plane[0].src.buf = cur_frame_buf;
- x->plane[0].src.stride = stride;
- xd->plane[0].pre[0].buf = ref_frame_buf;
- xd->plane[0].pre[0].stride = stride;
-
- // TODO(yunqing): may use higher tap interp filter than 2 taps.
- // Ignore mv costing by sending NULL pointer instead of cost array
- bestsme = cpi->find_fractional_mv_step(
- x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit,
- &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level,
- cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0,
- USE_2_TAPS);
-
- return bestsme;
-}
-
-#else // CONFIG_NON_GREEDY_MV
-static uint32_t motion_compensated_prediction(VP9_COMP *cpi, ThreadData *td,
- uint8_t *cur_frame_buf,
- uint8_t *ref_frame_buf,
- int stride, BLOCK_SIZE bsize,
- MV *mv) {
- MACROBLOCK *const x = &td->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
- MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
- const SEARCH_METHODS search_method = NSTEP;
- int step_param;
- int sadpb = x->sadperbit16;
- uint32_t bestsme = UINT_MAX;
- uint32_t distortion;
- uint32_t sse;
- int cost_list[5];
- const MvLimits tmp_mv_limits = x->mv_limits;
-
- MV best_ref_mv1 = { 0, 0 };
- MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
-
- best_ref_mv1_full.col = best_ref_mv1.col >> 3;
- best_ref_mv1_full.row = best_ref_mv1.row >> 3;
-
- // Setup frame pointers
- x->plane[0].src.buf = cur_frame_buf;
- x->plane[0].src.stride = stride;
- xd->plane[0].pre[0].buf = ref_frame_buf;
- xd->plane[0].pre[0].stride = stride;
-
- step_param = mv_sf->reduce_first_step_size;
- step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2);
-
- vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
-
- vp9_full_pixel_search(cpi, x, bsize, &best_ref_mv1_full, step_param,
- search_method, sadpb, cond_cost_list(cpi, cost_list),
- &best_ref_mv1, mv, 0, 0);
-
- /* restore UMV window */
- x->mv_limits = tmp_mv_limits;
-
- // TODO(yunqing): may use higher tap interp filter than 2 taps.
- // Ignore mv costing by sending NULL pointer instead of cost array
- bestsme = cpi->find_fractional_mv_step(
- x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit,
- &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level,
- cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0,
- USE_2_TAPS);
-
- return bestsme;
-}
-#endif
-
-static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row,
- int ref_pos_col, int block, BLOCK_SIZE bsize) {
- int width = 0, height = 0;
- int bw = 4 << b_width_log2_lookup[bsize];
- int bh = 4 << b_height_log2_lookup[bsize];
-
- switch (block) {
- case 0:
- width = grid_pos_col + bw - ref_pos_col;
- height = grid_pos_row + bh - ref_pos_row;
- break;
- case 1:
- width = ref_pos_col + bw - grid_pos_col;
- height = grid_pos_row + bh - ref_pos_row;
- break;
- case 2:
- width = grid_pos_col + bw - ref_pos_col;
- height = ref_pos_row + bh - grid_pos_row;
- break;
- case 3:
- width = ref_pos_col + bw - grid_pos_col;
- height = ref_pos_row + bh - grid_pos_row;
- break;
- default: assert(0);
- }
-
- return width * height;
-}
-
-static int round_floor(int ref_pos, int bsize_pix) {
- int round;
- if (ref_pos < 0)
- round = -(1 + (-ref_pos - 1) / bsize_pix);
- else
- round = ref_pos / bsize_pix;
-
- return round;
-}
-
-static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col,
- BLOCK_SIZE bsize, int stride) {
- const int mi_height = num_8x8_blocks_high_lookup[bsize];
- const int mi_width = num_8x8_blocks_wide_lookup[bsize];
- const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col];
- int idx, idy;
-
- for (idy = 0; idy < mi_height; ++idy) {
- for (idx = 0; idx < mi_width; ++idx) {
- TplDepStats *tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col + idx];
- const int64_t mc_flow = tpl_ptr->mc_flow;
- const int64_t mc_ref_cost = tpl_ptr->mc_ref_cost;
- *tpl_ptr = *src_stats;
- tpl_ptr->mc_flow = mc_flow;
- tpl_ptr->mc_ref_cost = mc_ref_cost;
- tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow;
- }
- }
-}
-
-static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats,
- int mi_row, int mi_col, const BLOCK_SIZE bsize) {
- TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index];
- TplDepStats *ref_stats = ref_tpl_frame->tpl_stats_ptr;
- MV mv = tpl_stats->mv.as_mv;
- int mv_row = mv.row >> 3;
- int mv_col = mv.col >> 3;
-
- int ref_pos_row = mi_row * MI_SIZE + mv_row;
- int ref_pos_col = mi_col * MI_SIZE + mv_col;
-
- const int bw = 4 << b_width_log2_lookup[bsize];
- const int bh = 4 << b_height_log2_lookup[bsize];
- const int mi_height = num_8x8_blocks_high_lookup[bsize];
- const int mi_width = num_8x8_blocks_wide_lookup[bsize];
- const int pix_num = bw * bh;
-
- // top-left on grid block location in pixel
- int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh;
- int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw;
- int block;
-
- for (block = 0; block < 4; ++block) {
- int grid_pos_row = grid_pos_row_base + bh * (block >> 1);
- int grid_pos_col = grid_pos_col_base + bw * (block & 0x01);
-
- if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE &&
- grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) {
- int overlap_area = get_overlap_area(
- grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize);
- int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height;
- int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width;
-
- int64_t mc_flow = tpl_stats->mc_dep_cost -
- (tpl_stats->mc_dep_cost * tpl_stats->inter_cost) /
- tpl_stats->intra_cost;
-
- int idx, idy;
-
- for (idy = 0; idy < mi_height; ++idy) {
- for (idx = 0; idx < mi_width; ++idx) {
- TplDepStats *des_stats =
- &ref_stats[(ref_mi_row + idy) * ref_tpl_frame->stride +
- (ref_mi_col + idx)];
-
- des_stats->mc_flow += (mc_flow * overlap_area) / pix_num;
- des_stats->mc_ref_cost +=
- ((tpl_stats->intra_cost - tpl_stats->inter_cost) * overlap_area) /
- pix_num;
- assert(overlap_area >= 0);
- }
- }
- }
- }
-}
-
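In other words, each block first measures how much of its dependency cost inter prediction actually recovers, mc_flow = mc_dep_cost * (1 - inter_cost / intra_cost), and then hands that amount to the reference-frame blocks it predicts from, weighted by overlap_area / pix_num. With hypothetical numbers mc_dep_cost = 1000, inter_cost = 300, intra_cost = 600 and a 25% overlap, the destination block's mc_flow grows by 1000 * (1 - 300/600) * 0.25 = 125 and its mc_ref_cost by (600 - 300) * 0.25 = 75.
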
-static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats,
- int mi_row, int mi_col, const BLOCK_SIZE bsize) {
- int idx, idy;
- const int mi_height = num_8x8_blocks_high_lookup[bsize];
- const int mi_width = num_8x8_blocks_wide_lookup[bsize];
-
- for (idy = 0; idy < mi_height; ++idy) {
- for (idx = 0; idx < mi_width; ++idx) {
- TplDepStats *tpl_ptr =
- &tpl_stats[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)];
- tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx,
- BLOCK_8X8);
- }
- }
-}
-
-static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff,
- tran_low_t *qcoeff, tran_low_t *dqcoeff,
- TX_SIZE tx_size, int64_t *recon_error,
- int64_t *sse) {
- MACROBLOCKD *const xd = &x->e_mbd;
- const struct macroblock_plane *const p = &x->plane[plane];
- const struct macroblockd_plane *const pd = &xd->plane[plane];
- const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
- uint16_t eob;
- int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]];
- const int shift = tx_size == TX_32X32 ? 0 : 2;
-
- // The skip-block condition should be handled before this is called.
- assert(!x->skip_block);
-
-#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- vp9_highbd_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp,
- qcoeff, dqcoeff, pd->dequant, &eob,
- scan_order->scan, scan_order->iscan);
- } else {
- vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff,
- dqcoeff, pd->dequant, &eob, scan_order->scan,
- scan_order->iscan);
- }
-#else
- vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff,
- dqcoeff, pd->dequant, &eob, scan_order->scan,
- scan_order->iscan);
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
- *recon_error = vp9_block_error(coeff, dqcoeff, pix_num, sse) >> shift;
- *recon_error = VPXMAX(*recon_error, 1);
-
- *sse = (*sse) >> shift;
- *sse = VPXMAX(*sse, 1);
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
- TX_SIZE tx_size) {
- // TODO(sdeng): Implement SIMD based high bit-depth Hadamard transforms.
- switch (tx_size) {
- case TX_8X8: vpx_highbd_hadamard_8x8(src_diff, bw, coeff); break;
- case TX_16X16: vpx_highbd_hadamard_16x16(src_diff, bw, coeff); break;
- case TX_32X32: vpx_highbd_hadamard_32x32(src_diff, bw, coeff); break;
- default: assert(0);
- }
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
-void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
- TX_SIZE tx_size) {
- switch (tx_size) {
- case TX_8X8: vpx_hadamard_8x8(src_diff, bw, coeff); break;
- case TX_16X16: vpx_hadamard_16x16(src_diff, bw, coeff); break;
- case TX_32X32: vpx_hadamard_32x32(src_diff, bw, coeff); break;
- default: assert(0);
- }
-}
-
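Both the intra and inter candidates scored below use the SATD (sum of absolute transformed differences) of the Hadamard-transformed residual as a cheap stand-in for coding cost, avoiding a full transform/quantize/entropy pass. A minimal, self-contained sketch of that metric on a generic 8x8 residual, using a plain separable Walsh-Hadamard transform rather than the optimized vpx_hadamard_* / vpx_satd kernels:

#include <stdio.h>
#include <stdlib.h>

#define N 8 /* 8x8 residual block, for illustration only */

/* In-place 1-D Walsh-Hadamard transform (unnormalized) of a length-N vector. */
static void wht1d(int v[N]) {
  int half, i;
  for (half = 1; half < N; half <<= 1) {
    for (i = 0; i < N; ++i) {
      if (!(i & half)) {
        const int a = v[i], b = v[i + half];
        v[i] = a + b;
        v[i + half] = a - b;
      }
    }
  }
}

/* SATD: transform rows, then columns, then sum the absolute coefficients. */
static long satd8x8(int blk[N][N]) {
  int r, c;
  long sum = 0;
  for (r = 0; r < N; ++r) wht1d(blk[r]);
  for (c = 0; c < N; ++c) {
    int col[N];
    for (r = 0; r < N; ++r) col[r] = blk[r][c];
    wht1d(col);
    for (r = 0; r < N; ++r) sum += labs((long)col[r]);
  }
  return sum;
}

int main(void) {
  int blk[N][N], r, c;
  for (r = 0; r < N; ++r)
    for (c = 0; c < N; ++c) blk[r][c] = (r + 1) * (c - 3); /* toy residual */
  printf("SATD = %ld\n", satd8x8(blk));
  return 0;
}
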
-static void set_mv_limits(const VP9_COMMON *cm, MACROBLOCK *x, int mi_row,
- int mi_col) {
- x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND));
- x->mv_limits.row_max =
- (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * VP9_INTERP_EXTEND);
- x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND));
- x->mv_limits.col_max =
- ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND);
-}
-
-static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
- struct scale_factors *sf, GF_PICTURE *gf_picture,
- int frame_idx, TplDepFrame *tpl_frame,
- int16_t *src_diff, tran_low_t *coeff,
- tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row,
- int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size,
- YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor,
- int64_t *recon_error, int64_t *sse) {
- VP9_COMMON *cm = &cpi->common;
- ThreadData *td = &cpi->td;
-
- const int bw = 4 << b_width_log2_lookup[bsize];
- const int bh = 4 << b_height_log2_lookup[bsize];
- const int pix_num = bw * bh;
- int best_rf_idx = -1;
- int_mv best_mv;
- int64_t best_inter_cost = INT64_MAX;
- int64_t inter_cost;
- int rf_idx;
- const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP];
-
- int64_t best_intra_cost = INT64_MAX;
- int64_t intra_cost;
- PREDICTION_MODE mode;
- int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
- MODE_INFO mi_above, mi_left;
- const int mi_height = num_8x8_blocks_high_lookup[bsize];
- const int mi_width = num_8x8_blocks_wide_lookup[bsize];
- TplDepStats *tpl_stats =
- &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col];
-
- xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
- xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8;
- xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
- xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * MI_SIZE) * 8;
- xd->above_mi = (mi_row > 0) ? &mi_above : NULL;
- xd->left_mi = (mi_col > 0) ? &mi_left : NULL;
-
- // Intra prediction search
- for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
- uint8_t *src, *dst;
- int src_stride, dst_stride;
-
- src = xd->cur_buf->y_buffer + mb_y_offset;
- src_stride = xd->cur_buf->y_stride;
-
- dst = &predictor[0];
- dst_stride = bw;
-
- xd->mi[0]->sb_type = bsize;
- xd->mi[0]->ref_frame[0] = INTRA_FRAME;
-
- vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode, src,
- src_stride, dst, dst_stride, 0, 0, 0);
-
-#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- vpx_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
- dst_stride, xd->bd);
- highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
- intra_cost = vpx_highbd_satd(coeff, pix_num);
- } else {
- vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
- dst_stride);
- wht_fwd_txfm(src_diff, bw, coeff, tx_size);
- intra_cost = vpx_satd(coeff, pix_num);
- }
-#else
- vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, dst_stride);
- wht_fwd_txfm(src_diff, bw, coeff, tx_size);
- intra_cost = vpx_satd(coeff, pix_num);
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
- if (intra_cost < best_intra_cost) best_intra_cost = intra_cost;
- }
-
- // Motion compensated prediction
- best_mv.as_int = 0;
-
- set_mv_limits(cm, x, mi_row, mi_col);
-
- for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
- int_mv mv;
-#if CONFIG_NON_GREEDY_MV
- MotionField *motion_field;
-#endif
- if (ref_frame[rf_idx] == NULL) continue;
-
-#if CONFIG_NON_GREEDY_MV
- (void)td;
- motion_field = vp9_motion_field_info_get_motion_field(
- &cpi->motion_field_info, frame_idx, rf_idx, bsize);
- mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col);
-#else
- motion_compensated_prediction(cpi, td, xd->cur_buf->y_buffer + mb_y_offset,
- ref_frame[rf_idx]->y_buffer + mb_y_offset,
- xd->cur_buf->y_stride, bsize, &mv.as_mv);
-#endif
-
-#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- vp9_highbd_build_inter_predictor(
- CONVERT_TO_SHORTPTR(ref_frame[rf_idx]->y_buffer + mb_y_offset),
- ref_frame[rf_idx]->y_stride, CONVERT_TO_SHORTPTR(&predictor[0]), bw,
- &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE,
- mi_row * MI_SIZE, xd->bd);
- vpx_highbd_subtract_block(
- bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset,
- xd->cur_buf->y_stride, &predictor[0], bw, xd->bd);
- highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
- inter_cost = vpx_highbd_satd(coeff, pix_num);
- } else {
- vp9_build_inter_predictor(
- ref_frame[rf_idx]->y_buffer + mb_y_offset,
- ref_frame[rf_idx]->y_stride, &predictor[0], bw, &mv.as_mv, sf, bw, bh,
- 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE);
- vpx_subtract_block(bh, bw, src_diff, bw,
- xd->cur_buf->y_buffer + mb_y_offset,
- xd->cur_buf->y_stride, &predictor[0], bw);
- wht_fwd_txfm(src_diff, bw, coeff, tx_size);
- inter_cost = vpx_satd(coeff, pix_num);
- }
-#else
- vp9_build_inter_predictor(ref_frame[rf_idx]->y_buffer + mb_y_offset,
- ref_frame[rf_idx]->y_stride, &predictor[0], bw,
- &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3,
- mi_col * MI_SIZE, mi_row * MI_SIZE);
- vpx_subtract_block(bh, bw, src_diff, bw,
- xd->cur_buf->y_buffer + mb_y_offset,
- xd->cur_buf->y_stride, &predictor[0], bw);
- wht_fwd_txfm(src_diff, bw, coeff, tx_size);
- inter_cost = vpx_satd(coeff, pix_num);
-#endif
-
- if (inter_cost < best_inter_cost) {
- best_rf_idx = rf_idx;
- best_inter_cost = inter_cost;
- best_mv.as_int = mv.as_int;
- get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error,
- sse);
- }
- }
- best_intra_cost = VPXMAX(best_intra_cost, 1);
- best_inter_cost = VPXMIN(best_intra_cost, best_inter_cost);
- tpl_stats->inter_cost = VPXMAX(
- 1, (best_inter_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width));
- tpl_stats->intra_cost = VPXMAX(
- 1, (best_intra_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width));
- tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx];
- tpl_stats->mv.as_int = best_mv.as_int;
-}
-
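The final scaling above turns the stored costs into per-8x8-unit quantities: both best costs are left-shifted by TPL_DEP_COST_SCALE_LOG2 for fixed-point precision and divided by mi_height * mi_width, so that tpl_model_update(), which walks the block in 8x8 steps, can propagate each unit independently. For a hypothetical 32x32 block (16 8x8 units) with best_intra_cost = 4800, each unit carries 4800 / 16 = 300 before the fixed-point shift.
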
-#if CONFIG_NON_GREEDY_MV
-static int get_block_src_pred_buf(MACROBLOCKD *xd, GF_PICTURE *gf_picture,
- int frame_idx, int rf_idx, int mi_row,
- int mi_col, struct buf_2d *src,
- struct buf_2d *pre) {
- const int mb_y_offset =
- mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
- YV12_BUFFER_CONFIG *ref_frame = NULL;
- int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx];
- if (ref_frame_idx != -1) {
- ref_frame = gf_picture[ref_frame_idx].frame;
- src->buf = xd->cur_buf->y_buffer + mb_y_offset;
- src->stride = xd->cur_buf->y_stride;
- pre->buf = ref_frame->y_buffer + mb_y_offset;
- pre->stride = ref_frame->y_stride;
- assert(src->stride == pre->stride);
- return 1;
- } else {
- printf("invalid ref_frame_idx");
- assert(ref_frame_idx != -1);
- return 0;
- }
-}
-
-#define kMvPreCheckLines 5
-#define kMvPreCheckSize 15
-
-#define MV_REF_POS_NUM 3
-POSITION mv_ref_pos[MV_REF_POS_NUM] = {
- { -1, 0 },
- { 0, -1 },
- { -1, -1 },
-};
-
-static int_mv *get_select_mv(VP9_COMP *cpi, TplDepFrame *tpl_frame, int mi_row,
- int mi_col) {
- return &cpi->select_mv_arr[mi_row * tpl_frame->stride + mi_col];
-}
-
-static int_mv find_ref_mv(int mv_mode, VP9_COMP *cpi, TplDepFrame *tpl_frame,
- BLOCK_SIZE bsize, int mi_row, int mi_col) {
- int i;
- const int mi_height = num_8x8_blocks_high_lookup[bsize];
- const int mi_width = num_8x8_blocks_wide_lookup[bsize];
- int_mv nearest_mv, near_mv, invalid_mv;
- nearest_mv.as_int = INVALID_MV;
- near_mv.as_int = INVALID_MV;
- invalid_mv.as_int = INVALID_MV;
- for (i = 0; i < MV_REF_POS_NUM; ++i) {
- int nb_row = mi_row + mv_ref_pos[i].row * mi_height;
- int nb_col = mi_col + mv_ref_pos[i].col * mi_width;
- assert(mv_ref_pos[i].row <= 0);
- assert(mv_ref_pos[i].col <= 0);
- if (nb_row >= 0 && nb_col >= 0) {
- if (nearest_mv.as_int == INVALID_MV) {
- nearest_mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col);
- } else {
- int_mv mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col);
- if (mv.as_int == nearest_mv.as_int) {
- continue;
- } else {
- near_mv = mv;
- break;
- }
- }
- }
- }
- if (nearest_mv.as_int == INVALID_MV) {
- nearest_mv.as_mv.row = 0;
- nearest_mv.as_mv.col = 0;
- }
- if (near_mv.as_int == INVALID_MV) {
- near_mv.as_mv.row = 0;
- near_mv.as_mv.col = 0;
- }
- if (mv_mode == NEAREST_MV_MODE) {
- return nearest_mv;
- }
- if (mv_mode == NEAR_MV_MODE) {
- return near_mv;
- }
- assert(0);
- return invalid_mv;
-}
-
-static int_mv get_mv_from_mv_mode(int mv_mode, VP9_COMP *cpi,
- MotionField *motion_field,
- TplDepFrame *tpl_frame, BLOCK_SIZE bsize,
- int mi_row, int mi_col) {
- int_mv mv;
- switch (mv_mode) {
- case ZERO_MV_MODE:
- mv.as_mv.row = 0;
- mv.as_mv.col = 0;
- break;
- case NEW_MV_MODE:
- mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col);
- break;
- case NEAREST_MV_MODE:
- mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col);
- break;
- case NEAR_MV_MODE:
- mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col);
- break;
- default:
- mv.as_int = INVALID_MV;
- assert(0);
- break;
- }
- return mv;
-}
-
-static double get_mv_dist(int mv_mode, VP9_COMP *cpi, MACROBLOCKD *xd,
- GF_PICTURE *gf_picture, MotionField *motion_field,
- int frame_idx, TplDepFrame *tpl_frame, int rf_idx,
- BLOCK_SIZE bsize, int mi_row, int mi_col,
- int_mv *mv) {
- uint32_t sse;
- struct buf_2d src;
- struct buf_2d pre;
- MV full_mv;
- *mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame, bsize,
- mi_row, mi_col);
- full_mv = get_full_mv(&mv->as_mv);
- if (get_block_src_pred_buf(xd, gf_picture, frame_idx, rf_idx, mi_row, mi_col,
- &src, &pre)) {
- // TODO(angiebird): Consider subpixel when computing the sse.
- cpi->fn_ptr[bsize].vf(src.buf, src.stride, get_buf_from_mv(&pre, &full_mv),
- pre.stride, &sse);
- return (double)(sse << VP9_DIST_SCALE_LOG2);
- } else {
- assert(0);
- return 0;
- }
-}
-
-static int get_mv_mode_cost(int mv_mode) {
- // TODO(angiebird): The probabilities are roughly inferred from
- // default_inter_mode_probs. Check if there is a better way to set the
- // probabilities.
- const int zero_mv_prob = 16;
- const int new_mv_prob = 24 * 1;
- const int ref_mv_prob = 256 - zero_mv_prob - new_mv_prob;
- assert(zero_mv_prob + new_mv_prob + ref_mv_prob == 256);
- switch (mv_mode) {
- case ZERO_MV_MODE: return vp9_prob_cost[zero_mv_prob]; break;
- case NEW_MV_MODE: return vp9_prob_cost[new_mv_prob]; break;
- case NEAREST_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break;
- case NEAR_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break;
- default: assert(0); return -1;
- }
-}
-
-static INLINE double get_mv_diff_cost(MV *new_mv, MV *ref_mv) {
- double mv_diff_cost = log2(1 + abs(new_mv->row - ref_mv->row)) +
- log2(1 + abs(new_mv->col - ref_mv->col));
- mv_diff_cost *= (1 << VP9_PROB_COST_SHIFT);
- return mv_diff_cost;
-}
-static double get_mv_cost(int mv_mode, VP9_COMP *cpi, MotionField *motion_field,
- TplDepFrame *tpl_frame, BLOCK_SIZE bsize, int mi_row,
- int mi_col) {
- double mv_cost = get_mv_mode_cost(mv_mode);
- if (mv_mode == NEW_MV_MODE) {
- MV new_mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame,
- bsize, mi_row, mi_col)
- .as_mv;
- MV nearest_mv = get_mv_from_mv_mode(NEAREST_MV_MODE, cpi, motion_field,
- tpl_frame, bsize, mi_row, mi_col)
- .as_mv;
- MV near_mv = get_mv_from_mv_mode(NEAR_MV_MODE, cpi, motion_field, tpl_frame,
- bsize, mi_row, mi_col)
- .as_mv;
- double nearest_cost = get_mv_diff_cost(&new_mv, &nearest_mv);
- double near_cost = get_mv_diff_cost(&new_mv, &near_mv);
- mv_cost += nearest_cost < near_cost ? nearest_cost : near_cost;
- }
- return mv_cost;
-}
-
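The rate model for a candidate motion vector therefore has two parts: a fixed per-mode cost taken from rough mode probabilities via vp9_prob_cost[], plus, for NEW_MV_MODE only, a differential cost of roughly log2(1 + |d_row|) + log2(1 + |d_col|) bits against the cheaper of the nearest/near references, scaled by 1 << VP9_PROB_COST_SHIFT. For example (hypothetical vectors), a new MV of (12, -3) measured against a nearest MV of (8, 0) adds log2(1 + 4) + log2(1 + 3) ≈ 4.32 bits before scaling.
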
-static double eval_mv_mode(int mv_mode, VP9_COMP *cpi, MACROBLOCK *x,
- GF_PICTURE *gf_picture, MotionField *motion_field,
- int frame_idx, TplDepFrame *tpl_frame, int rf_idx,
- BLOCK_SIZE bsize, int mi_row, int mi_col,
- int_mv *mv) {
- MACROBLOCKD *xd = &x->e_mbd;
- double mv_dist =
- get_mv_dist(mv_mode, cpi, xd, gf_picture, motion_field, frame_idx,
- tpl_frame, rf_idx, bsize, mi_row, mi_col, mv);
- double mv_cost =
- get_mv_cost(mv_mode, cpi, motion_field, tpl_frame, bsize, mi_row, mi_col);
- double mult = 180;
-
- return mv_cost + mult * log2f(1 + mv_dist);
-}
-
-static int find_best_ref_mv_mode(VP9_COMP *cpi, MACROBLOCK *x,
- GF_PICTURE *gf_picture,
- MotionField *motion_field, int frame_idx,
- TplDepFrame *tpl_frame, int rf_idx,
- BLOCK_SIZE bsize, int mi_row, int mi_col,
- double *rd, int_mv *mv) {
- int best_mv_mode = ZERO_MV_MODE;
- int update = 0;
- int mv_mode;
- *rd = 0;
- for (mv_mode = 0; mv_mode < MAX_MV_MODE; ++mv_mode) {
- double this_rd;
- int_mv this_mv;
- if (mv_mode == NEW_MV_MODE) {
- continue;
- }
- this_rd = eval_mv_mode(mv_mode, cpi, x, gf_picture, motion_field, frame_idx,
- tpl_frame, rf_idx, bsize, mi_row, mi_col, &this_mv);
- if (update == 0) {
- *rd = this_rd;
- *mv = this_mv;
- best_mv_mode = mv_mode;
- update = 1;
- } else {
- if (this_rd < *rd) {
- *rd = this_rd;
- *mv = this_mv;
- best_mv_mode = mv_mode;
- }
- }
- }
- return best_mv_mode;
-}
-
-static void predict_mv_mode(VP9_COMP *cpi, MACROBLOCK *x,
- GF_PICTURE *gf_picture, MotionField *motion_field,
- int frame_idx, TplDepFrame *tpl_frame, int rf_idx,
- BLOCK_SIZE bsize, int mi_row, int mi_col) {
- const int mi_height = num_8x8_blocks_high_lookup[bsize];
- const int mi_width = num_8x8_blocks_wide_lookup[bsize];
- int tmp_mv_mode_arr[kMvPreCheckSize];
- int *mv_mode_arr = tpl_frame->mv_mode_arr[rf_idx];
- double *rd_diff_arr = tpl_frame->rd_diff_arr[rf_idx];
- int_mv *select_mv_arr = cpi->select_mv_arr;
- int_mv tmp_select_mv_arr[kMvPreCheckSize];
- int stride = tpl_frame->stride;
- double new_mv_rd = 0;
- double no_new_mv_rd = 0;
- double this_new_mv_rd = 0;
- double this_no_new_mv_rd = 0;
- int idx;
- int tmp_idx;
- assert(kMvPreCheckSize == (kMvPreCheckLines * (kMvPreCheckLines + 1)) >> 1);
-
- // no new mv
- // diagonal scan order
- tmp_idx = 0;
- for (idx = 0; idx < kMvPreCheckLines; ++idx) {
- int r;
- for (r = 0; r <= idx; ++r) {
- int c = idx - r;
- int nb_row = mi_row + r * mi_height;
- int nb_col = mi_col + c * mi_width;
- if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) {
- double this_rd;
- int_mv *mv = &select_mv_arr[nb_row * stride + nb_col];
- mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode(
- cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx,
- bsize, nb_row, nb_col, &this_rd, mv);
- if (r == 0 && c == 0) {
- this_no_new_mv_rd = this_rd;
- }
- no_new_mv_rd += this_rd;
- tmp_mv_mode_arr[tmp_idx] = mv_mode_arr[nb_row * stride + nb_col];
- tmp_select_mv_arr[tmp_idx] = select_mv_arr[nb_row * stride + nb_col];
- ++tmp_idx;
- }
- }
- }
-
- // new mv
- mv_mode_arr[mi_row * stride + mi_col] = NEW_MV_MODE;
- this_new_mv_rd = eval_mv_mode(
- NEW_MV_MODE, cpi, x, gf_picture, motion_field, frame_idx, tpl_frame,
- rf_idx, bsize, mi_row, mi_col, &select_mv_arr[mi_row * stride + mi_col]);
- new_mv_rd = this_new_mv_rd;
- // We start from idx = 1 because idx = 0 is evaluated as NEW_MV_MODE
- // beforehand.
- for (idx = 1; idx < kMvPreCheckLines; ++idx) {
- int r;
- for (r = 0; r <= idx; ++r) {
- int c = idx - r;
- int nb_row = mi_row + r * mi_height;
- int nb_col = mi_col + c * mi_width;
- if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) {
- double this_rd;
- int_mv *mv = &select_mv_arr[nb_row * stride + nb_col];
- mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode(
- cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx,
- bsize, nb_row, nb_col, &this_rd, mv);
- new_mv_rd += this_rd;
- }
- }
- }
-
- // update best_mv_mode
- tmp_idx = 0;
- if (no_new_mv_rd < new_mv_rd) {
- for (idx = 0; idx < kMvPreCheckLines; ++idx) {
- int r;
- for (r = 0; r <= idx; ++r) {
- int c = idx - r;
- int nb_row = mi_row + r * mi_height;
- int nb_col = mi_col + c * mi_width;
- if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) {
- mv_mode_arr[nb_row * stride + nb_col] = tmp_mv_mode_arr[tmp_idx];
- select_mv_arr[nb_row * stride + nb_col] = tmp_select_mv_arr[tmp_idx];
- ++tmp_idx;
- }
- }
- }
- rd_diff_arr[mi_row * stride + mi_col] = 0;
- } else {
- rd_diff_arr[mi_row * stride + mi_col] =
- (no_new_mv_rd - this_no_new_mv_rd) - (new_mv_rd - this_new_mv_rd);
- }
-}
-
-static void predict_mv_mode_arr(VP9_COMP *cpi, MACROBLOCK *x,
- GF_PICTURE *gf_picture,
- MotionField *motion_field, int frame_idx,
- TplDepFrame *tpl_frame, int rf_idx,
- BLOCK_SIZE bsize) {
- const int mi_height = num_8x8_blocks_high_lookup[bsize];
- const int mi_width = num_8x8_blocks_wide_lookup[bsize];
- const int unit_rows = tpl_frame->mi_rows / mi_height;
- const int unit_cols = tpl_frame->mi_cols / mi_width;
- const int max_diagonal_lines = unit_rows + unit_cols - 1;
- int idx;
- for (idx = 0; idx < max_diagonal_lines; ++idx) {
- int r;
- for (r = VPXMAX(idx - unit_cols + 1, 0); r <= VPXMIN(idx, unit_rows - 1);
- ++r) {
- int c = idx - r;
- int mi_row = r * mi_height;
- int mi_col = c * mi_width;
- assert(c >= 0 && c < unit_cols);
- assert(mi_row >= 0 && mi_row < tpl_frame->mi_rows);
- assert(mi_col >= 0 && mi_col < tpl_frame->mi_cols);
- predict_mv_mode(cpi, x, gf_picture, motion_field, frame_idx, tpl_frame,
- rf_idx, bsize, mi_row, mi_col);
- }
- }
-}
-
-static void do_motion_search(VP9_COMP *cpi, ThreadData *td,
- MotionField *motion_field, int frame_idx,
- YV12_BUFFER_CONFIG *ref_frame, BLOCK_SIZE bsize,
- int mi_row, int mi_col) {
- VP9_COMMON *cm = &cpi->common;
- MACROBLOCK *x = &td->mb;
- MACROBLOCKD *xd = &x->e_mbd;
- const int mb_y_offset =
- mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
- assert(ref_frame != NULL);
- set_mv_limits(cm, x, mi_row, mi_col);
- {
- int_mv mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col);
- uint8_t *cur_frame_buf = xd->cur_buf->y_buffer + mb_y_offset;
- uint8_t *ref_frame_buf = ref_frame->y_buffer + mb_y_offset;
- const int stride = xd->cur_buf->y_stride;
- full_pixel_motion_search(cpi, td, motion_field, frame_idx, cur_frame_buf,
- ref_frame_buf, stride, bsize, mi_row, mi_col,
- &mv.as_mv);
- sub_pixel_motion_search(cpi, td, cur_frame_buf, ref_frame_buf, stride,
- bsize, &mv.as_mv);
- vp9_motion_field_mi_set_mv(motion_field, mi_row, mi_col, mv);
- }
-}
-
-static void build_motion_field(
- VP9_COMP *cpi, int frame_idx,
- YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES], BLOCK_SIZE bsize) {
- VP9_COMMON *cm = &cpi->common;
- ThreadData *td = &cpi->td;
- TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
- const int mi_height = num_8x8_blocks_high_lookup[bsize];
- const int mi_width = num_8x8_blocks_wide_lookup[bsize];
- const int pw = num_4x4_blocks_wide_lookup[bsize] << 2;
- const int ph = num_4x4_blocks_high_lookup[bsize] << 2;
- int mi_row, mi_col;
- int rf_idx;
-
- tpl_frame->lambda = (pw * ph) >> 2;
- assert(pw * ph == tpl_frame->lambda << 2);
-
- for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
- MotionField *motion_field = vp9_motion_field_info_get_motion_field(
- &cpi->motion_field_info, frame_idx, rf_idx, bsize);
- if (ref_frame[rf_idx] == NULL) {
- continue;
- }
- vp9_motion_field_reset_mvs(motion_field);
- for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
- for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
- do_motion_search(cpi, td, motion_field, frame_idx, ref_frame[rf_idx],
- bsize, mi_row, mi_col);
- }
- }
- }
-}
-#endif // CONFIG_NON_GREEDY_MV
-
-static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture,
- int frame_idx, BLOCK_SIZE bsize) {
- TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
- YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame;
- YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES] = { NULL, NULL, NULL };
-
- VP9_COMMON *cm = &cpi->common;
- struct scale_factors sf;
- int rdmult, idx;
- ThreadData *td = &cpi->td;
- MACROBLOCK *x = &td->mb;
- MACROBLOCKD *xd = &x->e_mbd;
- int mi_row, mi_col;
-
-#if CONFIG_VP9_HIGHBITDEPTH
- DECLARE_ALIGNED(16, uint16_t, predictor16[32 * 32 * 3]);
- DECLARE_ALIGNED(16, uint8_t, predictor8[32 * 32 * 3]);
- uint8_t *predictor;
-#else
- DECLARE_ALIGNED(16, uint8_t, predictor[32 * 32 * 3]);
-#endif
- DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]);
- DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]);
- DECLARE_ALIGNED(16, tran_low_t, qcoeff[32 * 32]);
- DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
-
- const TX_SIZE tx_size = max_txsize_lookup[bsize];
- const int mi_height = num_8x8_blocks_high_lookup[bsize];
- const int mi_width = num_8x8_blocks_wide_lookup[bsize];
- int64_t recon_error, sse;
-#if CONFIG_NON_GREEDY_MV
- int square_block_idx;
- int rf_idx;
-#endif
-
- // Setup scaling factor
-#if CONFIG_VP9_HIGHBITDEPTH
- vp9_setup_scale_factors_for_frame(
- &sf, this_frame->y_crop_width, this_frame->y_crop_height,
- this_frame->y_crop_width, this_frame->y_crop_height,
- cpi->common.use_highbitdepth);
-
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
- predictor = CONVERT_TO_BYTEPTR(predictor16);
- else
- predictor = predictor8;
-#else
- vp9_setup_scale_factors_for_frame(
- &sf, this_frame->y_crop_width, this_frame->y_crop_height,
- this_frame->y_crop_width, this_frame->y_crop_height);
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
- // Prepare reference frame pointers. If any reference frame slot is
- // unavailable, the pointer will be set to NULL.
- for (idx = 0; idx < MAX_INTER_REF_FRAMES; ++idx) {
- int rf_idx = gf_picture[frame_idx].ref_frame[idx];
- if (rf_idx != -1) ref_frame[idx] = gf_picture[rf_idx].frame;
- }
-
- xd->mi = cm->mi_grid_visible;
- xd->mi[0] = cm->mi;
- xd->cur_buf = this_frame;
-
- // Get rd multiplier set up.
- rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, tpl_frame->base_qindex);
- set_error_per_bit(&cpi->td.mb, rdmult);
- vp9_initialize_me_consts(cpi, &cpi->td.mb, tpl_frame->base_qindex);
-
- tpl_frame->is_valid = 1;
-
- cm->base_qindex = tpl_frame->base_qindex;
- vp9_frame_init_quantizer(cpi);
-
-#if CONFIG_NON_GREEDY_MV
- for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES;
- ++square_block_idx) {
- BLOCK_SIZE square_bsize = square_block_idx_to_bsize(square_block_idx);
- build_motion_field(cpi, frame_idx, ref_frame, square_bsize);
- }
- for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
- int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx];
- if (ref_frame_idx != -1) {
- MotionField *motion_field = vp9_motion_field_info_get_motion_field(
- &cpi->motion_field_info, frame_idx, rf_idx, bsize);
- predict_mv_mode_arr(cpi, x, gf_picture, motion_field, frame_idx,
- tpl_frame, rf_idx, bsize);
- }
- }
-#endif
-
- for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
- for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
- mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame,
- src_diff, coeff, qcoeff, dqcoeff, mi_row, mi_col, bsize,
- tx_size, ref_frame, predictor, &recon_error, &sse);
- // Motion flow dependency dispenser.
- tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize,
- tpl_frame->stride);
-
- tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col,
- bsize);
- }
- }
-}
-
-#if CONFIG_NON_GREEDY_MV
-#define DUMP_TPL_STATS 0
-#if DUMP_TPL_STATS
-static void dump_buf(uint8_t *buf, int stride, int row, int col, int h, int w) {
- int i, j;
- printf("%d %d\n", h, w);
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; ++j) {
- printf("%d ", buf[(row + i) * stride + col + j]);
- }
- }
- printf("\n");
-}
-
-static void dump_frame_buf(const YV12_BUFFER_CONFIG *frame_buf) {
- dump_buf(frame_buf->y_buffer, frame_buf->y_stride, 0, 0, frame_buf->y_height,
- frame_buf->y_width);
- dump_buf(frame_buf->u_buffer, frame_buf->uv_stride, 0, 0,
- frame_buf->uv_height, frame_buf->uv_width);
- dump_buf(frame_buf->v_buffer, frame_buf->uv_stride, 0, 0,
- frame_buf->uv_height, frame_buf->uv_width);
-}
-
-static void dump_tpl_stats(const VP9_COMP *cpi, int tpl_group_frames,
- const GF_GROUP *gf_group,
- const GF_PICTURE *gf_picture, BLOCK_SIZE bsize) {
- int frame_idx;
- const VP9_COMMON *cm = &cpi->common;
- int rf_idx;
- for (frame_idx = 1; frame_idx < tpl_group_frames; ++frame_idx) {
- for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
- const TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
- int mi_row, mi_col;
- int ref_frame_idx;
- const int mi_height = num_8x8_blocks_high_lookup[bsize];
- const int mi_width = num_8x8_blocks_wide_lookup[bsize];
- ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx];
- if (ref_frame_idx != -1) {
- YV12_BUFFER_CONFIG *ref_frame_buf = gf_picture[ref_frame_idx].frame;
- const int gf_frame_offset = gf_group->frame_gop_index[frame_idx];
- const int ref_gf_frame_offset =
- gf_group->frame_gop_index[ref_frame_idx];
- printf("=\n");
- printf(
- "frame_idx %d mi_rows %d mi_cols %d bsize %d ref_frame_idx %d "
- "rf_idx %d gf_frame_offset %d ref_gf_frame_offset %d\n",
- frame_idx, cm->mi_rows, cm->mi_cols, mi_width * MI_SIZE,
- ref_frame_idx, rf_idx, gf_frame_offset, ref_gf_frame_offset);
- for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) {
- for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) {
- if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) {
- int_mv mv = vp9_motion_field_info_get_mv(&cpi->motion_field_info,
- frame_idx, rf_idx, bsize,
- mi_row, mi_col);
- printf("%d %d %d %d\n", mi_row, mi_col, mv.as_mv.row,
- mv.as_mv.col);
- }
- }
- }
- for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) {
- for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) {
- if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) {
- const TplDepStats *tpl_ptr =
- &tpl_frame
- ->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col];
- printf("%f ", tpl_ptr->feature_score);
- }
- }
- }
- printf("\n");
-
- for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
- for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
- const int mv_mode =
- tpl_frame
- ->mv_mode_arr[rf_idx][mi_row * tpl_frame->stride + mi_col];
- printf("%d ", mv_mode);
- }
- }
- printf("\n");
-
- dump_frame_buf(gf_picture[frame_idx].frame);
- dump_frame_buf(ref_frame_buf);
- }
- }
- }
-}
-#endif // DUMP_TPL_STATS
-#endif // CONFIG_NON_GREEDY_MV
-
-static void init_tpl_buffer(VP9_COMP *cpi) {
- VP9_COMMON *cm = &cpi->common;
- int frame;
-
- const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
- const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows);
-#if CONFIG_NON_GREEDY_MV
- int rf_idx;
-
- vpx_free(cpi->select_mv_arr);
- CHECK_MEM_ERROR(
- cm, cpi->select_mv_arr,
- vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->select_mv_arr)));
-#endif
-
- // TODO(jingning): Reduce the actual memory use for tpl model build up.
- for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) {
- if (cpi->tpl_stats[frame].width >= mi_cols &&
- cpi->tpl_stats[frame].height >= mi_rows &&
- cpi->tpl_stats[frame].tpl_stats_ptr)
- continue;
-
-#if CONFIG_NON_GREEDY_MV
- for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
- vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]);
- CHECK_MEM_ERROR(
- cm, cpi->tpl_stats[frame].mv_mode_arr[rf_idx],
- vpx_calloc(mi_rows * mi_cols * 4,
- sizeof(*cpi->tpl_stats[frame].mv_mode_arr[rf_idx])));
- vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]);
- CHECK_MEM_ERROR(
- cm, cpi->tpl_stats[frame].rd_diff_arr[rf_idx],
- vpx_calloc(mi_rows * mi_cols * 4,
- sizeof(*cpi->tpl_stats[frame].rd_diff_arr[rf_idx])));
- }
-#endif
- vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr);
- CHECK_MEM_ERROR(cm, cpi->tpl_stats[frame].tpl_stats_ptr,
- vpx_calloc(mi_rows * mi_cols,
- sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr)));
- cpi->tpl_stats[frame].is_valid = 0;
- cpi->tpl_stats[frame].width = mi_cols;
- cpi->tpl_stats[frame].height = mi_rows;
- cpi->tpl_stats[frame].stride = mi_cols;
- cpi->tpl_stats[frame].mi_rows = cm->mi_rows;
- cpi->tpl_stats[frame].mi_cols = cm->mi_cols;
- }
-
- for (frame = 0; frame < REF_FRAMES; ++frame) {
- cpi->enc_frame_buf[frame].mem_valid = 0;
- cpi->enc_frame_buf[frame].released = 1;
- }
-}
-
-static void free_tpl_buffer(VP9_COMP *cpi) {
- int frame;
-#if CONFIG_NON_GREEDY_MV
- vp9_free_motion_field_info(&cpi->motion_field_info);
- vpx_free(cpi->select_mv_arr);
-#endif
- for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) {
-#if CONFIG_NON_GREEDY_MV
- int rf_idx;
- for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
- vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]);
- vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]);
- }
-#endif
- vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr);
- cpi->tpl_stats[frame].is_valid = 0;
- }
-}
-
-#if CONFIG_RATE_CTRL
-static void accumulate_frame_tpl_stats(VP9_COMP *cpi) {
- VP9_COMMON *const cm = &cpi->common;
- const GF_GROUP *gf_group = &cpi->twopass.gf_group;
- int show_frame_count = 0;
- int frame_idx;
- // Accumulate tpl stats for each frame in the current group of pictures.
- for (frame_idx = 1; frame_idx < gf_group->gf_group_size; ++frame_idx) {
- TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
- TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
- const int tpl_stride = tpl_frame->stride;
- int64_t intra_cost_base = 0;
- int64_t inter_cost_base = 0;
- int64_t mc_dep_cost_base = 0;
- int64_t mc_ref_cost_base = 0;
- int64_t mc_flow_base = 0;
- int row, col;
-
- if (!tpl_frame->is_valid) continue;
-
- for (row = 0; row < cm->mi_rows && tpl_frame->is_valid; ++row) {
- for (col = 0; col < cm->mi_cols; ++col) {
- TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col];
- intra_cost_base += this_stats->intra_cost;
- inter_cost_base += this_stats->inter_cost;
- mc_dep_cost_base += this_stats->mc_dep_cost;
- mc_ref_cost_base += this_stats->mc_ref_cost;
- mc_flow_base += this_stats->mc_flow;
- }
- }
-
- cpi->tpl_stats_info[show_frame_count].intra_cost = intra_cost_base;
- cpi->tpl_stats_info[show_frame_count].inter_cost = inter_cost_base;
- cpi->tpl_stats_info[show_frame_count].mc_dep_cost = mc_dep_cost_base;
- cpi->tpl_stats_info[show_frame_count].mc_ref_cost = mc_ref_cost_base;
- cpi->tpl_stats_info[show_frame_count].mc_flow = mc_flow_base;
-
- ++show_frame_count;
- }
-}
-#endif // CONFIG_RATE_CTRL
-
-static void setup_tpl_stats(VP9_COMP *cpi) {
- GF_PICTURE gf_picture[MAX_ARF_GOP_SIZE];
- const GF_GROUP *gf_group = &cpi->twopass.gf_group;
- int tpl_group_frames = 0;
- int frame_idx;
- cpi->tpl_bsize = BLOCK_32X32;
-
- init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames);
-
- init_tpl_stats(cpi);
-
- // Backward propagation from tpl_group_frames to 1.
- for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx) {
- if (gf_picture[frame_idx].update_type == USE_BUF_FRAME) continue;
- mc_flow_dispenser(cpi, gf_picture, frame_idx, cpi->tpl_bsize);
- }
-#if CONFIG_NON_GREEDY_MV
- cpi->tpl_ready = 1;
-#if DUMP_TPL_STATS
- dump_tpl_stats(cpi, tpl_group_frames, gf_group, gf_picture, cpi->tpl_bsize);
-#endif // DUMP_TPL_STATS
-#endif // CONFIG_NON_GREEDY_MV
-
-#if CONFIG_RATE_CTRL
- if (cpi->oxcf.use_simple_encode_api) {
- accumulate_frame_tpl_stats(cpi);
- }
-#endif // CONFIG_RATE_CTRL
-}
-
void vp9_get_ref_frame_info(FRAME_UPDATE_TYPE update_type, int ref_frame_flags,
RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES],
int *ref_frame_coding_indexes,
@@ -7663,6 +6373,10 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
const int gf_group_index = cpi->twopass.gf_group.index;
int i;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ if (oxcf->pass == 2) start_timing(cpi, vp9_get_compressed_data_time);
+#endif
+
if (is_one_pass_svc(cpi)) {
vp9_one_pass_svc_start_layer(cpi);
}
@@ -7727,9 +6441,15 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
int not_last_frame = (cpi->lookahead->sz - arf_src_index > 1);
not_last_frame |= ALT_REF_AQ_APPLY_TO_LAST_FRAME;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, vp9_temporal_filter_time);
+#endif
// Produce the filtered ARF frame.
vp9_temporal_filter(cpi, arf_src_index);
vpx_extend_frame_borders(&cpi->alt_ref_buffer);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, vp9_temporal_filter_time);
+#endif
// for small bitrates segmentation overhead usually
// eats all bitrate gain from enabling delta quantizers
@@ -7843,7 +6563,13 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
#if !CONFIG_REALTIME_ONLY
if ((oxcf->pass == 2) && !cpi->use_svc) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, vp9_rc_get_second_pass_params_time);
+#endif
vp9_rc_get_second_pass_params(cpi);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, vp9_rc_get_second_pass_params_time);
+#endif
} else if (oxcf->pass == 1) {
set_frame_size(cpi);
}
@@ -7864,7 +6590,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
pthread_mutex_init(&cpi->kmeans_mutex, NULL);
#endif
CHECK_MEM_ERROR(
- cm, cpi->kmeans_data_arr,
+ &cm->error, cpi->kmeans_data_arr,
vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->kmeans_data_arr)));
cpi->kmeans_data_stride = mi_cols;
cpi->kmeans_data_arr_alloc = 1;
@@ -7883,13 +6609,19 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
}
#endif // CONFIG_NON_GREEDY_MV
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, setup_tpl_stats_time);
+#endif
if (gf_group_index == 1 &&
cpi->twopass.gf_group.update_type[gf_group_index] == ARF_UPDATE &&
cpi->sf.enable_tpl_model) {
- init_tpl_buffer(cpi);
+ vp9_init_tpl_buffer(cpi);
vp9_estimate_qp_gop(cpi);
- setup_tpl_stats(cpi);
+ vp9_setup_tpl_stats(cpi);
}
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, setup_tpl_stats_time);
+#endif
#if CONFIG_BITSTREAM_DEBUG
assert(cpi->oxcf.max_threads == 0 &&
@@ -7926,8 +6658,15 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
cpi->td.mb.inv_txfm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
vp9_first_pass(cpi, source);
} else if (oxcf->pass == 2 && !cpi->use_svc) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ // Accumulate 2nd pass time in 2-pass case.
+ start_timing(cpi, Pass2Encode_time);
+#endif
Pass2Encode(cpi, size, dest, frame_flags, encode_frame_result);
vp9_twopass_postencode_update(cpi);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, Pass2Encode_time);
+#endif
} else if (cpi->use_svc) {
SvcEncode(cpi, size, dest, frame_flags);
} else {
@@ -8130,6 +6869,41 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
#endif
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ if (oxcf->pass == 2) end_timing(cpi, vp9_get_compressed_data_time);
+
+ // Print out timing information.
+ // Note: Use "cpi->frame_component_time[0] > 100 us" to avoid printing
+ // stats for show_existing_frame and lag-in-frames.
+ // if (cpi->frame_component_time[0] > 100)
+ if (oxcf->pass == 2) {
+ uint64_t frame_total = 0, total = 0;
+ int i;
+
+ fprintf(stderr,
+ "\n Frame number: %d, Frame type: %s, Show Frame: %d, Q: %d\n",
+ cm->current_video_frame, get_frame_type_enum(cm->frame_type),
+ cm->show_frame, cm->base_qindex);
+ for (i = 0; i < kTimingComponents; i++) {
+ cpi->component_time[i] += cpi->frame_component_time[i];
+ // Use vp9_get_compressed_data_time (i = 0) as the total time.
+ if (i == 0) {
+ frame_total = cpi->frame_component_time[0];
+ total = cpi->component_time[0];
+ }
+ fprintf(stderr,
+ " %50s: %15" PRId64 " us [%6.2f%%] (total: %15" PRId64
+ " us [%6.2f%%])\n",
+ get_component_name(i), cpi->frame_component_time[i],
+ (float)((float)cpi->frame_component_time[i] * 100.0 /
+ (float)frame_total),
+ cpi->component_time[i],
+ (float)((float)cpi->component_time[i] * 100.0 / (float)total));
+ cpi->frame_component_time[i] = 0;
+ }
+ }
+#endif
+
if (is_one_pass_svc(cpi)) {
if (cm->show_frame) {
++cpi->svc.spatial_layer_to_encode;
@@ -8172,12 +6946,12 @@ int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest,
}
}
-int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING horiz_mode,
- VPX_SCALING vert_mode) {
+int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING_MODE horiz_mode,
+ VPX_SCALING_MODE vert_mode) {
VP9_COMMON *cm = &cpi->common;
int hr = 0, hs = 0, vr = 0, vs = 0;
- if (horiz_mode > ONETWO || vert_mode > ONETWO) return -1;
+ if (horiz_mode > VP8E_ONETWO || vert_mode > VP8E_ONETWO) return -1;
Scale2Ratio(horiz_mode, &hr, &hs);
Scale2Ratio(vert_mode, &vr, &vs);
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index cca8b53f8..7136f7faa 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -14,9 +14,11 @@
#include <stdio.h>
#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
#include "vpx/internal/vpx_codec_internal.h"
#include "vpx/vpx_ext_ratectrl.h"
#include "vpx/vp8cx.h"
+#include "vpx/vpx_tpl.h"
#if CONFIG_INTERNAL_STATS
#include "vpx_dsp/ssim.h"
#endif
@@ -91,13 +93,6 @@ typedef enum {
} ENCODE_BREAKOUT_TYPE;
typedef enum {
- NORMAL = 0,
- FOURFIVE = 1,
- THREEFIVE = 2,
- ONETWO = 3
-} VPX_SCALING;
-
-typedef enum {
// Good Quality Fast Encoding. The encoder balances quality with the amount of
// time it takes to encode the output. Speed setting controls how fast.
GOOD,
@@ -336,15 +331,14 @@ typedef struct TplDepFrame {
typedef struct TileDataEnc {
TileInfo tile_info;
int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
-#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
int thresh_freq_fact_prev[BLOCK_SIZES][MAX_MODES];
-#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
int8_t mode_map[BLOCK_SIZES][MAX_MODES];
FIRSTPASS_DATA fp_data;
VP9RowMTSync row_mt_sync;
// Used for adaptive_rd_thresh with row multithreading
int *row_base_thresh_freq_fact;
+ MV firstpass_top_mv;
} TileDataEnc;
typedef struct RowMTInfo {
@@ -513,6 +507,7 @@ typedef struct EncFrameBuf {
} EncFrameBuf;
// Maximum operating frame buffer size needed for a GOP using ARF reference.
+// This is used to allocate the memory for TPL stats for a GOP.
#define MAX_ARF_GOP_SIZE (2 * MAX_LAG_BUFFERS)
#define MAX_KMEANS_GROUPS 8
@@ -659,6 +654,72 @@ static INLINE int get_num_unit_4x4(int size) { return (size + 3) >> 2; }
static INLINE int get_num_unit_16x16(int size) { return (size + 15) >> 4; }
#endif // CONFIG_RATE_CTRL
+#if CONFIG_COLLECT_COMPONENT_TIMING
+#include "vpx_ports/vpx_timer.h"
+// Adjust the following to add new components.
+typedef enum {
+ vp9_get_compressed_data_time,
+ vp9_temporal_filter_time,
+ vp9_rc_get_second_pass_params_time,
+ setup_tpl_stats_time,
+ Pass2Encode_time,
+
+ encode_with_recode_loop_time,
+ loopfilter_frame_time,
+ vp9_pack_bitstream_time,
+
+ encode_frame_internal_time,
+ rd_pick_partition_time,
+ rd_pick_sb_modes_time,
+ encode_sb_time,
+
+ vp9_rd_pick_inter_mode_sb_time,
+ vp9_rd_pick_inter_mode_sub8x8_time,
+
+ intra_mode_search_time,
+ handle_inter_mode_time,
+ single_motion_search_time,
+ joint_motion_search_time,
+ interp_filter_time,
+
+ kTimingComponents,
+} TIMING_COMPONENT;
+
+static INLINE char const *get_component_name(int index) {
+ switch (index) {
+ case vp9_get_compressed_data_time: return "vp9_get_compressed_data_time";
+ case vp9_temporal_filter_time: return "vp9_temporal_filter_time";
+ case vp9_rc_get_second_pass_params_time:
+ return "vp9_rc_get_second_pass_params_time";
+ case setup_tpl_stats_time: return "setup_tpl_stats_time";
+ case Pass2Encode_time: return "Pass2Encode_time";
+
+ case encode_with_recode_loop_time: return "encode_with_recode_loop_time";
+ case loopfilter_frame_time: return "loopfilter_frame_time";
+ case vp9_pack_bitstream_time: return "vp9_pack_bitstream_time";
+
+ case encode_frame_internal_time: return "encode_frame_internal_time";
+ case rd_pick_partition_time: return "rd_pick_partition_time";
+ case rd_pick_sb_modes_time: return "rd_pick_sb_modes_time";
+ case encode_sb_time: return "encode_sb_time";
+
+ case vp9_rd_pick_inter_mode_sb_time:
+ return "vp9_rd_pick_inter_mode_sb_time";
+ case vp9_rd_pick_inter_mode_sub8x8_time:
+ return "vp9_rd_pick_inter_mode_sub8x8_time";
+
+ case intra_mode_search_time: return "intra_mode_search_time";
+ case handle_inter_mode_time: return "handle_inter_mode_time";
+ case single_motion_search_time: return "single_motion_search_time";
+ case joint_motion_search_time: return "joint_motion_search_time";
+ case interp_filter_time: return "interp_filter_time";
+
+ default: assert(0);
+ }
+ return "error";
+}
+#endif
+
typedef struct VP9_COMP {
FRAME_INFO frame_info;
QUANTS quants;
@@ -685,6 +746,8 @@ typedef struct VP9_COMP {
BLOCK_SIZE tpl_bsize;
TplDepFrame tpl_stats[MAX_ARF_GOP_SIZE];
+ // Used to store TPL stats before propagation
+ VpxTplGopStats tpl_gop_stats;
YV12_BUFFER_CONFIG *tpl_recon_frames[REF_FRAMES];
EncFrameBuf enc_frame_buf[REF_FRAMES];
#if CONFIG_MULTITHREAD
@@ -784,7 +847,7 @@ typedef struct VP9_COMP {
uint8_t *skin_map;
- // segment threashold for encode breakout
+ // segment threshold for encode breakout
int segment_encode_breakout[MAX_SEGMENTS];
CYCLIC_REFRESH *cyclic_refresh;
@@ -858,12 +921,15 @@ typedef struct VP9_COMP {
// number of MBs in the current frame when the frame is
// scaled.
+ int last_coded_width;
+ int last_coded_height;
+
int use_svc;
SVC svc;
// Store frame variance info in SOURCE_VAR_BASED_PARTITION search type.
- diff *source_diff_var;
+ Diff *source_diff_var;
// The threshold used in SOURCE_VAR_BASED_PARTITION search type.
unsigned int source_var_thresh;
int frames_till_next_var_check;
@@ -973,6 +1039,29 @@ typedef struct VP9_COMP {
EXT_RATECTRL ext_ratectrl;
int fixed_qp_onepass;
+
+ // Flag to keep track of dynamic change in deadline mode
+ // (good/best/realtime).
+ MODE deadline_mode_previous_frame;
+
+ // Flag to disable scene detection when rtc rate control library is used.
+ int disable_scene_detection_rtc_ratectrl;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ /*!
+ * component_time[] is initialized to zero when the encoder starts.
+ */
+ uint64_t component_time[kTimingComponents];
+ /*!
+ * Stores timing for individual components between calls of start_timing()
+ * and end_timing().
+ */
+ struct vpx_usec_timer component_timer[kTimingComponents];
+ /*!
+ * frame_component_time[] is initialized to zero at the beginning of each frame.
+ */
+ uint64_t frame_component_time[kTimingComponents];
+#endif
} VP9_COMP;
#if CONFIG_RATE_CTRL
@@ -983,7 +1072,7 @@ static INLINE void partition_info_init(struct VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
const int unit_width = get_num_unit_4x4(cpi->frame_info.frame_width);
const int unit_height = get_num_unit_4x4(cpi->frame_info.frame_height);
- CHECK_MEM_ERROR(cm, cpi->partition_info,
+ CHECK_MEM_ERROR(&cm->error, cpi->partition_info,
(PARTITION_INFO *)vpx_calloc(unit_width * unit_height,
sizeof(PARTITION_INFO)));
memset(cpi->partition_info, 0,
@@ -998,8 +1087,8 @@ static INLINE void free_partition_info(struct VP9_COMP *cpi) {
}
static INLINE void reset_mv_info(MOTION_VECTOR_INFO *mv_info) {
- mv_info->ref_frame[0] = NONE;
- mv_info->ref_frame[1] = NONE;
+ mv_info->ref_frame[0] = NO_REF_FRAME;
+ mv_info->ref_frame[1] = NO_REF_FRAME;
mv_info->mv[0].as_int = INVALID_MV;
mv_info->mv[1].as_int = INVALID_MV;
}
@@ -1011,7 +1100,7 @@ static INLINE void motion_vector_info_init(struct VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
const int unit_width = get_num_unit_4x4(cpi->frame_info.frame_width);
const int unit_height = get_num_unit_4x4(cpi->frame_info.frame_height);
- CHECK_MEM_ERROR(cm, cpi->motion_vector_info,
+ CHECK_MEM_ERROR(&cm->error, cpi->motion_vector_info,
(MOTION_VECTOR_INFO *)vpx_calloc(unit_width * unit_height,
sizeof(MOTION_VECTOR_INFO)));
memset(cpi->motion_vector_info, 0,
@@ -1030,7 +1119,7 @@ static INLINE void free_motion_vector_info(struct VP9_COMP *cpi) {
static INLINE void tpl_stats_info_init(struct VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
CHECK_MEM_ERROR(
- cm, cpi->tpl_stats_info,
+ &cm->error, cpi->tpl_stats_info,
(TplDepStats *)vpx_calloc(MAX_LAG_BUFFERS, sizeof(TplDepStats)));
memset(cpi->tpl_stats_info, 0, MAX_LAG_BUFFERS * sizeof(TplDepStats));
}
@@ -1049,7 +1138,7 @@ static INLINE void fp_motion_vector_info_init(struct VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
const int unit_width = get_num_unit_16x16(cpi->frame_info.frame_width);
const int unit_height = get_num_unit_16x16(cpi->frame_info.frame_height);
- CHECK_MEM_ERROR(cm, cpi->fp_motion_vector_info,
+ CHECK_MEM_ERROR(&cm->error, cpi->fp_motion_vector_info,
(MOTION_VECTOR_INFO *)vpx_calloc(unit_width * unit_height,
sizeof(MOTION_VECTOR_INFO)));
}
@@ -1154,8 +1243,8 @@ int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows,
int vp9_get_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows,
int cols);
-int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING horiz_mode,
- VPX_SCALING vert_mode);
+int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING_MODE horiz_mode,
+ VPX_SCALING_MODE vert_mode);
int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width,
unsigned int height);
@@ -1296,6 +1385,14 @@ void vp9_get_ref_frame_info(FRAME_UPDATE_TYPE update_type, int ref_frame_flags,
void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv);
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst, int bd);
+#else
+void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(
VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type,
@@ -1380,9 +1477,10 @@ static INLINE int log_tile_cols_from_picsize_level(uint32_t width,
VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec);
-int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows,
- unsigned int cols, int delta_q[8], int delta_lf[8],
- int skip[8], int ref_frame[8]);
+vpx_codec_err_t vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map,
+ unsigned int rows, unsigned int cols,
+ int delta_q[8], int delta_lf[8], int skip[8],
+ int ref_frame[8]);
void vp9_new_framerate(VP9_COMP *cpi, double framerate);
@@ -1392,6 +1490,171 @@ int vp9_get_psnr(const VP9_COMP *cpi, PSNR_STATS *psnr);
#define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))
+static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) {
+ RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx];
+ if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows ||
+ new_fb_ptr->mi_cols < cm->mi_cols) {
+ vpx_free(new_fb_ptr->mvs);
+ CHECK_MEM_ERROR(&cm->error, new_fb_ptr->mvs,
+ (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
+ sizeof(*new_fb_ptr->mvs)));
+ new_fb_ptr->mi_rows = cm->mi_rows;
+ new_fb_ptr->mi_cols = cm->mi_cols;
+ }
+}
+
+static INLINE int mv_cost(const MV *mv, const int *joint_cost,
+ int *const comp_cost[2]) {
+ assert(mv->row >= -MV_MAX && mv->row < MV_MAX);
+ assert(mv->col >= -MV_MAX && mv->col < MV_MAX);
+ return joint_cost[vp9_get_mv_joint(mv)] + comp_cost[0][mv->row] +
+ comp_cost[1][mv->col];
+}
+
+static INLINE int mvsad_err_cost(const MACROBLOCK *x, const MV *mv,
+ const MV *ref, int sad_per_bit) {
+ MV diff;
+ diff.row = mv->row - ref->row;
+ diff.col = mv->col - ref->col;
+ return ROUND_POWER_OF_TWO(
+ (unsigned)mv_cost(&diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit,
+ VP9_PROB_COST_SHIFT);
+}
+
+static INLINE uint32_t get_start_mv_sad(const MACROBLOCK *x, const MV *mvp_full,
+ const MV *ref_mv_full,
+ vpx_sad_fn_t sad_fn_ptr, int sadpb) {
+ const int src_buf_stride = x->plane[0].src.stride;
+ const uint8_t *const src_buf = x->plane[0].src.buf;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const int pred_buf_stride = xd->plane[0].pre[0].stride;
+ const uint8_t *const pred_buf =
+ xd->plane[0].pre[0].buf + mvp_full->row * pred_buf_stride + mvp_full->col;
+ uint32_t start_mv_sad =
+ sad_fn_ptr(src_buf, src_buf_stride, pred_buf, pred_buf_stride);
+ start_mv_sad += mvsad_err_cost(x, mvp_full, ref_mv_full, sadpb);
+
+ return start_mv_sad;
+}
+
+static INLINE int num_4x4_to_edge(int plane_4x4_dim, int mb_to_edge_dim,
+ int subsampling_dim, int blk_dim) {
+ return plane_4x4_dim + (mb_to_edge_dim >> (5 + subsampling_dim)) - blk_dim;
+}
+
+// Compute the sum of squares on all visible 4x4s in the transform block.
+static int64_t sum_squares_visible(const MACROBLOCKD *xd,
+ const struct macroblockd_plane *const pd,
+ const int16_t *diff, const int diff_stride,
+ int blk_row, int blk_col,
+ const BLOCK_SIZE plane_bsize,
+ const BLOCK_SIZE tx_bsize,
+ int *visible_width, int *visible_height) {
+ int64_t sse;
+ const int plane_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+ const int plane_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+ const int tx_4x4_w = num_4x4_blocks_wide_lookup[tx_bsize];
+ const int tx_4x4_h = num_4x4_blocks_high_lookup[tx_bsize];
+ const int b4x4s_to_right_edge = num_4x4_to_edge(
+ plane_4x4_w, xd->mb_to_right_edge, pd->subsampling_x, blk_col);
+ const int b4x4s_to_bottom_edge = num_4x4_to_edge(
+ plane_4x4_h, xd->mb_to_bottom_edge, pd->subsampling_y, blk_row);
+ if (tx_bsize == BLOCK_4X4 ||
+ (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) {
+ assert(tx_4x4_w == tx_4x4_h);
+ sse = (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, tx_4x4_w << 2);
+ *visible_width = tx_4x4_w << 2;
+ *visible_height = tx_4x4_h << 2;
+ } else {
+ int r, c;
+ const int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h);
+ const int max_c = VPXMIN(b4x4s_to_right_edge, tx_4x4_w);
+ sse = 0;
+ // The block extends into the unrestricted motion vector (UMV) border.
+ for (r = 0; r < max_r; ++r) {
+ // Skip visiting the sub blocks that are wholly within the UMV.
+ for (c = 0; c < max_c; ++c) {
+ sse += (int64_t)vpx_sum_squares_2d_i16(
+ diff + r * diff_stride * 4 + c * 4, diff_stride, 4);
+ }
+ }
+ *visible_width = max_c << 2;
+ *visible_height = max_r << 2;
+ }
+ return sse;
+}
+
+// Check if trellis coefficient optimization of the transform block is enabled.
+static INLINE int do_trellis_opt(const struct macroblockd_plane *pd,
+ const int16_t *src_diff, int diff_stride,
+ int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg) {
+ const struct encode_b_args *const args = (struct encode_b_args *)arg;
+ const MACROBLOCK *const x = args->x;
+
+ switch (args->enable_trellis_opt) {
+ case DISABLE_TRELLIS_OPT: return 0;
+ case ENABLE_TRELLIS_OPT: return 1;
+ case ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR: {
+ vpx_clear_system_state();
+
+ return (args->trellis_opt_thresh > 0.0)
+ ? (x->log_block_src_var <= args->trellis_opt_thresh)
+ : 1;
+ }
+ case ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE: {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int dequant_shift =
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+#else
+ const int dequant_shift = 3;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ const int qstep = pd->dequant[1] >> dequant_shift;
+ int *sse_calc_done = args->sse_calc_done;
+ int64_t *sse = args->sse;
+ int visible_width = 0, visible_height = 0;
+
+ // TODO: Enable the sf for high bit-depth case
+ if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) || !sse ||
+ !sse_calc_done)
+ return 1;
+
+ *sse = sum_squares_visible(xd, pd, src_diff, diff_stride, blk_row,
+ blk_col, plane_bsize, tx_bsize, &visible_width,
+ &visible_height);
+ *sse_calc_done = 1;
+
+ vpx_clear_system_state();
+
+ return (*(sse) <= (int64_t)visible_width * visible_height * qstep *
+ qstep * args->trellis_opt_thresh);
+ }
+ default: assert(0 && "Invalid trellis optimization method."); return 1;
+ }
+}
+
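For the ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE case above, trellis optimization is attempted only when the visible-pixel SSE stays within visible_width * visible_height * qstep^2 * trellis_opt_thresh, i.e. when the mean squared residual is no more than trellis_opt_thresh quantization-step-squared units per pixel. With hypothetical values of a fully visible 8x8 block, qstep = 16 and trellis_opt_thresh = 0.5, the cutoff is 8 * 8 * 256 * 0.5 = 8192.
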
+#if CONFIG_COLLECT_COMPONENT_TIMING
+static INLINE void start_timing(VP9_COMP *cpi, int component) {
+ vpx_usec_timer_start(&cpi->component_timer[component]);
+}
+static INLINE void end_timing(VP9_COMP *cpi, int component) {
+ vpx_usec_timer_mark(&cpi->component_timer[component]);
+ cpi->frame_component_time[component] +=
+ vpx_usec_timer_elapsed(&cpi->component_timer[component]);
+}
+static INLINE char const *get_frame_type_enum(int type) {
+ switch (type) {
+ case 0: return "KEY_FRAME";
+ case 1: return "INTER_FRAME";
+ default: assert(0);
+ }
+ return "error";
+}
+#endif
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c
index 453fe2e0d..681996d33 100644
--- a/vp9/encoder/vp9_ethread.c
+++ b/vp9/encoder/vp9_ethread.c
@@ -94,10 +94,10 @@ static void create_enc_workers(VP9_COMP *cpi, int num_workers) {
vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
vp9_encode_free_mt_data(cpi);
- CHECK_MEM_ERROR(cm, cpi->workers,
+ CHECK_MEM_ERROR(&cm->error, cpi->workers,
vpx_malloc(num_workers * sizeof(*cpi->workers)));
- CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
+ CHECK_MEM_ERROR(&cm->error, cpi->tile_thr_data,
vpx_calloc(num_workers, sizeof(*cpi->tile_thr_data)));
for (i = 0; i < num_workers; i++) {
@@ -111,7 +111,7 @@ static void create_enc_workers(VP9_COMP *cpi, int num_workers) {
thread_data->cpi = cpi;
// Allocate thread data.
- CHECK_MEM_ERROR(cm, thread_data->td,
+ CHECK_MEM_ERROR(&cm->error, thread_data->td,
vpx_memalign(32, sizeof(*thread_data->td)));
vp9_zero(*thread_data->td);
@@ -121,7 +121,7 @@ static void create_enc_workers(VP9_COMP *cpi, int num_workers) {
vp9_setup_pc_tree(cm, thread_data->td);
// Allocate frame counters in thread data.
- CHECK_MEM_ERROR(cm, thread_data->td->counts,
+ CHECK_MEM_ERROR(&cm->error, thread_data->td->counts,
vpx_calloc(1, sizeof(*thread_data->td->counts)));
// Create threads
@@ -265,6 +265,7 @@ static void accumulate_fp_tile_stat(TileDataEnc *tile_data,
tile_data->fp_data.intra_count_high += tile_data_t->fp_data.intra_count_high;
tile_data->fp_data.intra_skip_count += tile_data_t->fp_data.intra_skip_count;
tile_data->fp_data.mvcount += tile_data_t->fp_data.mvcount;
+ tile_data->fp_data.new_mv_count += tile_data_t->fp_data.new_mv_count;
tile_data->fp_data.sum_mvr += tile_data_t->fp_data.sum_mvr;
tile_data->fp_data.sum_mvr_abs += tile_data_t->fp_data.sum_mvr_abs;
tile_data->fp_data.sum_mvc += tile_data_t->fp_data.sum_mvc;
@@ -292,7 +293,7 @@ void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm,
{
int i;
- CHECK_MEM_ERROR(cm, row_mt_sync->mutex,
+ CHECK_MEM_ERROR(&cm->error, row_mt_sync->mutex,
vpx_malloc(sizeof(*row_mt_sync->mutex) * rows));
if (row_mt_sync->mutex) {
for (i = 0; i < rows; ++i) {
@@ -300,7 +301,7 @@ void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm,
}
}
- CHECK_MEM_ERROR(cm, row_mt_sync->cond,
+ CHECK_MEM_ERROR(&cm->error, row_mt_sync->cond,
vpx_malloc(sizeof(*row_mt_sync->cond) * rows));
if (row_mt_sync->cond) {
for (i = 0; i < rows; ++i) {
@@ -310,7 +311,7 @@ void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm,
}
#endif // CONFIG_MULTITHREAD
- CHECK_MEM_ERROR(cm, row_mt_sync->cur_col,
+ CHECK_MEM_ERROR(&cm->error, row_mt_sync->cur_col,
vpx_malloc(sizeof(*row_mt_sync->cur_col) * rows));
// Set up nsync.
diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c
index 1d440442b..4664e8c5e 100644
--- a/vp9/encoder/vp9_ext_ratectrl.c
+++ b/vp9/encoder/vp9_ext_ratectrl.c
@@ -8,10 +8,15 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <stddef.h>
+
#include "vp9/encoder/vp9_ext_ratectrl.h"
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/common/vp9_common.h"
#include "vpx_dsp/psnr.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_ext_ratectrl.h"
+#include "vpx/vpx_tpl.h"
vpx_codec_err_t vp9_extrc_init(EXT_RATECTRL *ext_ratectrl) {
if (ext_ratectrl == NULL) {
@@ -92,6 +97,7 @@ static void gen_rc_firstpass_stats(const FIRSTPASS_STATS *stats,
rc_frame_stats->mv_in_out_count = stats->mv_in_out_count;
rc_frame_stats->duration = stats->duration;
rc_frame_stats->count = stats->count;
+ rc_frame_stats->new_mv_count = stats->new_mv_count;
}
vpx_codec_err_t vp9_extrc_send_firstpass_stats(
@@ -118,6 +124,21 @@ vpx_codec_err_t vp9_extrc_send_firstpass_stats(
return VPX_CODEC_OK;
}
+vpx_codec_err_t vp9_extrc_send_tpl_stats(EXT_RATECTRL *ext_ratectrl,
+ const VpxTplGopStats *tpl_gop_stats) {
+ if (ext_ratectrl == NULL) {
+ return VPX_CODEC_INVALID_PARAM;
+ }
+ if (ext_ratectrl->ready && ext_ratectrl->funcs.send_tpl_gop_stats != NULL) {
+ vpx_rc_status_t rc_status = ext_ratectrl->funcs.send_tpl_gop_stats(
+ ext_ratectrl->model, tpl_gop_stats);
+ if (rc_status == VPX_RC_ERROR) {
+ return VPX_CODEC_ERROR;
+ }
+ }
+ return VPX_CODEC_OK;
+}
+
static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) {
// TODO(angiebird): Add unit test to make sure this function behaves like
// get_frame_type_from_update_type()
@@ -131,7 +152,6 @@ static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) {
default:
fprintf(stderr, "Unsupported update_type %d\n", update_type);
abort();
- return 1;
}
}
diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h
index 7c3875883..b04580c1d 100644
--- a/vp9/encoder/vp9_ext_ratectrl.h
+++ b/vp9/encoder/vp9_ext_ratectrl.h
@@ -12,6 +12,7 @@
#define VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_
#include "vpx/vpx_ext_ratectrl.h"
+#include "vpx/vpx_tpl.h"
#include "vp9/encoder/vp9_firstpass.h"
typedef struct EXT_RATECTRL {
@@ -34,6 +35,9 @@ vpx_codec_err_t vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl);
vpx_codec_err_t vp9_extrc_send_firstpass_stats(
EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info);
+vpx_codec_err_t vp9_extrc_send_tpl_stats(EXT_RATECTRL *ext_ratectrl,
+ const VpxTplGopStats *tpl_gop_stats);
+
vpx_codec_err_t vp9_extrc_get_encodeframe_decision(
EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref,
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index e9250e25c..a9cdf5353 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -152,6 +152,7 @@ static void zero_stats(FIRSTPASS_STATS *section) {
section->pcnt_intra_high = 0.0;
section->inactive_zone_rows = 0.0;
section->inactive_zone_cols = 0.0;
+ section->new_mv_count = 0.0;
section->MVr = 0.0;
section->mvr_abs = 0.0;
section->MVc = 0.0;
@@ -183,6 +184,7 @@ static void accumulate_stats(FIRSTPASS_STATS *section,
section->pcnt_intra_high += frame->pcnt_intra_high;
section->inactive_zone_rows += frame->inactive_zone_rows;
section->inactive_zone_cols += frame->inactive_zone_cols;
+ section->new_mv_count += frame->new_mv_count;
section->MVr += frame->MVr;
section->mvr_abs += frame->mvr_abs;
section->MVc += frame->MVc;
@@ -212,6 +214,7 @@ static void subtract_stats(FIRSTPASS_STATS *section,
section->pcnt_intra_high -= frame->pcnt_intra_high;
section->inactive_zone_rows -= frame->inactive_zone_rows;
section->inactive_zone_cols -= frame->inactive_zone_cols;
+ section->new_mv_count -= frame->new_mv_count;
section->MVr -= frame->MVr;
section->mvr_abs -= frame->mvr_abs;
section->MVc -= frame->MVc;
@@ -361,7 +364,6 @@ static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
case BLOCK_8X16: return vpx_highbd_8_mse8x16;
default: return vpx_highbd_8_mse16x16;
}
- break;
case 10:
switch (bsize) {
case BLOCK_8X8: return vpx_highbd_10_mse8x8;
@@ -369,7 +371,6 @@ static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
case BLOCK_8X16: return vpx_highbd_10_mse8x16;
default: return vpx_highbd_10_mse16x16;
}
- break;
case 12:
switch (bsize) {
case BLOCK_8X8: return vpx_highbd_12_mse8x8;
@@ -377,7 +378,6 @@ static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
case BLOCK_8X16: return vpx_highbd_12_mse8x16;
default: return vpx_highbd_12_mse16x16;
}
- break;
}
}
@@ -435,6 +435,9 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize];
const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY;
+ MV center_mv_full = ref_mv_full;
+ unsigned int start_mv_sad;
+ vp9_sad_fn_ptr_t sad_fn_ptr;
int step_param = 3;
int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
@@ -455,10 +458,18 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
}
#endif // CONFIG_VP9_HIGHBITDEPTH
+ // Calculate SAD of the start mv
+ clamp_mv(&ref_mv_full, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+ start_mv_sad = get_start_mv_sad(x, &ref_mv_full, &center_mv_full,
+ cpi->fn_ptr[bsize].sdf, x->sadperbit16);
+ sad_fn_ptr.sdf = cpi->fn_ptr[bsize].sdf;
+ sad_fn_ptr.sdx4df = cpi->fn_ptr[bsize].sdx4df;
+
// Center the initial step/diamond search on best mv.
- tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv,
- step_param, x->sadperbit16, &num00,
- &v_fn_ptr, ref_mv);
+ tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad,
+ &tmp_mv, step_param, x->sadperbit16, &num00,
+ &sad_fn_ptr, ref_mv);
if (tmp_err < INT_MAX)
tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
if (tmp_err < INT_MAX - new_mv_mode_penalty) tmp_err += new_mv_mode_penalty;
@@ -478,9 +489,9 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
if (num00) {
--num00;
} else {
- tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv,
- step_param + n, x->sadperbit16, &num00,
- &v_fn_ptr, ref_mv);
+ tmp_err = cpi->diamond_search_sad(
+ x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad, &tmp_mv, step_param + n,
+ x->sadperbit16, &num00, &sad_fn_ptr, ref_mv);
if (tmp_err < INT_MAX)
tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
if (tmp_err < INT_MAX - new_mv_mode_penalty)
@@ -595,11 +606,11 @@ static int get_smooth_intra_threshold(VP9_COMMON *cm) {
#define FP_MAX_DN_THRESH 24
#define KERNEL_SIZE 3
-// Baseline Kernal weights for first pass noise metric
-static uint8_t fp_dn_kernal_3[KERNEL_SIZE * KERNEL_SIZE] = { 1, 2, 1, 2, 4,
+// Baseline Kernel weights for first pass noise metric
+static uint8_t fp_dn_kernel_3[KERNEL_SIZE * KERNEL_SIZE] = { 1, 2, 1, 2, 4,
2, 1, 2, 1 };
-// Estimate noise at a single point based on the impace of a spatial kernal
+// Estimate noise at a single point based on the impact of a spatial kernel
// on the point value
static int fp_estimate_point_noise(uint8_t *src_ptr, const int stride) {
int sum_weight = 0;
@@ -609,23 +620,23 @@ static int fp_estimate_point_noise(uint8_t *src_ptr, const int stride) {
int diff;
int dn_diff;
uint8_t *tmp_ptr;
- uint8_t *kernal_ptr;
+ uint8_t *kernel_ptr;
uint8_t dn_val;
uint8_t centre_val = *src_ptr;
- kernal_ptr = fp_dn_kernal_3;
+ kernel_ptr = fp_dn_kernel_3;
- // Apply the kernal
+ // Apply the kernel
tmp_ptr = src_ptr - stride - 1;
for (i = 0; i < KERNEL_SIZE; ++i) {
for (j = 0; j < KERNEL_SIZE; ++j) {
diff = abs((int)centre_val - (int)tmp_ptr[j]);
max_diff = VPXMAX(max_diff, diff);
if (diff <= FP_DN_THRESH) {
- sum_weight += *kernal_ptr;
- sum_val += (int)tmp_ptr[j] * (int)*kernal_ptr;
+ sum_weight += *kernel_ptr;
+ sum_val += (int)tmp_ptr[j] * (int)*kernel_ptr;
}
- ++kernal_ptr;
+ ++kernel_ptr;
}
tmp_ptr += stride;
}
@@ -651,13 +662,13 @@ static int fp_highbd_estimate_point_noise(uint8_t *src_ptr, const int stride) {
int dn_diff;
uint8_t *tmp_ptr;
uint16_t *tmp_ptr16;
- uint8_t *kernal_ptr;
+ uint8_t *kernel_ptr;
uint16_t dn_val;
uint16_t centre_val = *CONVERT_TO_SHORTPTR(src_ptr);
- kernal_ptr = fp_dn_kernal_3;
+ kernel_ptr = fp_dn_kernel_3;
- // Apply the kernal
+ // Apply the kernel
tmp_ptr = src_ptr - stride - 1;
for (i = 0; i < KERNEL_SIZE; ++i) {
tmp_ptr16 = CONVERT_TO_SHORTPTR(tmp_ptr);
@@ -665,10 +676,10 @@ static int fp_highbd_estimate_point_noise(uint8_t *src_ptr, const int stride) {
diff = abs((int)centre_val - (int)tmp_ptr16[j]);
max_diff = VPXMAX(max_diff, diff);
if (diff <= FP_DN_THRESH) {
- sum_weight += *kernal_ptr;
- sum_val += (int)tmp_ptr16[j] * (int)*kernal_ptr;
+ sum_weight += *kernel_ptr;
+ sum_val += (int)tmp_ptr16[j] * (int)*kernel_ptr;
}
- ++kernal_ptr;
+ ++kernel_ptr;
}
tmp_ptr += stride;
}
@@ -793,6 +804,7 @@ static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps,
fps->inactive_zone_cols = (double)0;
if (fp_acc_data->mvcount > 0) {
+ fps->new_mv_count = (double)(fp_acc_data->new_mv_count) / num_mbs;
fps->MVr = (double)(fp_acc_data->sum_mvr) / fp_acc_data->mvcount;
fps->mvr_abs = (double)(fp_acc_data->sum_mvr_abs) / fp_acc_data->mvcount;
fps->MVc = (double)(fp_acc_data->sum_mvc) / fp_acc_data->mvcount;
@@ -809,6 +821,7 @@ static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps,
(double)(fp_acc_data->sum_in_vectors) / (fp_acc_data->mvcount * 2);
fps->pcnt_motion = (double)(fp_acc_data->mvcount) / num_mbs;
} else {
+ fps->new_mv_count = 0.0;
fps->MVr = 0.0;
fps->mvr_abs = 0.0;
fps->MVc = 0.0;
@@ -834,6 +847,7 @@ static void accumulate_fp_mb_row_stat(TileDataEnc *this_tile,
this_tile->fp_data.intra_count_low += fp_acc_data->intra_count_low;
this_tile->fp_data.intra_count_high += fp_acc_data->intra_count_high;
this_tile->fp_data.intra_skip_count += fp_acc_data->intra_skip_count;
+ this_tile->fp_data.new_mv_count += fp_acc_data->new_mv_count;
this_tile->fp_data.mvcount += fp_acc_data->mvcount;
this_tile->fp_data.sum_mvr += fp_acc_data->sum_mvr;
this_tile->fp_data.sum_mvr_abs += fp_acc_data->sum_mvr_abs;
@@ -904,6 +918,9 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
double mb_neutral_count;
int scaled_low_intra_thresh = scale_sse_threshold(cm, LOW_I_THRESH);
+ MV *first_top_mv = &tile_data->firstpass_top_mv;
+ MV last_nonzero_mv = { 0, 0 };
+
// First pass code requires valid last and new frame buffers.
assert(new_yv12 != NULL);
assert(frame_is_intra_only(cm) || (lst_yv12 != NULL));
@@ -944,6 +961,10 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
(*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, mb_row, c);
+ if (mb_col == mb_col_start) {
+ last_nonzero_mv = *first_top_mv;
+ }
+
// Adjust to the next column of MBs.
x->plane[0].src.buf = cpi->Source->y_buffer +
mb_row * 16 * x->plane[0].src.stride + mb_col * 16;
@@ -1253,7 +1274,7 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
xd->mi[0]->mv[0].as_mv = mv;
xd->mi[0]->tx_size = TX_4X4;
xd->mi[0]->ref_frame[0] = LAST_FRAME;
- xd->mi[0]->ref_frame[1] = NONE;
+ xd->mi[0]->ref_frame[1] = NO_REF_FRAME;
vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize);
vp9_encode_sby_pass1(x, bsize);
fp_acc_data->sum_mvr += mv.row;
@@ -1268,6 +1289,10 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
if (!is_zero_mv(&mv)) {
++(fp_acc_data->mvcount);
+ if (!is_equal_mv(&mv, &last_nonzero_mv)) {
+ ++(fp_acc_data->new_mv_count);
+ }
+ last_nonzero_mv = mv;
// Does the row vector point inwards or outwards?
if (mb_row < cm->mb_rows / 2) {
@@ -1323,6 +1348,9 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
}
fp_acc_data->coded_error += (int64_t)this_error;
+ if (mb_col == mb_col_start) {
+ *first_top_mv = last_nonzero_mv;
+ }
recon_yoffset += 16;
recon_uvoffset += uv_mb_height;
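Aside (not part of the patch): the new first-pass new_mv_count statistic counts the nonzero motion vectors that differ from the previously seen nonzero vector, with the first column's vector (firstpass_top_mv) carried between rows so row-based multithreading stays deterministic; the per-frame total is later normalized by the macroblock count. A simplified sketch of the per-row counting logic, using a hypothetical ExampleMv type in place of the encoder's structures:

  #include <stdint.h>

  typedef struct { int16_t row, col; } ExampleMv;

  /* Sketch only: count "new" nonzero MVs in one MB row and update the seed
   * that the next row will start from. */
  static int count_new_mvs_in_row(const ExampleMv *mvs, int num_mbs,
                                  ExampleMv *top_mv) {
    ExampleMv last_nonzero = *top_mv;
    int new_mv_count = 0;
    int col;
    for (col = 0; col < num_mbs; ++col) {
      const ExampleMv mv = mvs[col];
      if (mv.row != 0 || mv.col != 0) {
        if (mv.row != last_nonzero.row || mv.col != last_nonzero.col)
          ++new_mv_count;
        last_nonzero = mv;
      }
      if (col == 0) *top_mv = last_nonzero; /* seed for the next row */
    }
    return new_mv_count;
  }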
@@ -1345,7 +1373,7 @@ static void first_pass_encode(VP9_COMP *cpi, FIRSTPASS_DATA *fp_acc_data) {
MV best_ref_mv;
// Tiling is ignored in the first pass.
vp9_tile_init(tile, cm, 0, 0);
-
+ tile_data.firstpass_top_mv = zero_mv;
#if CONFIG_RATE_CTRL
if (cpi->oxcf.use_simple_encode_api) {
fp_motion_vector_info_reset(cpi->frame_info.frame_width,
@@ -1411,7 +1439,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
if (cpi->row_mt_bit_exact && cpi->twopass.fp_mb_float_stats == NULL)
CHECK_MEM_ERROR(
- cm, cpi->twopass.fp_mb_float_stats,
+ &cm->error, cpi->twopass.fp_mb_float_stats,
vpx_calloc(cm->MBs * sizeof(*cpi->twopass.fp_mb_float_stats), 1));
{
@@ -1437,7 +1465,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
first_pass_stat_calc(cpi, &fps, &(first_tile_col->fp_data));
}
- // Dont allow a value of 0 for duration.
+ // Don't allow a value of 0 for duration.
// (Section duration is also defaulted to minimum of 1.0).
fps.duration = VPXMAX(1.0, (double)(source->ts_end - source->ts_start));
@@ -1447,7 +1475,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
accumulate_stats(&twopass->total_stats, &fps);
}
- // Copy the previous Last Frame back into gf and and arf buffers if
+ // Copy the previous Last Frame back into gf and arf buffers if
// the prediction is good enough... but also don't allow it to lag too far.
if ((twopass->sr_update_lag > 3) ||
((cm->current_video_frame > 0) &&
@@ -1476,22 +1504,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
cm->ref_frame_map[cpi->lst_fb_idx]);
}
- // Use this to see what the first pass reconstruction looks like.
- if (0) {
- char filename[512];
- FILE *recon_file;
- snprintf(filename, sizeof(filename), "enc%04d.yuv",
- (int)cm->current_video_frame);
-
- if (cm->current_video_frame == 0)
- recon_file = fopen(filename, "wb");
- else
- recon_file = fopen(filename, "ab");
-
- (void)fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file);
- fclose(recon_file);
- }
-
// In the first pass, every frame is considered as a show frame.
update_frame_indexes(cm, /*show_frame=*/1);
if (cpi->use_svc) vp9_inc_frame_in_layer(cpi);
@@ -1664,7 +1676,7 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
// Scan the first pass file and calculate a modified score for each
// frame that is used to distribute bits. The modified score is assumed
- // to provide a linear basis for bit allocation. I.e a frame A with a score
+ // to provide a linear basis for bit allocation. I.e., a frame A with a score
// that is double that of frame B will be allocated 2x as many bits.
{
double modified_score_total = 0.0;
@@ -1689,8 +1701,8 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
}
// Second scan using clamps based on the previous cycle average.
- // This may modify the total and average somewhat but we dont bother with
- // further itterations.
+ // This may modify the total and average somewhat but we don't bother with
+ // further iterations.
modified_score_total = 0.0;
s = twopass->stats_in;
while (s < twopass->stats_in_end) {
@@ -1847,7 +1859,7 @@ static int detect_flash_from_frame_stats(const FIRSTPASS_STATS *frame_stats) {
// brief break in prediction (such as a flash) but subsequent frames
// are reasonably well predicted by an earlier (pre flash) frame.
// The recovery after a flash is indicated by a high pcnt_second_ref
- // useage or a second ref coded error notabley lower than the last
+ // usage or a second ref coded error notably lower than the last
// frame coded error.
if (frame_stats == NULL) {
return 0;
@@ -2027,7 +2039,7 @@ static int compute_arf_boost(const FRAME_INFO *frame_info,
this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
&abs_mv_in_out_accumulator, &mv_ratio_accumulator);
- // We want to discount the the flash frame itself and the recovery
+ // We want to discount the flash frame itself and the recovery
// frame that follows as both will have poor scores.
flash_detected = detect_flash_from_frame_stats(this_frame) ||
detect_flash_from_frame_stats(next_frame);
@@ -2158,7 +2170,7 @@ static double calculate_group_score(VP9_COMP *cpi, double av_score,
double score_total = 0.0;
int i = 0;
- // We dont ever want to return a 0 score here.
+ // We don't ever want to return a 0 score here.
if (frame_count == 0) return 1.0;
while ((i < frame_count) && (s < twopass->stats_in_end)) {
@@ -2492,7 +2504,7 @@ static int get_gop_coding_frame_num(
int *use_alt_ref, const FRAME_INFO *frame_info,
const TWO_PASS *const twopass, const RATE_CONTROL *rc,
int gf_start_show_idx, const RANGE *active_gf_interval,
- double gop_intra_factor, int lag_in_frames) {
+ double gop_intra_factor, int lag_in_frames, int *end_of_sequence) {
const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info;
double loop_decay_rate = 1.00;
double mv_ratio_accumulator = 0.0;
@@ -2518,6 +2530,7 @@ static int get_gop_coding_frame_num(
next_frame = fps_get_frame_stats(first_pass_info,
gf_start_show_idx + gop_coding_frames);
if (next_frame == NULL) {
+ *end_of_sequence = gop_coding_frames == 1 && rc->source_alt_ref_active;
break;
}
@@ -2586,7 +2599,7 @@ static int get_gop_coding_frame_num(
if (
// Don't break out with a very short interval.
(gop_coding_frames >= active_gf_interval->min) &&
- // If possible dont break very close to a kf
+ // If possible don't break very close to a kf
((rc->frames_to_key - gop_coding_frames) >= rc->min_gf_interval) &&
(gop_coding_frames & 0x01) && (!flash_detected) &&
((mv_ratio_accumulator > mv_ratio_accumulator_thresh) ||
@@ -2708,6 +2721,8 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) {
double gop_intra_factor;
int gop_frames;
RANGE active_gf_interval;
+ // Whether this is at the end of the last GOP of this sequence.
+ int end_of_sequence = 0;
// Reset the GF group data structures unless this is a key
// frame in which case it will already have been done.
@@ -2739,7 +2754,8 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) {
gop_coding_frames = get_gop_coding_frame_num(
&use_alt_ref, frame_info, twopass, rc, gf_start_show_idx,
- &active_gf_interval, gop_intra_factor, cpi->oxcf.lag_in_frames);
+ &active_gf_interval, gop_intra_factor, cpi->oxcf.lag_in_frames,
+ &end_of_sequence);
use_alt_ref &= allow_alt_ref;
#if CONFIG_RATE_CTRL
// If the external gop_command is on, we will override the decisions
@@ -2757,7 +2773,8 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) {
// are overwritten. Specifically, |gop_coding_frames| and |use_alt_ref|
// will be overwritten.
if (cpi->ext_ratectrl.ready &&
- (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0) {
+ (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 &&
+ cpi->ext_ratectrl.funcs.get_gop_decision != NULL && !end_of_sequence) {
vpx_codec_err_t codec_status;
vpx_rc_gop_decision_t gop_decision;
vpx_rc_gop_info_t gop_info;
@@ -3020,7 +3037,7 @@ static int intra_step_transition(const FIRSTPASS_STATS *this_frame,
next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error);
// Return true the intra/inter ratio for the current frame is
- // low but better in the next and previous frame and the relative useage of
+ // low but better in the next and previous frame and the relative usage of
// intra in the current frame is markedly higher than the last and next frame.
if ((this_ii_ratio < 2.0) && (last_ii_ratio > 2.25) &&
(next_ii_ratio > 2.25) && (this_pcnt_intra > (3 * last_pcnt_intra)) &&
@@ -3041,8 +3058,8 @@ static int intra_step_transition(const FIRSTPASS_STATS *this_frame,
// Minimum % intra coding observed in first pass (1.0 = 100%)
#define MIN_INTRA_LEVEL 0.25
// Threshold for use of the lagging second reference frame. Scene cuts do not
-// usually have a high second ref useage.
-#define SECOND_REF_USEAGE_THRESH 0.2
+// usually have a high second ref usage.
+#define SECOND_REF_USAGE_THRESH 0.2
// Hard threshold where the first pass chooses intra for almost all blocks.
// In such a case even if the frame is not a scene cut coding a key frame
// may be a good option.
@@ -3072,7 +3089,7 @@ static int test_candidate_kf(const FIRST_PASS_INFO *first_pass_info,
detect_flash_from_frame_stats(next_frame);
if (!detect_flash_from_frame_stats(this_frame) &&
!detect_flash_from_frame_stats(next_frame) &&
- (this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
+ (this_frame->pcnt_second_ref < SECOND_REF_USAGE_THRESH) &&
((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) ||
(slide_transition(this_frame, last_frame, next_frame)) ||
(intra_step_transition(this_frame, last_frame, next_frame)) ||
@@ -3350,7 +3367,7 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) {
// The second (lagging) ref error is not valid immediately after
// a key frame because either the lag has not built up (in the case of
- // the first key frame or it points to a refernce before the new key
+ // the first key frame or it points to a reference before the new key
// frame.
if (i < 2) sr_accumulator = 0.0;
frame_boost =
@@ -3380,7 +3397,7 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) {
twopass->key_frame_section_intra_rating = calculate_section_intra_ratio(
start_position, twopass->stats_in_end, rc->frames_to_key);
- // Special case for static / slide show content but dont apply
+ // Special case for static / slide show content but don't apply
// if the kf group is very short.
if ((zero_motion_accumulator > 0.99) && (rc->frames_to_key > 8)) {
rc->kf_boost = (int)(twopass->kf_max_total_boost);
@@ -3494,8 +3511,8 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
FIRSTPASS_STATS this_frame;
const int show_idx = cm->current_video_frame;
- if (cpi->common.current_frame_coding_index == 0) {
- VP9_COMMON *cm = &cpi->common;
+ if (cpi->common.current_frame_coding_index == 0 &&
+ cpi->ext_ratectrl.funcs.send_firstpass_stats != NULL) {
const vpx_codec_err_t codec_status = vp9_extrc_send_firstpass_stats(
&cpi->ext_ratectrl, &cpi->twopass.first_pass_info);
if (codec_status != VPX_CODEC_OK) {
@@ -3513,7 +3530,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
vp9_init_vizier_params(twopass, screen_area);
}
- // If this is an arf frame then we dont want to read the stats file or
+ // If this is an arf frame then we don't want to read the stats file or
// advance the input pointer as we already have what we need.
if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
int target_rate;
@@ -3792,6 +3809,7 @@ int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf,
const int arf_active_or_kf = last_gop_use_alt_ref || first_is_key_frame;
RANGE active_gf_interval;
int arf_layers;
+ int end_of_sequence = 0;
if (oxcf->use_simple_encode_api) {
active_gf_interval = get_active_gf_inverval_range_simple(
rc->min_gf_interval, arf_active_or_kf, rc->frames_to_key);
@@ -3809,9 +3827,9 @@ int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf,
gop_intra_factor = 1.0;
}
- frame_count = get_gop_coding_frame_num(use_alt_ref, frame_info, twopass, rc,
- show_idx, &active_gf_interval,
- gop_intra_factor, oxcf->lag_in_frames);
+ frame_count = get_gop_coding_frame_num(
+ use_alt_ref, frame_info, twopass, rc, show_idx, &active_gf_interval,
+ gop_intra_factor, oxcf->lag_in_frames, &end_of_sequence);
*use_alt_ref &= allow_alt_ref;
return frame_count;
}
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index cdcf56872..a19b04db7 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -14,6 +14,7 @@
#include <assert.h>
#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/encoder/vp9_firstpass_stats.h"
#include "vp9/encoder/vp9_lookahead.h"
#include "vp9/encoder/vp9_ratectrl.h"
@@ -55,37 +56,9 @@ typedef struct {
int64_t sum_mvcs;
int sum_in_vectors;
int intra_smooth_count;
+ int new_mv_count;
} FIRSTPASS_DATA;
-typedef struct {
- double frame;
- double weight;
- double intra_error;
- double coded_error;
- double sr_coded_error;
- double frame_noise_energy;
- double pcnt_inter;
- double pcnt_motion;
- double pcnt_second_ref;
- double pcnt_neutral;
- double pcnt_intra_low; // Coded intra but low variance
- double pcnt_intra_high; // Coded intra high variance
- double intra_skip_pct;
- double intra_smooth_pct; // % of blocks that are smooth
- double inactive_zone_rows; // Image mask rows top and bottom.
- double inactive_zone_cols; // Image mask columns at left and right edges.
- double MVr;
- double mvr_abs;
- double MVc;
- double mvc_abs;
- double MVrv;
- double MVcv;
- double mv_in_out_count;
- double duration;
- double count;
- int64_t spatial_layer_id;
-} FIRSTPASS_STATS;
-
typedef enum {
KF_UPDATE = 0,
LF_UPDATE = 1,
diff --git a/vp9/encoder/vp9_firstpass_stats.h b/vp9/encoder/vp9_firstpass_stats.h
new file mode 100644
index 000000000..01928e781
--- /dev/null
+++ b/vp9/encoder/vp9_firstpass_stats.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_FIRSTPASS_STATS_H_
+#define VPX_VP9_ENCODER_VP9_FIRSTPASS_STATS_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+ double frame;
+ double weight;
+ double intra_error;
+ double coded_error;
+ double sr_coded_error;
+ double frame_noise_energy;
+ double pcnt_inter;
+ double pcnt_motion;
+ double pcnt_second_ref;
+ double pcnt_neutral;
+ double pcnt_intra_low; // Coded intra but low variance
+ double pcnt_intra_high; // Coded intra high variance
+ double intra_skip_pct;
+ double intra_smooth_pct; // % of blocks that are smooth
+ double inactive_zone_rows; // Image mask rows top and bottom.
+ double inactive_zone_cols; // Image mask columns at left and right edges.
+ double MVr;
+ double mvr_abs;
+ double MVc;
+ double mvc_abs;
+ double MVrv;
+ double MVcv;
+ double mv_in_out_count;
+ double duration;
+ double count;
+ double new_mv_count;
+ int64_t spatial_layer_id;
+} FIRSTPASS_STATS;
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_FIRSTPASS_STATS_H_
diff --git a/vp9/encoder/vp9_frame_scale.c b/vp9/encoder/vp9_frame_scale.c
index a410d0407..ba550a1d6 100644
--- a/vp9/encoder/vp9_frame_scale.c
+++ b/vp9/encoder/vp9_frame_scale.c
@@ -12,6 +12,7 @@
#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vp9/common/vp9_blockd.h"
+#include "vp9/encoder/vp9_encoder.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_scale/yv12config.h"
@@ -91,6 +92,23 @@ void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
{
const int dst_w = dst->y_crop_width;
const int dst_h = dst->y_crop_height;
+
+ // The issue b/311394513 reveals a corner case bug. vpx_scaled_2d() requires
+ // that both x_step_q4 and y_step_q4 be less than or equal to 64. Otherwise,
+ // vp9_scale_and_extend_frame_nonnormative(), which supports arbitrary
+ // scaling, needs to be called instead.
+ const int x_step_q4 = 16 * src_w / dst_w;
+ const int y_step_q4 = 16 * src_h / dst_h;
+ if (x_step_q4 > 64 || y_step_q4 > 64) {
+ // This function is only called while cm->bit_depth is VPX_BITS_8.
+#if CONFIG_VP9_HIGHBITDEPTH
+ vp9_scale_and_extend_frame_nonnormative(src, dst, (int)VPX_BITS_8);
+#else
+ vp9_scale_and_extend_frame_nonnormative(src, dst);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ return;
+ }
+
for (i = 0; i < MAX_MB_PLANE; ++i) {
const int factor = (i == 0 || i == 3 ? 1 : 2);
const int src_stride = src_strides[i];
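Side note on the guard added above (not part of the patch): the q4 step is the source-to-destination ratio in 1/16-pel units, so a step above 64 means the source is more than 4x the destination size in that dimension. A minimal, self-contained sketch of the same arithmetic, with a hypothetical helper name and example resolutions:

  /* Sketch only: mirrors the x_step_q4/y_step_q4 check in
   * vp9_scale_and_extend_frame_c(); the dimensions below are illustrative. */
  static int needs_nonnormative_scale(int src_w, int src_h, int dst_w,
                                      int dst_h) {
    const int x_step_q4 = 16 * src_w / dst_w;
    const int y_step_q4 = 16 * src_h / dst_h;
    return x_step_q4 > 64 || y_step_q4 > 64; /* vpx_scaled_2d() limit */
  }

  /* 1920x1080 -> 480x270 is exactly 4x (step 64): vpx_scaled_2d() still works.
   * 1920x1080 -> 400x225 is more than 4x (step 76): fall back to
   * vp9_scale_and_extend_frame_nonnormative(). */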
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 7c2790cb9..2f20a8fe6 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -98,8 +98,7 @@ static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv,
// If the current best reference mv is not centered on 0,0 then do a 0,0
// based search as well.
if (ref_mv->row != 0 || ref_mv->col != 0) {
- unsigned int tmp_err;
- MV zero_ref_mv = { 0, 0 }, tmp_mv;
+ MV zero_ref_mv = { 0, 0 };
tmp_err =
do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv, mb_row, mb_col);
@@ -238,7 +237,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
xd->mi[0] = &mi_local;
mi_local.sb_type = BLOCK_16X16;
mi_local.ref_frame[0] = LAST_FRAME;
- mi_local.ref_frame[1] = NONE;
+ mi_local.ref_frame[1] = NO_REF_FRAME;
for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
MV gld_left_mv = gld_top_mv;
@@ -289,7 +288,7 @@ static void separate_arf_mbs(VP9_COMP *cpi) {
int *arf_not_zz;
CHECK_MEM_ERROR(
- cm, arf_not_zz,
+ &cm->error, arf_not_zz,
vpx_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz), 1));
// We are not interested in results beyond the alt ref itself.
@@ -334,23 +333,16 @@ static void separate_arf_mbs(VP9_COMP *cpi) {
}
}
- // Only bother with segmentation if over 10% of the MBs in static segment
- // if ( ncnt[1] && (ncnt[0] / ncnt[1] < 10) )
- if (1) {
- // Note % of blocks that are marked as static
- if (cm->MBs)
- cpi->static_mb_pct = (ncnt[1] * 100) / (cm->mi_rows * cm->mi_cols);
+ // Note % of blocks that are marked as static
+ if (cm->MBs)
+ cpi->static_mb_pct = (ncnt[1] * 100) / (cm->mi_rows * cm->mi_cols);
- // This error case should not be reachable as this function should
- // never be called with the common data structure uninitialized.
- else
- cpi->static_mb_pct = 0;
-
- vp9_enable_segmentation(&cm->seg);
- } else {
+ // This error case should not be reachable as this function should
+ // never be called with the common data structure uninitialized.
+ else
cpi->static_mb_pct = 0;
- vp9_disable_segmentation(&cm->seg);
- }
+
+ vp9_enable_segmentation(&cm->seg);
  // Free locally allocated storage
vpx_free(arf_not_zz);
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 1f08aa5de..cbe1c4029 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -77,14 +77,6 @@ int vp9_init_search_range(int size) {
return sr;
}
-static INLINE int mv_cost(const MV *mv, const int *joint_cost,
- int *const comp_cost[2]) {
- assert(mv->row >= -MV_MAX && mv->row < MV_MAX);
- assert(mv->col >= -MV_MAX && mv->col < MV_MAX);
- return joint_cost[vp9_get_mv_joint(mv)] + comp_cost[0][mv->row] +
- comp_cost[1][mv->col];
-}
-
int vp9_mv_bit_cost(const MV *mv, const MV *ref, const int *mvjcost,
int *mvcost[2], int weight) {
const MV diff = { mv->row - ref->row, mv->col - ref->col };
@@ -103,15 +95,6 @@ static int mv_err_cost(const MV *mv, const MV *ref, const int *mvjcost,
}
return 0;
}
-
-static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref,
- int sad_per_bit) {
- const MV diff = { mv->row - ref->row, mv->col - ref->col };
- return ROUND_POWER_OF_TWO(
- (unsigned)mv_cost(&diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit,
- VP9_PROB_COST_SHIFT);
-}
-
void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride) {
int len;
int ss_count = 0;
@@ -163,8 +146,8 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
do { \
if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
int64_t tmpmse; \
- const MV mv = { r, c }; \
- const MV ref_mv = { rr, rc }; \
+ const MV cb_mv = { r, c }; \
+ const MV cb_ref_mv = { rr, rc }; \
if (second_pred == NULL) { \
thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
src_stride, &sse); \
@@ -173,7 +156,8 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
src_stride, &sse, second_pred); \
} \
tmpmse = thismse; \
- tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit); \
+ tmpmse += \
+ mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, error_per_bit); \
if (tmpmse >= INT_MAX) { \
v = INT_MAX; \
} else if ((v = (uint32_t)tmpmse) < besterr) { \
@@ -192,15 +176,16 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
#define CHECK_BETTER(v, r, c) \
do { \
if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
- const MV mv = { r, c }; \
- const MV ref_mv = { rr, rc }; \
+ const MV cb_mv = { r, c }; \
+ const MV cb_ref_mv = { rr, rc }; \
if (second_pred == NULL) \
thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
src_stride, &sse); \
else \
thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
src_stride, &sse, second_pred); \
- if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) + \
+ if ((v = mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, \
+ error_per_bit) + \
thismse) < besterr) { \
besterr = v; \
br = r; \
@@ -312,7 +297,7 @@ static unsigned int setup_center_error(
besterr =
vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1);
} else {
- DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+ DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]);
vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
}
@@ -327,7 +312,7 @@ static unsigned int setup_center_error(
uint32_t besterr;
(void)xd;
if (second_pred != NULL) {
- DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+ DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]);
vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
} else {
@@ -650,7 +635,7 @@ static int accurate_sub_pel_search(
vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h,
0, kernel, MV_PRECISION_Q3, 0, 0);
if (second_pred != NULL) {
- DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+ DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]);
vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w);
besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse);
} else {
@@ -669,7 +654,7 @@ static int accurate_sub_pel_search(
vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h,
0, kernel, MV_PRECISION_Q3, 0, 0);
if (second_pred != NULL) {
- DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+ DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]);
vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w);
besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse);
} else {
@@ -686,13 +671,14 @@ static int accurate_sub_pel_search(
do { \
if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
int64_t tmpmse; \
- const MV mv = { r, c }; \
- const MV ref_mv = { rr, rc }; \
- thismse = accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, \
+ const MV cb_mv = { r, c }; \
+ const MV cb_ref_mv = { rr, rc }; \
+ thismse = accurate_sub_pel_search(xd, &cb_mv, x->me_sf, kernel, vfp, z, \
src_stride, y, y_stride, second_pred, \
w, h, &sse); \
tmpmse = thismse; \
- tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit); \
+ tmpmse += \
+ mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, error_per_bit); \
if (tmpmse >= INT_MAX) { \
v = INT_MAX; \
} else if ((v = (uint32_t)tmpmse) < besterr) { \
@@ -711,12 +697,13 @@ static int accurate_sub_pel_search(
#define CHECK_BETTER1(v, r, c) \
do { \
if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
- const MV mv = { r, c }; \
- const MV ref_mv = { rr, rc }; \
- thismse = accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, \
+ const MV cb_mv = { r, c }; \
+ const MV cb_ref_mv = { rr, rc }; \
+ thismse = accurate_sub_pel_search(xd, &cb_mv, x->me_sf, kernel, vfp, z, \
src_stride, y, y_stride, second_pred, \
w, h, &sse); \
- if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) + \
+ if ((v = mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, \
+ error_per_bit) + \
thismse) < besterr) { \
besterr = v; \
br = r; \
@@ -966,7 +953,7 @@ static INLINE int is_mv_in(const MvLimits *mv_limits, const MV *mv) {
}
#define MAX_PATTERN_SCALES 11
-#define MAX_PATTERN_CANDIDATES 8 // max number of canddiates per scale
+#define MAX_PATTERN_CANDIDATES 8 // max number of candidates per scale
#define PATTERN_CANDIDATES_REF 3 // number of refinement candidates
// Calculate and return a sad+mvcost list around an integer best pel.
@@ -980,16 +967,14 @@ static INLINE void calc_int_cost_list(const MACROBLOCK *x, const MV *ref_mv,
const MV fcenter_mv = { ref_mv->row >> 3, ref_mv->col >> 3 };
int br = best_mv->row;
int bc = best_mv->col;
- MV this_mv;
+ const MV mv = { br, bc };
int i;
unsigned int sse;
- this_mv.row = br;
- this_mv.col = bc;
cost_list[0] =
- fn_ptr->vf(what->buf, what->stride, get_buf_from_mv(in_what, &this_mv),
+ fn_ptr->vf(what->buf, what->stride, get_buf_from_mv(in_what, &mv),
in_what->stride, &sse) +
- mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
+ mvsad_err_cost(x, &mv, &fcenter_mv, sadpb);
if (check_bounds(&x->mv_limits, br, bc, 1)) {
for (i = 0; i < 4; i++) {
const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col };
@@ -1049,7 +1034,7 @@ static int vp9_pattern_search(
in_what->stride) +
mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
- // Search all possible scales upto the search param around the center point
+ // Search all possible scales up to the search param around the center point
// pick the scale of the point that is best as the starting scale of
// further steps around it.
if (do_init_search) {
@@ -1170,6 +1155,9 @@ static int vp9_pattern_search(
} while (s--);
}
+ best_mv->row = br;
+ best_mv->col = bc;
+
// Returns the one-away integer pel sad values around the best as follows:
// cost_list[0]: cost at the best integer pel
// cost_list[1]: cost at delta {0, -1} (left) from the best integer pel
@@ -1177,11 +1165,8 @@ static int vp9_pattern_search(
// cost_list[3]: cost at delta { 0, 1} (right) from the best integer pel
// cost_list[4]: cost at delta {-1, 0} (top) from the best integer pel
if (cost_list) {
- const MV best_mv = { br, bc };
- calc_int_cost_list(x, &fcenter_mv, sad_per_bit, vfp, &best_mv, cost_list);
+ calc_int_cost_list(x, &fcenter_mv, sad_per_bit, vfp, best_mv, cost_list);
}
- best_mv->row = br;
- best_mv->col = bc;
return bestsad;
}
@@ -1223,7 +1208,7 @@ static int vp9_pattern_search_sad(
in_what->stride) +
mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
- // Search all possible scales upto the search param around the center point
+ // Search all possible scales up to the search param around the center point
// pick the scale of the point that is best as the starting scale of
// further steps around it.
if (do_init_search) {
@@ -2068,9 +2053,9 @@ int vp9_prepare_nb_full_mvs(const MotionField *motion_field, int mi_row,
#endif // CONFIG_NON_GREEDY_MV
int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg,
- MV *ref_mv, MV *best_mv, int search_param,
- int sad_per_bit, int *num00,
- const vp9_variance_fn_ptr_t *fn_ptr,
+ MV *ref_mv, uint32_t start_mv_sad, MV *best_mv,
+ int search_param, int sad_per_bit, int *num00,
+ const vp9_sad_fn_ptr_t *sad_fn_ptr,
const MV *center_mv) {
int i, j, step;
@@ -2081,7 +2066,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg,
const int in_what_stride = xd->plane[0].pre[0].stride;
const uint8_t *best_address;
- unsigned int bestsad = INT_MAX;
+ unsigned int bestsad = start_mv_sad;
int best_site = -1;
int last_site = -1;
@@ -2099,8 +2084,6 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg,
const int tot_steps = cfg->total_steps - search_param;
const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
- clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max,
- x->mv_limits.row_min, x->mv_limits.row_max);
ref_row = ref_mv->row;
ref_col = ref_mv->col;
*num00 = 0;
@@ -2111,10 +2094,6 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg,
in_what = xd->plane[0].pre[0].buf + ref_row * in_what_stride + ref_col;
best_address = in_what;
- // Check the starting position
- bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride) +
- mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
-
i = 0;
for (step = 0; step < tot_steps; step++) {
@@ -2138,8 +2117,8 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg,
for (t = 0; t < 4; t++) block_offset[t] = ss_os[i + t] + best_address;
- fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
- sad_array);
+ sad_fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
+ sad_array);
for (t = 0; t < 4; t++, i++) {
if (sad_array[t] < bestsad) {
@@ -2163,7 +2142,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg,
if (is_mv_in(&x->mv_limits, &this_mv)) {
const uint8_t *const check_here = ss_os[i] + best_address;
unsigned int thissad =
- fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
+ sad_fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
if (thissad < bestsad) {
thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
@@ -2321,17 +2300,16 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
// TODO(jingning): Implement integral projection functions for high bit-depth
// setting and remove this part of code.
if (xd->bd != 8) {
- unsigned int this_sad;
+ const unsigned int sad = cpi->fn_ptr[bsize].sdf(
+ x->plane[0].src.buf, src_stride, xd->plane[0].pre[0].buf, ref_stride);
tmp_mv->row = 0;
tmp_mv->col = 0;
- this_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
- xd->plane[0].pre[0].buf, ref_stride);
if (scaled_ref_frame) {
int i;
for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
}
- return this_sad;
+ return sad;
}
#endif
@@ -2506,15 +2484,54 @@ int vp9_full_pixel_diamond_new(const VP9_COMP *cpi, MACROBLOCK *x,
point as the best match, we will do a final 1-away diamond
refining search */
static int full_pixel_diamond(const VP9_COMP *const cpi,
- const MACROBLOCK *const x, MV *mvp_full,
- int step_param, int sadpb, int further_steps,
- int do_refine, int *cost_list,
+ const MACROBLOCK *const x, BLOCK_SIZE bsize,
+ MV *mvp_full, int step_param, int sadpb,
+ int further_steps, int do_refine,
+ int use_downsampled_sad, int *cost_list,
const vp9_variance_fn_ptr_t *fn_ptr,
const MV *ref_mv, MV *dst_mv) {
MV temp_mv;
int thissme, n, num00 = 0;
- int bestsme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv,
- step_param, sadpb, &n, fn_ptr, ref_mv);
+ int bestsme;
+ const int src_buf_stride = x->plane[0].src.stride;
+ const uint8_t *const src_buf = x->plane[0].src.buf;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const int pred_buf_stride = xd->plane[0].pre[0].stride;
+ uint8_t *pred_buf;
+ vp9_sad_fn_ptr_t sad_fn_ptr;
+ unsigned int start_mv_sad, start_mv_sad_even_rows, start_mv_sad_odd_rows;
+ const MV ref_mv_full = { ref_mv->row >> 3, ref_mv->col >> 3 };
+ clamp_mv(mvp_full, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+
+ pred_buf =
+ xd->plane[0].pre[0].buf + mvp_full->row * pred_buf_stride + mvp_full->col;
+ start_mv_sad_even_rows =
+ fn_ptr->sdsf(src_buf, src_buf_stride, pred_buf, pred_buf_stride);
+ start_mv_sad_odd_rows =
+ fn_ptr->sdsf(src_buf + src_buf_stride, src_buf_stride,
+ pred_buf + pred_buf_stride, pred_buf_stride);
+ start_mv_sad = (start_mv_sad_even_rows + start_mv_sad_odd_rows) >> 1;
+ start_mv_sad += mvsad_err_cost(x, mvp_full, &ref_mv_full, sadpb);
+
+ sad_fn_ptr.sdf = fn_ptr->sdf;
+ sad_fn_ptr.sdx4df = fn_ptr->sdx4df;
+ if (use_downsampled_sad && num_4x4_blocks_high_lookup[bsize] >= 2) {
+ // If the absolute difference between the pred-to-src SAD of even rows and
+ // the pred-to-src SAD of odd rows is small, skip every other row in sad
+ // computation.
+ const int odd_to_even_diff_sad =
+ abs((int)start_mv_sad_even_rows - (int)start_mv_sad_odd_rows);
+ const int mult_thresh = 10;
+ if (odd_to_even_diff_sad * mult_thresh < (int)start_mv_sad_even_rows) {
+ sad_fn_ptr.sdf = fn_ptr->sdsf;
+ sad_fn_ptr.sdx4df = fn_ptr->sdsx4df;
+ }
+ }
+
+ bestsme =
+ cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad, &temp_mv,
+ step_param, sadpb, &n, &sad_fn_ptr, ref_mv);
if (bestsme < INT_MAX)
bestsme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
*dst_mv = temp_mv;
@@ -2529,9 +2546,9 @@ static int full_pixel_diamond(const VP9_COMP *const cpi,
if (num00) {
num00--;
} else {
- thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv,
- step_param + n, sadpb, &num00, fn_ptr,
- ref_mv);
+ thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad,
+ &temp_mv, step_param + n, sadpb, &num00,
+ &sad_fn_ptr, ref_mv);
if (thissme < INT_MAX)
thissme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
@@ -2549,8 +2566,8 @@ static int full_pixel_diamond(const VP9_COMP *const cpi,
if (do_refine) {
const int search_range = 8;
MV best_mv = *dst_mv;
- thissme = vp9_refining_search_sad(x, &best_mv, sadpb, search_range, fn_ptr,
- ref_mv);
+ thissme = vp9_refining_search_sad(x, &best_mv, sadpb, search_range,
+ &sad_fn_ptr, ref_mv);
if (thissme < INT_MAX)
thissme = vp9_get_mvpred_var(x, &best_mv, ref_mv, fn_ptr, 1);
if (thissme < bestsme) {
@@ -2559,6 +2576,27 @@ static int full_pixel_diamond(const VP9_COMP *const cpi,
}
}
+ if (sad_fn_ptr.sdf != fn_ptr->sdf) {
+ // If we are skipping rows when we perform the motion search, we need to
+ // check the quality of skipping. If it's bad, then we re-run the search
+ // with the skip-row feature off.
+ const uint8_t *best_address = get_buf_from_mv(&xd->plane[0].pre[0], dst_mv);
+ const int sad =
+ fn_ptr->sdf(src_buf, src_buf_stride, best_address, pred_buf_stride);
+ const int skip_sad =
+ fn_ptr->sdsf(src_buf, src_buf_stride, best_address, pred_buf_stride);
+ // We will keep the result of skipping rows if it's good enough.
+ const int kSADThresh =
+ 1 << (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+ if (sad > kSADThresh && abs(skip_sad - sad) * 10 >= VPXMAX(sad, 1) * 9) {
+ // There is a large discrepancy between skipping and not skipping, so we
+ // need to redo the motion search.
+ return full_pixel_diamond(cpi, x, bsize, mvp_full, step_param, sadpb,
+ further_steps, do_refine, 0, cost_list, fn_ptr,
+ ref_mv, dst_mv);
+ }
+ }
+
// Return cost list.
if (cost_list) {
calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list);
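Aside (not part of the patch): the downsampled-SAD path added to full_pixel_diamond() makes two threshold decisions, both simple integer arithmetic. A sketch of just those two checks, with hypothetical helper names and the constants taken from the code above:

  #include <stdlib.h>

  /* Sketch only: search with every other row skipped when the even-row and
   * odd-row SADs of the starting position agree to within 10% of the
   * even-row SAD. */
  static int use_skip_row_sad(unsigned int sad_even_rows,
                              unsigned int sad_odd_rows) {
    const int mult_thresh = 10;
    return abs((int)sad_even_rows - (int)sad_odd_rows) * mult_thresh <
           (int)sad_even_rows;
  }

  /* Sketch only: after the search, keep the skip-row result unless the best
   * MV's skipped SAD differs from its full SAD by 90% or more of the full SAD
   * (and the full SAD exceeds a block-size-dependent threshold); otherwise
   * the search is redone with skipping disabled. */
  static int redo_without_skipping(int sad, int skip_sad, int sad_thresh) {
    const int max_sad = sad > 1 ? sad : 1;
    return sad > sad_thresh && abs(skip_sad - sad) * 10 >= max_sad * 9;
  }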
@@ -2710,7 +2748,7 @@ int64_t vp9_refining_search_sad_new(const MACROBLOCK *x, MV *best_full_mv,
int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
int search_range,
- const vp9_variance_fn_ptr_t *fn_ptr,
+ const vp9_sad_fn_ptr_t *sad_fn_ptr,
const MV *center_mv) {
const MACROBLOCKD *const xd = &x->e_mbd;
const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } };
@@ -2719,7 +2757,7 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
const uint8_t *best_address = get_buf_from_mv(in_what, ref_mv);
unsigned int best_sad =
- fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride) +
+ sad_fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride) +
mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
int i, j;
@@ -2736,7 +2774,8 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
best_address - 1, best_address + 1,
best_address + in_what->stride };
- fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, sads);
+ sad_fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride,
+ sads);
for (j = 0; j < 4; ++j) {
if (sads[j] < best_sad) {
@@ -2756,8 +2795,8 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
if (is_mv_in(&x->mv_limits, &mv)) {
unsigned int sad =
- fn_ptr->sdf(what->buf, what->stride,
- get_buf_from_mv(in_what, &mv), in_what->stride);
+ sad_fn_ptr->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &mv), in_what->stride);
if (sad < best_sad) {
sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
if (sad < best_sad) {
@@ -2874,9 +2913,10 @@ int vp9_full_pixel_search(const VP9_COMP *const cpi, const MACROBLOCK *const x,
break;
case NSTEP:
case MESH:
- var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
- MAX_MVSEARCH_STEPS - 1 - step_param, 1,
- cost_list, fn_ptr, ref_mv, tmp_mv);
+ var = full_pixel_diamond(
+ cpi, x, bsize, mvp_full, step_param, error_per_bit,
+ MAX_MVSEARCH_STEPS - 1 - step_param, 1,
+ cpi->sf.mv.use_downsampled_sad, cost_list, fn_ptr, ref_mv, tmp_mv);
break;
default: assert(0 && "Unknown search method");
}
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index bdaf2ce77..fd6a8b9ac 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -41,6 +41,11 @@ typedef struct search_site_config {
int total_steps;
} search_site_config;
+typedef struct vp9_sad_table {
+ vpx_sad_fn_t sdf;
+ vpx_sad_multi_d_fn_t sdx4df;
+} vp9_sad_fn_ptr_t;
+
static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf,
const MV *mv) {
return &buf->buf[mv->row * buf->stride + mv->col];
@@ -63,12 +68,13 @@ int vp9_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv,
struct VP9_COMP;
struct SPEED_FEATURES;
+struct vp9_sad_table;
int vp9_init_search_range(int size);
int vp9_refining_search_sad(const struct macroblock *x, struct mv *ref_mv,
int error_per_bit, int search_range,
- const struct vp9_variance_vtable *fn_ptr,
+ const struct vp9_sad_table *sad_fn_ptr,
const struct mv *center_mv);
// Perform integral projection based motion estimation.
@@ -94,9 +100,9 @@ extern fractional_mv_step_fp vp9_return_max_sub_pixel_mv;
extern fractional_mv_step_fp vp9_return_min_sub_pixel_mv;
typedef int (*vp9_diamond_search_fn_t)(
- const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, MV *best_mv,
- int search_param, int sad_per_bit, int *num00,
- const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv);
+ const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv,
+ uint32_t start_mv_sad, MV *best_mv, int search_param, int sad_per_bit,
+ int *num00, const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv);
int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
int search_range,
diff --git a/vp9/encoder/vp9_multi_thread.c b/vp9/encoder/vp9_multi_thread.c
index 45659f2a9..0843cd97e 100644
--- a/vp9/encoder/vp9_multi_thread.c
+++ b/vp9/encoder/vp9_multi_thread.c
@@ -59,7 +59,7 @@ void vp9_row_mt_alloc_rd_thresh(VP9_COMP *const cpi,
int i;
CHECK_MEM_ERROR(
- cm, this_tile->row_base_thresh_freq_fact,
+ &cm->error, this_tile->row_base_thresh_freq_fact,
(int *)vpx_calloc(sb_rows * BLOCK_SIZES * MAX_MODES,
sizeof(*(this_tile->row_base_thresh_freq_fact))));
for (i = 0; i < sb_rows * BLOCK_SIZES * MAX_MODES; i++)
@@ -85,7 +85,7 @@ void vp9_row_mt_mem_alloc(VP9_COMP *cpi) {
multi_thread_ctxt->allocated_tile_rows = tile_rows;
multi_thread_ctxt->allocated_vert_unit_rows = jobs_per_tile_col;
- CHECK_MEM_ERROR(cm, multi_thread_ctxt->job_queue,
+ CHECK_MEM_ERROR(&cm->error, multi_thread_ctxt->job_queue,
(JobQueue *)vpx_memalign(32, total_jobs * sizeof(JobQueue)));
#if CONFIG_MULTITHREAD
diff --git a/vp9/encoder/vp9_noise_estimate.c b/vp9/encoder/vp9_noise_estimate.c
index 9696529c5..4ee6e51ba 100644
--- a/vp9/encoder/vp9_noise_estimate.c
+++ b/vp9/encoder/vp9_noise_estimate.c
@@ -202,7 +202,7 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
VPXMIN(cpi->consec_zero_mv[bl_index1],
VPXMIN(cpi->consec_zero_mv[bl_index2],
cpi->consec_zero_mv[bl_index3])));
- // Only consider blocks that are likely steady background. i.e, have
+ // Only consider blocks that are likely steady background. i.e., have
// been encoded as zero/low motion x (= thresh_consec_zeromv) frames
// in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all
// 4 sub-blocks for 16x16 block. And exclude this frame if
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 579b466ca..6f2524b36 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -566,23 +566,26 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
// Transform skipping test in UV planes.
for (i = 1; i <= 2; i++) {
- struct macroblock_plane *const p = &x->plane[i];
- struct macroblockd_plane *const pd = &xd->plane[i];
- const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd);
+ struct macroblock_plane *const p_uv = &x->plane[i];
+ struct macroblockd_plane *const pd_uv = &xd->plane[i];
+ const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd_uv);
const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size];
- const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd);
+ const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd_uv);
const int uv_bw = b_width_log2_lookup[uv_bsize];
const int uv_bh = b_height_log2_lookup[uv_bsize];
const int sf = (uv_bw - b_width_log2_lookup[unit_size]) +
(uv_bh - b_height_log2_lookup[unit_size]);
- const uint32_t uv_dc_thr = pd->dequant[0] * pd->dequant[0] >> (6 - sf);
- const uint32_t uv_ac_thr = pd->dequant[1] * pd->dequant[1] >> (6 - sf);
+ const uint32_t uv_dc_thr =
+ pd_uv->dequant[0] * pd_uv->dequant[0] >> (6 - sf);
+ const uint32_t uv_ac_thr =
+ pd_uv->dequant[1] * pd_uv->dequant[1] >> (6 - sf);
int j = i - 1;
vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i);
flag_preduv_computed[i - 1] = 1;
- var_uv[j] = cpi->fn_ptr[uv_bsize].vf(
- p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse_uv[j]);
+ var_uv[j] = cpi->fn_ptr[uv_bsize].vf(p_uv->src.buf, p_uv->src.stride,
+ pd_uv->dst.buf, pd_uv->dst.stride,
+ &sse_uv[j]);
if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) &&
(sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j]))
@@ -768,7 +771,7 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
for (r = 0; r < max_blocks_high; r += block_step) {
for (c = 0; c < num_4x4_w; c += block_step) {
if (c < max_blocks_wide) {
- const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
+ const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size];
tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
@@ -783,22 +786,19 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
switch (tx_size) {
case TX_16X16:
vpx_hadamard_16x16(src_diff, diff_stride, coeff);
- vp9_quantize_fp(coeff, 256, p->round_fp, p->quant_fp, qcoeff,
- dqcoeff, pd->dequant, eob, scan_order->scan,
- scan_order->iscan);
+ vp9_quantize_fp(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order);
break;
case TX_8X8:
vpx_hadamard_8x8(src_diff, diff_stride, coeff);
- vp9_quantize_fp(coeff, 64, p->round_fp, p->quant_fp, qcoeff,
- dqcoeff, pd->dequant, eob, scan_order->scan,
- scan_order->iscan);
+ vp9_quantize_fp(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order);
break;
default:
assert(tx_size == TX_4X4);
x->fwd_txfm4x4(src_diff, coeff, diff_stride);
- vp9_quantize_fp(coeff, 16, p->round_fp, p->quant_fp, qcoeff,
- dqcoeff, pd->dequant, eob, scan_order->scan,
- scan_order->iscan);
+ vp9_quantize_fp(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order);
break;
}
*skippable &= (*eob == 0);
@@ -1395,8 +1395,8 @@ static void recheck_zeromv_after_denoising(
RD_COST this_rdc;
mi->mode = ZEROMV;
mi->ref_frame[0] = LAST_FRAME;
- mi->ref_frame[1] = NONE;
- set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE);
+ mi->ref_frame[1] = NO_REF_FRAME;
+ set_ref_ptrs(cm, xd, mi->ref_frame[0], NO_REF_FRAME);
mi->mv[0].as_int = 0;
mi->interp_filter = EIGHTTAP;
if (cpi->sf.default_interp_filter == BILINEAR) mi->interp_filter = BILINEAR;
@@ -1414,7 +1414,7 @@ static void recheck_zeromv_after_denoising(
this_rdc = *best_rdc;
mi->mode = ctx_den->best_mode;
mi->ref_frame[0] = ctx_den->best_ref_frame;
- set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE);
+ set_ref_ptrs(cm, xd, mi->ref_frame[0], NO_REF_FRAME);
mi->interp_filter = ctx_den->best_pred_filter;
if (ctx_den->best_ref_frame == INTRA_FRAME) {
mi->mv[0].as_int = INVALID_MV;
@@ -1678,7 +1678,7 @@ static INLINE void init_best_pickmode(BEST_PICKMODE *bp) {
bp->best_intra_tx_size = TX_SIZES;
bp->best_pred_filter = EIGHTTAP;
bp->best_mode_skip_txfm = SKIP_TXFM_NONE;
- bp->best_second_ref_frame = NONE;
+ bp->best_second_ref_frame = NO_REF_FRAME;
bp->best_pred = NULL;
}
@@ -1872,8 +1872,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
vp9_rd_cost_reset(&best_rdc);
vp9_rd_cost_reset(rd_cost);
mi->sb_type = bsize;
- mi->ref_frame[0] = NONE;
- mi->ref_frame[1] = NONE;
+ mi->ref_frame[0] = NO_REF_FRAME;
+ mi->ref_frame[1] = NO_REF_FRAME;
mi->tx_size =
VPXMIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[cm->tx_mode]);
@@ -1933,15 +1933,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
if (cpi->use_svc && svc->force_zero_mode_spatial_ref &&
svc->spatial_layer_id > 0 && !gf_temporal_ref) {
if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
- struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf;
- if (vp9_is_scaled(sf)) {
+ struct scale_factors *const ref_sf = &cm->frame_refs[LAST_FRAME - 1].sf;
+ if (vp9_is_scaled(ref_sf)) {
svc_force_zero_mode[LAST_FRAME - 1] = 1;
inter_layer_ref = LAST_FRAME;
}
}
if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
- struct scale_factors *const sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf;
- if (vp9_is_scaled(sf)) {
+ struct scale_factors *const ref_sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf;
+ if (vp9_is_scaled(ref_sf)) {
svc_force_zero_mode[GOLDEN_FRAME - 1] = 1;
inter_layer_ref = GOLDEN_FRAME;
}
@@ -2051,7 +2051,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
int comp_pred = 0;
int force_mv_inter_layer = 0;
PREDICTION_MODE this_mode;
- second_ref_frame = NONE;
+ second_ref_frame = NO_REF_FRAME;
if (idx < num_inter_modes) {
this_mode = ref_mode_set[idx].pred_mode;
@@ -2628,7 +2628,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
best_pickmode.best_mode = this_mode;
best_pickmode.best_intra_tx_size = mi->tx_size;
best_pickmode.best_ref_frame = INTRA_FRAME;
- best_pickmode.best_second_ref_frame = NONE;
+ best_pickmode.best_second_ref_frame = NO_REF_FRAME;
mi->uv_mode = this_mode;
mi->mv[0].as_int = INVALID_MV;
mi->mv[1].as_int = INVALID_MV;
@@ -2750,8 +2750,8 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row,
MODE_INFO *const mi = xd->mi[0];
MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
const struct segmentation *const seg = &cm->seg;
- MV_REFERENCE_FRAME ref_frame, second_ref_frame = NONE;
- MV_REFERENCE_FRAME best_ref_frame = NONE;
+ MV_REFERENCE_FRAME ref_frame, second_ref_frame = NO_REF_FRAME;
+ MV_REFERENCE_FRAME best_ref_frame = NO_REF_FRAME;
unsigned char segment_id = mi->segment_id;
struct buf_2d yv12_mb[4][MAX_MB_PLANE];
int64_t best_rd = INT64_MAX;
@@ -2772,9 +2772,10 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row,
if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) &&
(yv12 != NULL)) {
int_mv *const candidates = mbmi_ext->ref_mvs[ref_frame];
- const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
- vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf,
- sf);
+ const struct scale_factors *const ref_sf =
+ &cm->frame_refs[ref_frame - 1].sf;
+ vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, ref_sf,
+ ref_sf);
vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame, candidates, mi_row, mi_col,
mbmi_ext->mode_context);
@@ -2789,7 +2790,7 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row,
mi->tx_size = TX_4X4;
mi->uv_mode = DC_PRED;
mi->ref_frame[0] = LAST_FRAME;
- mi->ref_frame[1] = NONE;
+ mi->ref_frame[1] = NO_REF_FRAME;
mi->interp_filter =
cm->interp_filter == SWITCHABLE ? EIGHTTAP : cm->interp_filter;
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index dcc44449f..19edf166d 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -15,6 +15,7 @@
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_scan.h"
#include "vp9/common/vp9_seg_common.h"
#include "vp9/encoder/vp9_encoder.h"
@@ -22,12 +23,14 @@
#include "vp9/encoder/vp9_rd.h"
void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *round_ptr, const int16_t *quant_ptr,
+ const struct macroblock_plane *const mb_plane,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
+ const struct ScanOrder *const scan_order) {
int i, eob = -1;
- (void)iscan;
+ const int16_t *round_ptr = mb_plane->round_fp;
+ const int16_t *quant_ptr = mb_plane->quant_fp;
+ const int16_t *scan = scan_order->scan;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
@@ -53,15 +56,15 @@ void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *round_ptr,
- const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
- tran_low_t *dqcoeff_ptr,
+ const struct macroblock_plane *const mb_plane,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
+ const struct ScanOrder *const scan_order) {
int i;
int eob = -1;
-
- (void)iscan;
+ const int16_t *round_ptr = mb_plane->round_fp;
+ const int16_t *quant_ptr = mb_plane->quant_fp;
+ const int16_t *scan = scan_order->scan;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
@@ -86,12 +89,14 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
// TODO(jingning) Refactor this file and combine functions with similar
// operations.
void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *round_ptr, const int16_t *quant_ptr,
+ const struct macroblock_plane *const mb_plane,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
+ const struct ScanOrder *const scan_order) {
int i, eob = -1;
- (void)iscan;
+ const int16_t *round_ptr = mb_plane->round_fp;
+ const int16_t *quant_ptr = mb_plane->quant_fp;
+ const int16_t *scan = scan_order->scan;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
@@ -118,13 +123,14 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_quantize_fp_32x32_c(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr,
- const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
- const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan,
- const int16_t *iscan) {
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const struct ScanOrder *const scan_order) {
int i, eob = -1;
-
- (void)iscan;
+ const int16_t *round_ptr = mb_plane->round_fp;
+ const int16_t *quant_ptr = mb_plane->quant_fp;
+ const int16_t *scan = scan_order->scan;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
@@ -249,8 +255,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
// Y
x->plane[0].quant = quants->y_quant[qindex];
x->plane[0].quant_fp = quants->y_quant_fp[qindex];
- memcpy(x->plane[0].round_fp, quants->y_round_fp[qindex],
- 8 * sizeof(*(x->plane[0].round_fp)));
+ x->plane[0].round_fp = quants->y_round_fp[qindex];
x->plane[0].quant_shift = quants->y_quant_shift[qindex];
x->plane[0].zbin = quants->y_zbin[qindex];
x->plane[0].round = quants->y_round[qindex];
@@ -262,8 +267,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
for (i = 1; i < 3; i++) {
x->plane[i].quant = quants->uv_quant[qindex];
x->plane[i].quant_fp = quants->uv_quant_fp[qindex];
- memcpy(x->plane[i].round_fp, quants->uv_round_fp[qindex],
- 8 * sizeof(*(x->plane[i].round_fp)));
+ x->plane[i].round_fp = quants->uv_round_fp[qindex];
x->plane[i].quant_shift = quants->uv_quant_shift[qindex];
x->plane[i].zbin = quants->uv_zbin[qindex];
x->plane[i].round = quants->uv_round[qindex];
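The quantizer entry points now take the whole struct macroblock_plane and a ScanOrder object instead of individual round/quant (or zbin/round/quant/quant_shift) pointers and separate scan/iscan arrays, and round_fp becomes a pointer into the shared quant tables rather than an 8-entry copy made per call. A schematic caller fragment showing the new call shape (surrounding setup elided, so this is illustrative rather than compilable on its own):

/* Old: vp9_quantize_fp(coeff, 256, p->round_fp, p->quant_fp, qcoeff, dqcoeff,
 *                      pd->dequant, eob, scan_order->scan, scan_order->iscan);
 * New: the plane and the scan order are passed as single objects. */
const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size];
vp9_quantize_fp(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, scan_order);

The vpx_quantize_b / vpx_highbd_quantize_b call sites in vp9_rdopt.c below follow the same pattern.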
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index d9207f7a2..6452e349d 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -260,7 +260,7 @@ void vp9_update_buffer_level_preencode(VP9_COMP *cpi) {
// for the layered rate control which involves cumulative buffer levels for
// the temporal layers. Allow for using the timestamp(pts) delta for the
// framerate when the set_ref_frame_config is used.
-static void update_buffer_level_svc_preencode(VP9_COMP *cpi) {
+void vp9_update_buffer_level_svc_preencode(VP9_COMP *cpi) {
SVC *const svc = &cpi->svc;
int i;
// Set this to 1 to use timestamp delta for "framerate" under
@@ -680,7 +680,8 @@ static int adjust_q_cbr(const VP9_COMP *cpi, int q) {
else
q = qclamp;
}
- if (cpi->oxcf.content == VP9E_CONTENT_SCREEN)
+ if (cpi->oxcf.content == VP9E_CONTENT_SCREEN &&
+ cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
vp9_cyclic_refresh_limit_q(cpi, &q);
return VPXMAX(VPXMIN(q, cpi->rc.worst_quality), cpi->rc.best_quality);
}
@@ -1150,8 +1151,9 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
if (frame_is_intra_only(cm)) {
if (oxcf->rc_mode == VPX_Q) {
int qindex = cq_level;
- double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
- int delta_qindex = vp9_compute_qdelta(rc, q, q * 0.25, cm->bit_depth);
+ double qstart = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+ int delta_qindex =
+ vp9_compute_qdelta(rc, qstart, qstart * 0.25, cm->bit_depth);
active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
} else if (rc->this_key_frame_forced) {
// Handle the special case for key frames forced when we have reached
@@ -1195,7 +1197,7 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
} else {
q = rc->avg_frame_qindex[KEY_FRAME];
}
- // For constrained quality dont allow Q less than the cq level
+ // For constrained quality don't allow Q less than the cq level
if (oxcf->rc_mode == VPX_CQ) {
if (q < cq_level) q = cq_level;
@@ -1206,12 +1208,14 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
} else if (oxcf->rc_mode == VPX_Q) {
int qindex = cq_level;
- double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+ double qstart = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
int delta_qindex;
if (cpi->refresh_alt_ref_frame)
- delta_qindex = vp9_compute_qdelta(rc, q, q * 0.40, cm->bit_depth);
+ delta_qindex =
+ vp9_compute_qdelta(rc, qstart, qstart * 0.40, cm->bit_depth);
else
- delta_qindex = vp9_compute_qdelta(rc, q, q * 0.50, cm->bit_depth);
+ delta_qindex =
+ vp9_compute_qdelta(rc, qstart, qstart * 0.50, cm->bit_depth);
active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
} else {
active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth);
@@ -1219,11 +1223,12 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
} else {
if (oxcf->rc_mode == VPX_Q) {
int qindex = cq_level;
- double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+ double qstart = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
double delta_rate[FIXED_GF_INTERVAL] = { 0.50, 1.0, 0.85, 1.0,
0.70, 1.0, 0.85, 1.0 };
int delta_qindex = vp9_compute_qdelta(
- rc, q, q * delta_rate[cm->current_video_frame % FIXED_GF_INTERVAL],
+ rc, qstart,
+ qstart * delta_rate[cm->current_video_frame % FIXED_GF_INTERVAL],
cm->bit_depth);
active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
} else {
@@ -1355,7 +1360,7 @@ static void pick_kf_q_bound_two_pass(const VP9_COMP *cpi, int *bottom_index,
active_best_quality /= 4;
}
- // Dont allow the active min to be lossless (q0) unlesss the max q
+ // Don't allow the active min to be lossless (q0) unless the max q

// already indicates lossless.
active_best_quality =
VPXMIN(active_worst_quality, VPXMAX(1, active_best_quality));
@@ -1453,7 +1458,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index,
} else {
q = active_worst_quality;
}
- // For constrained quality dont allow Q less than the cq level
+ // For constrained quality don't allow Q less than the cq level
if (oxcf->rc_mode == VPX_CQ) {
if (q < cq_level) q = cq_level;
}
@@ -1859,8 +1864,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
rc->avg_frame_qindex[KEY_FRAME] =
ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
if (cpi->use_svc) {
- int i = 0;
- SVC *svc = &cpi->svc;
+ int i;
for (i = 0; i < svc->number_temporal_layers; ++i) {
const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i,
svc->number_temporal_layers);
@@ -1988,6 +1992,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
rc->last_avg_frame_bandwidth = rc->avg_frame_bandwidth;
if (cpi->use_svc && svc->spatial_layer_id < svc->number_spatial_layers - 1)
svc->lower_layer_qindex = cm->base_qindex;
+ cpi->deadline_mode_previous_frame = cpi->oxcf.mode;
}
void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) {
@@ -2008,6 +2013,7 @@ void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) {
cpi->rc.buffer_level = cpi->rc.optimal_buffer_level;
cpi->rc.bits_off_target = cpi->rc.optimal_buffer_level;
}
+ cpi->deadline_mode_previous_frame = cpi->oxcf.mode;
}
int vp9_calc_pframe_target_size_one_pass_vbr(const VP9_COMP *cpi) {
@@ -2033,7 +2039,11 @@ int vp9_calc_pframe_target_size_one_pass_vbr(const VP9_COMP *cpi) {
int vp9_calc_iframe_target_size_one_pass_vbr(const VP9_COMP *cpi) {
static const int kf_ratio = 25;
const RATE_CONTROL *rc = &cpi->rc;
- const int target = rc->avg_frame_bandwidth * kf_ratio;
+ int target = rc->avg_frame_bandwidth;
+ if (target > INT_MAX / kf_ratio)
+ target = INT_MAX;
+ else
+ target = rc->avg_frame_bandwidth * kf_ratio;
return vp9_rc_clamp_iframe_target_size(cpi, target);
}
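The key-frame target above now saturates at INT_MAX instead of multiplying avg_frame_bandwidth by kf_ratio unconditionally, which could overflow int at very high bitrates. A minimal, self-contained illustration of the saturating-multiply pattern (the helper name is hypothetical, chosen only for this sketch):

#include <limits.h>
#include <stdio.h>

static int saturating_mul(int value, int factor) {
  /* Clamp to INT_MAX instead of overflowing when value * factor does not fit. */
  if (value > INT_MAX / factor) return INT_MAX;
  return value * factor;
}

int main(void) {
  const int kf_ratio = 25;
  printf("%d\n", saturating_mul(200000, kf_ratio));       /* 5000000 */
  printf("%d\n", saturating_mul(INT_MAX / 10, kf_ratio)); /* clamped to 2147483647 */
  return 0;
}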
@@ -2111,7 +2121,8 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
int target;
if (!cpi->refresh_alt_ref_frame &&
(cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) ||
- rc->frames_to_key == 0)) {
+ rc->frames_to_key == 0 ||
+ (cpi->oxcf.mode != cpi->deadline_mode_previous_frame))) {
cm->frame_type = KEY_FRAME;
rc->this_key_frame_forced =
cm->current_video_frame != 0 && rc->frames_to_key == 0;
@@ -2165,12 +2176,12 @@ int vp9_calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
if (diff > 0) {
// Lower the target bandwidth for this frame.
const int pct_low = (int)VPXMIN(diff / one_pct_bits, oxcf->under_shoot_pct);
- target -= (target * pct_low) / 200;
+ target -= (int)(((int64_t)target * pct_low) / 200);
} else if (diff < 0) {
// Increase the target bandwidth for this frame.
const int pct_high =
(int)VPXMIN(-diff / one_pct_bits, oxcf->over_shoot_pct);
- target += (target * pct_high) / 200;
+ target += (int)(((int64_t)target * pct_high) / 200);
}
if (oxcf->rc_max_inter_bitrate_pct) {
const int max_rate =
@@ -2277,14 +2288,15 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) {
// Periodic key frames is based on the super-frame counter
// (svc.current_superframe), also only base spatial layer is key frame.
// Key frame is set for any of the following: very first frame, frame flags
- // indicates key, superframe counter hits key frequency, or (non-intra) sync
- // flag is set for spatial layer 0.
+ // indicates key, superframe counter hits key frequency, (non-intra) sync
+ // flag is set for spatial layer 0, or deadline mode changes.
if ((cm->current_video_frame == 0 && !svc->previous_frame_is_intra_only) ||
(cpi->frame_flags & FRAMEFLAGS_KEY) ||
(cpi->oxcf.auto_key &&
(svc->current_superframe % cpi->oxcf.key_freq == 0) &&
!svc->previous_frame_is_intra_only && svc->spatial_layer_id == 0) ||
- (svc->spatial_layer_sync[0] == 1 && svc->spatial_layer_id == 0)) {
+ (svc->spatial_layer_sync[0] == 1 && svc->spatial_layer_id == 0) ||
+ (cpi->oxcf.mode != cpi->deadline_mode_previous_frame)) {
cm->frame_type = KEY_FRAME;
rc->source_alt_ref_active = 0;
if (is_one_pass_svc(cpi)) {
@@ -2438,7 +2450,7 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) {
vp9_cyclic_refresh_update_parameters(cpi);
vp9_rc_set_frame_target(cpi, target);
- if (cm->show_frame) update_buffer_level_svc_preencode(cpi);
+ if (cm->show_frame) vp9_update_buffer_level_svc_preencode(cpi);
if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC && svc->single_layer_svc == 1 &&
svc->spatial_layer_id == svc->first_spatial_layer_to_encode &&
@@ -2483,7 +2495,8 @@ void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) {
RATE_CONTROL *const rc = &cpi->rc;
int target;
if ((cm->current_video_frame == 0) || (cpi->frame_flags & FRAMEFLAGS_KEY) ||
- (cpi->oxcf.auto_key && rc->frames_to_key == 0)) {
+ (cpi->oxcf.auto_key && rc->frames_to_key == 0) ||
+ (cpi->oxcf.mode != cpi->deadline_mode_previous_frame)) {
cm->frame_type = KEY_FRAME;
rc->frames_to_key = cpi->oxcf.key_freq;
rc->kf_boost = DEFAULT_KF_BOOST;
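As the hunks above show, the one-pass VBR, CBR and SVC paths now also force a key frame when the deadline mode changes between frames; cpi->deadline_mode_previous_frame is refreshed in both postencode update paths earlier in this file. A minimal self-contained illustration of the trigger, using a hypothetical enum and struct (not libvpx types):

#include <stdio.h>

typedef enum { MODE_GOOD, MODE_REALTIME } DeadlineMode; /* hypothetical */

typedef struct {
  DeadlineMode mode;                         /* current oxcf.mode */
  DeadlineMode deadline_mode_previous_frame; /* updated after each frame */
} EncoderState;

static int force_key_frame(const EncoderState *s, int frame_idx,
                           int frames_to_key) {
  /* Key frame on the first frame, at the key-frame interval, or when the
   * deadline mode changed since the previously encoded frame. */
  return frame_idx == 0 || frames_to_key == 0 ||
         s->mode != s->deadline_mode_previous_frame;
}

int main(void) {
  EncoderState s = { MODE_GOOD, MODE_GOOD };
  printf("%d\n", force_key_frame(&s, 10, 50)); /* 0: nothing changed */
  s.mode = MODE_REALTIME;                      /* switch deadline mid-stream */
  printf("%d\n", force_key_frame(&s, 11, 49)); /* 1: key frame forced */
  return 0;
}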
@@ -2636,7 +2649,8 @@ void vp9_rc_update_framerate(VP9_COMP *cpi) {
RATE_CONTROL *const rc = &cpi->rc;
int vbr_max_bits;
- rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / cpi->framerate);
+ rc->avg_frame_bandwidth =
+ (int)VPXMIN(oxcf->target_bandwidth / cpi->framerate, INT_MAX);
rc->min_frame_bandwidth =
(int)(rc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100);
@@ -2690,7 +2704,7 @@ static void vbr_rate_correction(VP9_COMP *cpi, int *this_frame_target) {
}
// Fast redistribution of bits arising from massive local undershoot.
- // Dont do it for kf,arf,gf or overlay frames.
+ // Don't do it for kf, arf, gf or overlay frames.
if (!frame_is_kf_gf_arf(cpi) && !rc->is_src_frame_alt_ref &&
rc->vbr_bits_off_target_fast) {
int one_frame_bits = VPXMAX(rc->avg_frame_bandwidth, *this_frame_target);
@@ -3269,11 +3283,9 @@ int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) {
MODE_INFO **mi = cm->mi_grid_visible;
int sum_intra_usage = 0;
int mi_row, mi_col;
- int tot = 0;
for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
if (mi[0]->ref_frame[0] == INTRA_FRAME) sum_intra_usage++;
- tot++;
mi++;
}
mi += 8;
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index 96a8fd3f1..48c49e937 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -350,6 +350,8 @@ void vp9_estimate_qp_gop(struct VP9_COMP *cpi);
void vp9_compute_frame_low_motion(struct VP9_COMP *const cpi);
+void vp9_update_buffer_level_svc_preencode(struct VP9_COMP *cpi);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index 58dd75b44..95c95971c 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -513,22 +513,6 @@ static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
*d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
}
-static void model_rd_norm_vec(int xsq_q10[MAX_MB_PLANE],
- int r_q10[MAX_MB_PLANE],
- int d_q10[MAX_MB_PLANE]) {
- int i;
- const int one_q10 = 1 << 10;
- for (i = 0; i < MAX_MB_PLANE; ++i) {
- const int tmp = (xsq_q10[i] >> 2) + 8;
- const int k = get_msb(tmp) - 3;
- const int xq = (k << 3) + ((tmp >> k) & 0x7);
- const int a_q10 = ((xsq_q10[i] - xsq_iq_q10[xq]) << 10) >> (2 + k);
- const int b_q10 = one_q10 - a_q10;
- r_q10[i] = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10;
- d_q10[i] = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
- }
-}
-
static const uint32_t MAX_XSQ_Q10 = 245727;
void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
@@ -554,30 +538,6 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
}
}
-// Implements a fixed length vector form of vp9_model_rd_from_var_lapndz where
-// vectors are of length MAX_MB_PLANE and all elements of var are non-zero.
-void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE],
- unsigned int n_log2[MAX_MB_PLANE],
- unsigned int qstep[MAX_MB_PLANE],
- int64_t *rate_sum, int64_t *dist_sum) {
- int i;
- int xsq_q10[MAX_MB_PLANE], d_q10[MAX_MB_PLANE], r_q10[MAX_MB_PLANE];
- for (i = 0; i < MAX_MB_PLANE; ++i) {
- const uint64_t xsq_q10_64 =
- (((uint64_t)qstep[i] * qstep[i] << (n_log2[i] + 10)) + (var[i] >> 1)) /
- var[i];
- xsq_q10[i] = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10);
- }
- model_rd_norm_vec(xsq_q10, r_q10, d_q10);
- for (i = 0; i < MAX_MB_PLANE; ++i) {
- int rate =
- ROUND_POWER_OF_TWO(r_q10[i] << n_log2[i], 10 - VP9_PROB_COST_SHIFT);
- int64_t dist = (var[i] * (int64_t)d_q10[i] + 512) >> 10;
- *rate_sum += rate;
- *dist_sum += dist;
- }
-}
-
// Disable gcc 12.2 false positive warning.
// warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=]
#if defined(__GNUC__) && !defined(__clang__)
diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h
index d2bc5e60e..6c61ae514 100644
--- a/vp9/encoder/vp9_rd.h
+++ b/vp9/encoder/vp9_rd.h
@@ -121,11 +121,9 @@ typedef struct RD_OPT {
int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES];
int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS];
-#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
int64_t prediction_type_threshes_prev[MAX_REF_FRAMES][REFERENCE_MODES];
int64_t filter_threshes_prev[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS];
-#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
int RDMULT;
int RDDIV;
double r0;
@@ -166,11 +164,6 @@ void vp9_initialize_me_consts(struct VP9_COMP *cpi, MACROBLOCK *x, int qindex);
void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
unsigned int qstep, int *rate, int64_t *dist);
-void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE],
- unsigned int n_log2[MAX_MB_PLANE],
- unsigned int qstep[MAX_MB_PLANE],
- int64_t *rate_sum, int64_t *dist_sum);
-
int vp9_get_switchable_rate(const struct VP9_COMP *cpi,
const MACROBLOCKD *const xd);
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index a464ce38f..974e43c90 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -77,7 +77,7 @@ struct rdcost_block_args {
int64_t best_rd;
int exit_early;
int use_fast_coef_costing;
- const scan_order *so;
+ const ScanOrder *so;
uint8_t skippable;
struct buf_2d *this_recon;
};
@@ -86,28 +86,28 @@ struct rdcost_block_args {
#if !CONFIG_REALTIME_ONLY
static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
- { NEARESTMV, { LAST_FRAME, NONE } },
- { NEARESTMV, { ALTREF_FRAME, NONE } },
- { NEARESTMV, { GOLDEN_FRAME, NONE } },
+ { NEARESTMV, { LAST_FRAME, NO_REF_FRAME } },
+ { NEARESTMV, { ALTREF_FRAME, NO_REF_FRAME } },
+ { NEARESTMV, { GOLDEN_FRAME, NO_REF_FRAME } },
- { DC_PRED, { INTRA_FRAME, NONE } },
+ { DC_PRED, { INTRA_FRAME, NO_REF_FRAME } },
- { NEWMV, { LAST_FRAME, NONE } },
- { NEWMV, { ALTREF_FRAME, NONE } },
- { NEWMV, { GOLDEN_FRAME, NONE } },
+ { NEWMV, { LAST_FRAME, NO_REF_FRAME } },
+ { NEWMV, { ALTREF_FRAME, NO_REF_FRAME } },
+ { NEWMV, { GOLDEN_FRAME, NO_REF_FRAME } },
- { NEARMV, { LAST_FRAME, NONE } },
- { NEARMV, { ALTREF_FRAME, NONE } },
- { NEARMV, { GOLDEN_FRAME, NONE } },
+ { NEARMV, { LAST_FRAME, NO_REF_FRAME } },
+ { NEARMV, { ALTREF_FRAME, NO_REF_FRAME } },
+ { NEARMV, { GOLDEN_FRAME, NO_REF_FRAME } },
- { ZEROMV, { LAST_FRAME, NONE } },
- { ZEROMV, { GOLDEN_FRAME, NONE } },
- { ZEROMV, { ALTREF_FRAME, NONE } },
+ { ZEROMV, { LAST_FRAME, NO_REF_FRAME } },
+ { ZEROMV, { GOLDEN_FRAME, NO_REF_FRAME } },
+ { ZEROMV, { ALTREF_FRAME, NO_REF_FRAME } },
{ NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
{ NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
- { TM_PRED, { INTRA_FRAME, NONE } },
+ { TM_PRED, { INTRA_FRAME, NO_REF_FRAME } },
{ NEARMV, { LAST_FRAME, ALTREF_FRAME } },
{ NEWMV, { LAST_FRAME, ALTREF_FRAME } },
@@ -117,20 +117,20 @@ static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
{ ZEROMV, { LAST_FRAME, ALTREF_FRAME } },
{ ZEROMV, { GOLDEN_FRAME, ALTREF_FRAME } },
- { H_PRED, { INTRA_FRAME, NONE } },
- { V_PRED, { INTRA_FRAME, NONE } },
- { D135_PRED, { INTRA_FRAME, NONE } },
- { D207_PRED, { INTRA_FRAME, NONE } },
- { D153_PRED, { INTRA_FRAME, NONE } },
- { D63_PRED, { INTRA_FRAME, NONE } },
- { D117_PRED, { INTRA_FRAME, NONE } },
- { D45_PRED, { INTRA_FRAME, NONE } },
+ { H_PRED, { INTRA_FRAME, NO_REF_FRAME } },
+ { V_PRED, { INTRA_FRAME, NO_REF_FRAME } },
+ { D135_PRED, { INTRA_FRAME, NO_REF_FRAME } },
+ { D207_PRED, { INTRA_FRAME, NO_REF_FRAME } },
+ { D153_PRED, { INTRA_FRAME, NO_REF_FRAME } },
+ { D63_PRED, { INTRA_FRAME, NO_REF_FRAME } },
+ { D117_PRED, { INTRA_FRAME, NO_REF_FRAME } },
+ { D45_PRED, { INTRA_FRAME, NO_REF_FRAME } },
};
static const REF_DEFINITION vp9_ref_order[MAX_REFS] = {
- { { LAST_FRAME, NONE } }, { { GOLDEN_FRAME, NONE } },
- { { ALTREF_FRAME, NONE } }, { { LAST_FRAME, ALTREF_FRAME } },
- { { GOLDEN_FRAME, ALTREF_FRAME } }, { { INTRA_FRAME, NONE } },
+ { { LAST_FRAME, NO_REF_FRAME } }, { { GOLDEN_FRAME, NO_REF_FRAME } },
+ { { ALTREF_FRAME, NO_REF_FRAME } }, { { LAST_FRAME, ALTREF_FRAME } },
+ { { GOLDEN_FRAME, ALTREF_FRAME } }, { { INTRA_FRAME, NO_REF_FRAME } },
};
#endif // !CONFIG_REALTIME_ONLY
@@ -160,10 +160,13 @@ static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int m, int n,
}
#if !CONFIG_REALTIME_ONLY
-static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
- MACROBLOCKD *xd, int *out_rate_sum,
- int64_t *out_dist_sum, int *skip_txfm_sb,
- int64_t *skip_sse_sb) {
+// Build the inter prediction plane by plane and compute the model RD cost,
+// with an optional early termination once the cost exceeds the best RD so far.
+static int build_inter_pred_model_rd_earlyterm(
+ VP9_COMP *cpi, int mi_row, int mi_col, BLOCK_SIZE bsize, MACROBLOCK *x,
+ MACROBLOCKD *xd, int *out_rate_sum, int64_t *out_dist_sum,
+ int *skip_txfm_sb, int64_t *skip_sse_sb, int do_earlyterm,
+ int64_t best_rd) {
// Note our transform coeffs are 8 times an orthogonal transform.
// Hence quantizer step is also 8 times. To get effective quantizer
// we need to divide by 8 before sending to modeling function.
@@ -176,19 +179,15 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
int64_t total_sse = 0;
int skip_flag = 1;
const int shift = 6;
- int64_t dist;
const int dequant_shift =
#if CONFIG_VP9_HIGHBITDEPTH
(xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 :
#endif // CONFIG_VP9_HIGHBITDEPTH
3;
- unsigned int qstep_vec[MAX_MB_PLANE];
- unsigned int nlog2_vec[MAX_MB_PLANE];
- unsigned int sum_sse_vec[MAX_MB_PLANE];
- int any_zero_sum_sse = 0;
x->pred_sse[ref] = 0;
+ // Build prediction signal, compute stats and RD cost on a per-plane basis
for (i = 0; i < MAX_MB_PLANE; ++i) {
struct macroblock_plane *const p = &x->plane[i];
struct macroblockd_plane *const pd = &xd->plane[i];
@@ -207,7 +206,14 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
int idx, idy;
int lw = b_width_log2_lookup[unit_size] + 2;
int lh = b_height_log2_lookup[unit_size] + 2;
+ unsigned int qstep;
+ unsigned int nlog2;
+ int64_t dist = 0;
+ // Build inter predictor
+ vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i);
+
+ // Compute prediction error stats for this plane
for (idy = 0; idy < bh; ++idy) {
for (idx = 0; idx < bw; ++idx) {
uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw);
@@ -243,46 +249,36 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
}
total_sse += sum_sse;
- sum_sse_vec[i] = sum_sse;
- any_zero_sum_sse = any_zero_sum_sse || (sum_sse == 0);
- qstep_vec[i] = pd->dequant[1] >> dequant_shift;
- nlog2_vec[i] = num_pels_log2_lookup[bs];
- }
+ qstep = pd->dequant[1] >> dequant_shift;
+ nlog2 = num_pels_log2_lookup[bs];
- // Fast approximate the modelling function.
- if (cpi->sf.simple_model_rd_from_var) {
- for (i = 0; i < MAX_MB_PLANE; ++i) {
+ // Fast approximate the modelling function.
+ if (cpi->sf.simple_model_rd_from_var) {
int64_t rate;
- const int64_t square_error = sum_sse_vec[i];
- int quantizer = qstep_vec[i];
-
- if (quantizer < 120)
- rate = (square_error * (280 - quantizer)) >> (16 - VP9_PROB_COST_SHIFT);
+ if (qstep < 120)
+ rate = ((int64_t)sum_sse * (280 - qstep)) >> (16 - VP9_PROB_COST_SHIFT);
else
rate = 0;
- dist = (square_error * quantizer) >> 8;
+ dist = ((int64_t)sum_sse * qstep) >> 8;
rate_sum += rate;
- dist_sum += dist;
- }
- } else {
- if (any_zero_sum_sse) {
- for (i = 0; i < MAX_MB_PLANE; ++i) {
- int rate;
- vp9_model_rd_from_var_lapndz(sum_sse_vec[i], nlog2_vec[i], qstep_vec[i],
- &rate, &dist);
- rate_sum += rate;
- dist_sum += dist;
- }
} else {
- vp9_model_rd_from_var_lapndz_vec(sum_sse_vec, nlog2_vec, qstep_vec,
- &rate_sum, &dist_sum);
+ int rate;
+ vp9_model_rd_from_var_lapndz(sum_sse, nlog2, qstep, &rate, &dist);
+ rate_sum += rate;
+ }
+ dist_sum += dist;
+ if (do_earlyterm) {
+ if (RDCOST(x->rdmult, x->rddiv, rate_sum,
+ dist_sum << VP9_DIST_SCALE_LOG2) >= best_rd)
+ return 1;
}
}
-
*skip_txfm_sb = skip_flag;
*skip_sse_sb = total_sse << VP9_DIST_SCALE_LOG2;
*out_rate_sum = (int)rate_sum;
*out_dist_sum = dist_sum << VP9_DIST_SCALE_LOG2;
+
+ return 0;
}
#endif // !CONFIG_REALTIME_ONLY
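The refactor above folds predictor construction into the per-plane loop of build_inter_pred_model_rd_earlyterm and, when do_earlyterm is set, returns 1 as soon as the accumulated model cost already exceeds best_rd. A stripped-down, self-contained sketch of that accumulate-and-bail pattern (stubbed per-plane rates/distortions and a toy cost function, not the real model):

#include <stdint.h>
#include <stdio.h>

#define NUM_PLANES 3

/* Toy stand-in for RDCOST(): rate plus weighted distortion. */
static int64_t rdcost(int64_t rate, int64_t dist) { return rate + 4 * dist; }

/* Returns 1 when the accumulated cost exceeds best_rd before all planes are
 * processed, mirroring the early-termination contract of the refactored helper. */
static int model_rd_earlyterm(const int64_t rate[NUM_PLANES],
                              const int64_t dist[NUM_PLANES], int do_earlyterm,
                              int64_t best_rd, int64_t *rate_sum,
                              int64_t *dist_sum) {
  int i;
  *rate_sum = 0;
  *dist_sum = 0;
  for (i = 0; i < NUM_PLANES; ++i) {
    /* In the real code this is where the plane's predictor is built and its
     * rate/distortion is modelled from variance and quantizer step. */
    *rate_sum += rate[i];
    *dist_sum += dist[i];
    if (do_earlyterm && rdcost(*rate_sum, *dist_sum) >= best_rd) return 1;
  }
  return 0;
}

int main(void) {
  const int64_t rate[NUM_PLANES] = { 100, 40, 40 };
  const int64_t dist[NUM_PLANES] = { 900, 200, 200 };
  int64_t r, d;
  printf("%d\n", model_rd_earlyterm(rate, dist, 1, 2000, &r, &d));  /* 1 */
  printf("%d\n", model_rd_earlyterm(rate, dist, 1, 10000, &r, &d)); /* 0 */
  return 0;
}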
@@ -462,11 +458,6 @@ static int cost_coeffs(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size,
return cost;
}
-static INLINE int num_4x4_to_edge(int plane_4x4_dim, int mb_to_edge_dim,
- int subsampling_dim, int blk_dim) {
- return plane_4x4_dim + (mb_to_edge_dim >> (5 + subsampling_dim)) - blk_dim;
-}
-
// Copy all visible 4x4s in the transform block.
static void copy_block_visible(const MACROBLOCKD *xd,
const struct macroblockd_plane *const pd,
@@ -567,47 +558,11 @@ static unsigned pixel_sse(const VP9_COMP *const cpi, const MACROBLOCKD *xd,
return sse;
}
-// Compute the squares sum squares on all visible 4x4s in the transform block.
-static int64_t sum_squares_visible(const MACROBLOCKD *xd,
- const struct macroblockd_plane *const pd,
- const int16_t *diff, const int diff_stride,
- int blk_row, int blk_col,
- const BLOCK_SIZE plane_bsize,
- const BLOCK_SIZE tx_bsize) {
- int64_t sse;
- const int plane_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
- const int plane_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
- const int tx_4x4_w = num_4x4_blocks_wide_lookup[tx_bsize];
- const int tx_4x4_h = num_4x4_blocks_high_lookup[tx_bsize];
- int b4x4s_to_right_edge = num_4x4_to_edge(plane_4x4_w, xd->mb_to_right_edge,
- pd->subsampling_x, blk_col);
- int b4x4s_to_bottom_edge = num_4x4_to_edge(plane_4x4_h, xd->mb_to_bottom_edge,
- pd->subsampling_y, blk_row);
- if (tx_bsize == BLOCK_4X4 ||
- (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) {
- assert(tx_4x4_w == tx_4x4_h);
- sse = (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, tx_4x4_w << 2);
- } else {
- int r, c;
- int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h);
- int max_c = VPXMIN(b4x4s_to_right_edge, tx_4x4_w);
- sse = 0;
- // if we are in the unrestricted motion border.
- for (r = 0; r < max_r; ++r) {
- // Skip visiting the sub blocks that are wholly within the UMV.
- for (c = 0; c < max_c; ++c) {
- sse += (int64_t)vpx_sum_squares_2d_i16(
- diff + r * diff_stride * 4 + c * 4, diff_stride, 4);
- }
- }
- }
- return sse;
-}
-
static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane,
BLOCK_SIZE plane_bsize, int block, int blk_row,
int blk_col, TX_SIZE tx_size, int64_t *out_dist,
- int64_t *out_sse, struct buf_2d *out_recon) {
+ int64_t *out_sse, struct buf_2d *out_recon,
+ int sse_calc_done) {
MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -633,15 +588,15 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane,
if (x->skip_encode && !is_inter_block(xd->mi[0])) {
// TODO(jingning): tune the model to better capture the distortion.
- const int64_t p =
+ const int64_t mean_quant_error =
(pd->dequant[1] * pd->dequant[1] * (1 << ss_txfrm_size)) >>
#if CONFIG_VP9_HIGHBITDEPTH
(shift + 2 + (bd - 8) * 2);
#else
(shift + 2);
#endif // CONFIG_VP9_HIGHBITDEPTH
- *out_dist += (p >> 4);
- *out_sse += p;
+ *out_dist += (mean_quant_error >> 4);
+ *out_sse += mean_quant_error;
}
} else {
const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
@@ -657,8 +612,12 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane,
const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
unsigned int tmp;
- tmp = pixel_sse(cpi, xd, pd, src, src_stride, dst, dst_stride, blk_row,
- blk_col, plane_bsize, tx_bsize);
+ if (sse_calc_done) {
+ tmp = (unsigned int)(*out_sse);
+ } else {
+ tmp = pixel_sse(cpi, xd, pd, src, src_stride, dst, dst_stride, blk_row,
+ blk_col, plane_bsize, tx_bsize);
+ }
*out_sse = (int64_t)tmp * 16;
if (out_recon) {
const int out_recon_idx = 4 * (blk_row * out_recon->stride + blk_col);
@@ -754,20 +713,29 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
const struct macroblockd_plane *const pd = &xd->plane[plane];
const int dst_stride = pd->dst.stride;
const uint8_t *dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)];
+ const int enable_trellis_opt = args->cpi->sf.trellis_opt_tx_rd.method;
+ const double trellis_opt_thresh = args->cpi->sf.trellis_opt_tx_rd.thresh;
+ int sse_calc_done = 0;
+#if CONFIG_MISMATCH_DEBUG
+ struct encode_b_args encode_b_arg = {
+ x, enable_trellis_opt, trellis_opt_thresh, &sse_calc_done,
+ &sse, args->t_above, args->t_left, &mi->skip,
+ 0, // mi_row
+ 0, // mi_col
+ 0 // output_enabled
+ };
+#else
+ struct encode_b_args encode_b_arg = {
+ x, enable_trellis_opt, trellis_opt_thresh, &sse_calc_done,
+ &sse, args->t_above, args->t_left, &mi->skip
+ };
+#endif
if (args->exit_early) return;
if (!is_inter_block(mi)) {
-#if CONFIG_MISMATCH_DEBUG
- struct encode_b_args intra_arg = {
- x, x->block_qcoeff_opt, args->t_above, args->t_left, &mi->skip, 0, 0, 0
- };
-#else
- struct encode_b_args intra_arg = { x, x->block_qcoeff_opt, args->t_above,
- args->t_left, &mi->skip };
-#endif
vp9_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size,
- &intra_arg);
+ &encode_b_arg);
if (recon) {
uint8_t *rec_ptr = &recon->buf[4 * (blk_row * recon->stride + blk_col)];
copy_block_visible(xd, pd, dst, dst_stride, rec_ptr, recon->stride,
@@ -775,16 +743,21 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
}
if (x->block_tx_domain) {
dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col,
- tx_size, &dist, &sse, /*recon =*/0);
+ tx_size, &dist, &sse, /*out_recon=*/NULL, sse_calc_done);
} else {
const struct macroblock_plane *const p = &x->plane[plane];
const int src_stride = p->src.stride;
- const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
const uint8_t *src = &p->src.buf[4 * (blk_row * src_stride + blk_col)];
- const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
unsigned int tmp;
- sse = sum_squares_visible(xd, pd, diff, diff_stride, blk_row, blk_col,
- plane_bsize, tx_bsize);
+ if (!sse_calc_done) {
+ const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+ const int16_t *diff =
+ &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+ int visible_width, visible_height;
+ sse = sum_squares_visible(xd, pd, diff, diff_stride, blk_row, blk_col,
+ plane_bsize, tx_bsize, &visible_width,
+ &visible_height);
+ }
#if CONFIG_VP9_HIGHBITDEPTH
if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && (xd->bd > 8))
sse = ROUND64_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
@@ -808,12 +781,18 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
if (skip_txfm_flag == SKIP_TXFM_NONE ||
(recon && skip_txfm_flag == SKIP_TXFM_AC_ONLY)) {
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+ const int16_t *const diff =
+ &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+ const int use_trellis_opt =
+ do_trellis_opt(pd, diff, diff_stride, blk_row, blk_col, plane_bsize,
+ tx_size, &encode_b_arg);
// full forward transform and quantization
vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size);
- if (x->block_qcoeff_opt)
- vp9_optimize_b(x, plane, block, tx_size, coeff_ctx);
+ if (use_trellis_opt) vp9_optimize_b(x, plane, block, tx_size, coeff_ctx);
dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col,
- tx_size, &dist, &sse, recon);
+ tx_size, &dist, &sse, recon, sse_calc_done);
} else if (skip_txfm_flag == SKIP_TXFM_AC_ONLY) {
// compute DC coefficient
tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block);
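In block_rd_txfm, whether vp9_optimize_b runs is now decided per block by do_trellis_opt under the trellis_opt_tx_rd speed feature (a method plus a threshold), replacing the single block_qcoeff_opt flag, and the helper can report the block's source-diff SSE through sse_calc_done so dist_block and the pixel-domain path reuse it instead of recomputing. A self-contained sketch of that gating idea; the threshold rule below is hypothetical and only stands in for do_trellis_opt's actual criteria:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical gate: run trellis optimization only when the block's residual
 * energy exceeds a speed-feature threshold, caching the SSE as a side effect. */
static int gate_trellis_opt(const int16_t *diff, int n, double thresh,
                            int *sse_calc_done, int64_t *sse_out) {
  int64_t sse = 0;
  int i;
  for (i = 0; i < n; ++i) sse += (int64_t)diff[i] * diff[i];
  *sse_out = sse;     /* cache the SSE ... */
  *sse_calc_done = 1; /* ... so later distortion code can reuse it */
  return sse > (int64_t)thresh;
}

int main(void) {
  const int16_t diff[16] = { 3, -2, 5, 0, 1, -1, 2, 0, 4, 0, 0, 1, 0, 0, 0, 0 };
  int done = 0;
  int64_t sse = 0;
  const int use_opt = gate_trellis_opt(diff, 16, 40.0, &done, &sse);
  printf("sse=%lld use_opt=%d reuse=%d\n", (long long)sse, use_opt, done);
  return 0;
}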
@@ -1149,13 +1128,12 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
vpx_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride, dst,
dst_stride, xd->bd);
if (xd->lossless) {
- const scan_order *so = &vp9_default_scan_orders[TX_4X4];
+ const ScanOrder *so = &vp9_default_scan_orders[TX_4X4];
const int coeff_ctx =
combine_entropy_contexts(tempa[idx], templ[idy]);
vp9_highbd_fwht4x4(src_diff, coeff, 8);
- vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant,
- p->quant_shift, qcoeff, dqcoeff, pd->dequant,
- eob, so->scan, so->iscan);
+ vpx_highbd_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant,
+ eob, so);
ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
so->neighbors, cpi->sf.use_fast_coef_costing);
tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0 ? 1 : 0);
@@ -1166,16 +1144,15 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
} else {
int64_t unused;
const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
- const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type];
+ const ScanOrder *so = &vp9_scan_orders[TX_4X4][tx_type];
const int coeff_ctx =
combine_entropy_contexts(tempa[idx], templ[idy]);
if (tx_type == DCT_DCT)
vpx_highbd_fdct4x4(src_diff, coeff, 8);
else
vp9_highbd_fht4x4(src_diff, coeff, 8, tx_type);
- vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant,
- p->quant_shift, qcoeff, dqcoeff, pd->dequant,
- eob, so->scan, so->iscan);
+ vpx_highbd_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant,
+ eob, so);
ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
so->neighbors, cpi->sf.use_fast_coef_costing);
distortion += vp9_highbd_block_error_dispatch(
@@ -1256,13 +1233,12 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
vpx_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);
if (xd->lossless) {
- const scan_order *so = &vp9_default_scan_orders[TX_4X4];
+ const ScanOrder *so = &vp9_default_scan_orders[TX_4X4];
const int coeff_ctx =
combine_entropy_contexts(tempa[idx], templ[idy]);
vp9_fwht4x4(src_diff, coeff, 8);
- vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant,
- p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
- so->scan, so->iscan);
+ vpx_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob,
+ so);
ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
so->neighbors, cpi->sf.use_fast_coef_costing);
tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0;
@@ -1273,13 +1249,12 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
} else {
int64_t unused;
const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
- const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type];
+ const ScanOrder *so = &vp9_scan_orders[TX_4X4][tx_type];
const int coeff_ctx =
combine_entropy_contexts(tempa[idx], templ[idy]);
vp9_fht4x4(src_diff, coeff, 8, tx_type);
- vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant,
- p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
- so->scan, so->iscan);
+ vpx_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob,
+ so);
ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
so->neighbors, cpi->sf.use_fast_coef_costing);
tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0;
@@ -1416,7 +1391,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
mic->mode = mode;
super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL,
- bsize, best_rd, /*recon = */ 0);
+ bsize, best_rd, /*recon=*/NULL);
if (this_rate_tokenonly == INT_MAX) continue;
@@ -1456,7 +1431,6 @@ static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x, int *rate,
if (ref_best_rd < 0) is_cost_valid = 0;
if (is_inter_block(mi) && is_cost_valid) {
- int plane;
for (plane = 1; plane < MAX_MB_PLANE; ++plane)
vp9_subtract_plane(x, bsize, plane);
}
@@ -1469,7 +1443,7 @@ static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x, int *rate,
for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
txfm_rd_in_plane(cpi, x, &pnrate, &pndist, &pnskip, &pnsse, ref_best_rd,
plane, bsize, uv_tx_size, cpi->sf.use_fast_coef_costing,
- /*recon = */ 0);
+ /*recon=*/NULL);
if (pnrate == INT_MAX) {
is_cost_valid = 0;
break;
@@ -1652,7 +1626,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x,
&pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, i, pd->dst.stride)];
int64_t thisdistortion = 0, thissse = 0;
int thisrate = 0, ref;
- const scan_order *so = &vp9_default_scan_orders[TX_4X4];
+ const ScanOrder *so = &vp9_default_scan_orders[TX_4X4];
const int is_compound = has_second_ref(mi);
const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter];
@@ -1732,14 +1706,12 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x,
x->fwd_txfm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
coeff, 8);
#if CONFIG_VP9_HIGHBITDEPTH
- vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant,
- p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
- so->scan, so->iscan);
+ vpx_highbd_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob,
+ so);
thisdistortion += vp9_highbd_block_error_dispatch(
coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz, bd);
#else
- vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, p->quant_shift,
- qcoeff, dqcoeff, pd->dequant, eob, so->scan, so->iscan);
+ vpx_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob, so);
thisdistortion +=
vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz);
#endif // CONFIG_VP9_HIGHBITDEPTH
@@ -1833,7 +1805,7 @@ static int check_best_zero_mv(const VP9_COMP *cpi,
const MV_REFERENCE_FRAME ref_frames[2]) {
if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
frame_mv[this_mode][ref_frames[0]].as_int == 0 &&
- (ref_frames[1] == NONE ||
+ (ref_frames[1] == NO_REF_FRAME ||
frame_mv[this_mode][ref_frames[1]].as_int == 0)) {
int rfc = mode_context[ref_frames[0]];
int c1 = cost_mv_ref(cpi, NEARMV, rfc);
@@ -1846,7 +1818,7 @@ static int check_best_zero_mv(const VP9_COMP *cpi,
if (c2 > c3) return 0;
} else {
assert(this_mode == ZEROMV);
- if (ref_frames[1] == NONE) {
+ if (ref_frames[1] == NO_REF_FRAME) {
if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0) ||
(c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0))
return 0;
@@ -1862,10 +1834,80 @@ static int check_best_zero_mv(const VP9_COMP *cpi,
return 1;
}
+static INLINE int skip_iters(const int_mv iter_mvs[][2], int ite, int id) {
+ if (ite >= 2 && iter_mvs[ite - 2][!id].as_int == iter_mvs[ite][!id].as_int) {
+ int_mv cur_fullpel_mv, prev_fullpel_mv;
+ cur_fullpel_mv.as_mv.row = iter_mvs[ite][id].as_mv.row >> 3;
+ cur_fullpel_mv.as_mv.col = iter_mvs[ite][id].as_mv.col >> 3;
+ prev_fullpel_mv.as_mv.row = iter_mvs[ite - 2][id].as_mv.row >> 3;
+ prev_fullpel_mv.as_mv.col = iter_mvs[ite - 2][id].as_mv.col >> 3;
+ if (cur_fullpel_mv.as_int == prev_fullpel_mv.as_int) return 1;
+ }
+ return 0;
+}
+
+// Compares motion vector and mode rate of the current mode against a given mode.
+static INLINE int compare_mv_mode_rate(MV this_mv, MV mode_mv,
+ int this_mode_rate, int mode_rate,
+ int mv_thresh) {
+ const int mv_diff =
+ abs(mode_mv.col - this_mv.col) + abs(mode_mv.row - this_mv.row);
+ if (mv_diff <= mv_thresh && mode_rate < this_mode_rate) return 1;
+ return 0;
+}
+
+// Skips single reference inter modes NEARMV and ZEROMV based on motion vector
+// difference and mode rate.
+static INLINE int skip_single_mode_based_on_mode_rate(
+ int_mv (*mode_mv)[MAX_REF_FRAMES], int *single_mode_rate, int this_mode,
+ int ref0, int this_mode_rate, int best_mode_index) {
+ MV this_mv = mode_mv[this_mode][ref0].as_mv;
+ const int mv_thresh = 3;
+
+ // Pruning is not applicable for NEARESTMV or NEWMV modes.
+ if (this_mode == NEARESTMV || this_mode == NEWMV) return 0;
+ // Pruning is not done when the reference frame of the mode is the same as
+ // the best reference frame so far.
+ if (best_mode_index > 0 &&
+ ref0 == vp9_mode_order[best_mode_index].ref_frame[0])
+ return 0;
+
+ // Check absolute mv difference and mode rate of current mode w.r.t NEARESTMV
+ if (compare_mv_mode_rate(
+ this_mv, mode_mv[NEARESTMV][ref0].as_mv, this_mode_rate,
+ single_mode_rate[INTER_OFFSET(NEARESTMV)], mv_thresh))
+ return 1;
+
+ // Check absolute mv difference and mode rate of current mode w.r.t NEWMV
+ if (compare_mv_mode_rate(this_mv, mode_mv[NEWMV][ref0].as_mv, this_mode_rate,
+ single_mode_rate[INTER_OFFSET(NEWMV)], mv_thresh))
+ return 1;
+
+ // Pruning w.r.t NEARMV is applicable only for ZEROMV mode
+ if (this_mode == NEARMV) return 0;
+ // Check absolute mv difference and mode rate of current mode w.r.t NEARMV
+ if (compare_mv_mode_rate(this_mv, mode_mv[NEARMV][ref0].as_mv, this_mode_rate,
+ single_mode_rate[INTER_OFFSET(NEARMV)], mv_thresh))
+ return 1;
+ return 0;
+}
+
+#define MAX_JOINT_MV_SEARCH_ITERS 4
+static INLINE int get_joint_search_iters(int sf_level, BLOCK_SIZE bsize) {
+ int num_iters = MAX_JOINT_MV_SEARCH_ITERS; // sf_level = 0
+ if (sf_level >= 2)
+ num_iters = 0;
+ else if (sf_level >= 1)
+ num_iters = bsize < BLOCK_8X8
+ ? 0
+ : (bsize <= BLOCK_16X16 ? 2 : MAX_JOINT_MV_SEARCH_ITERS);
+ return num_iters;
+}
+
static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
int_mv *frame_mv, int mi_row, int mi_col,
int_mv single_newmv[MAX_REF_FRAMES],
- int *rate_mv) {
+ int *rate_mv, int num_iters) {
const VP9_COMMON *const cm = &cpi->common;
const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
@@ -1874,6 +1916,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
const int refs[2] = { mi->ref_frame[0],
mi->ref_frame[1] < 0 ? 0 : mi->ref_frame[1] };
int_mv ref_mv[2];
+ int_mv iter_mvs[MAX_JOINT_MV_SEARCH_ITERS][2];
int ite, ref;
const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter];
struct scale_factors sf;
@@ -1888,12 +1931,15 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
// Prediction buffer from second frame.
#if CONFIG_VP9_HIGHBITDEPTH
- DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[64 * 64]);
+ DECLARE_ALIGNED(32, uint16_t, second_pred_alloc_16[64 * 64]);
uint8_t *second_pred;
#else
- DECLARE_ALIGNED(16, uint8_t, second_pred[64 * 64]);
+ DECLARE_ALIGNED(32, uint8_t, second_pred[64 * 64]);
#endif // CONFIG_VP9_HIGHBITDEPTH
+ // Check that the number of iterations does not exceed the max
+ assert(num_iters <= MAX_JOINT_MV_SEARCH_ITERS);
+
for (ref = 0; ref < 2; ++ref) {
ref_mv[ref] = x->mbmi_ext->ref_mvs[refs[ref]][0];
@@ -1909,6 +1955,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
}
frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int;
+ iter_mvs[0][ref].as_int = single_newmv[refs[ref]].as_int;
}
// Since we have scaled the reference frames to match the size of the current
@@ -1923,7 +1970,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
// Allow joint search multiple times iteratively for each reference frame
// and break out of the search loop if it couldn't find a better mv.
- for (ite = 0; ite < 4; ite++) {
+ for (ite = 0; ite < num_iters; ite++) {
struct buf_2d ref_yv12[2];
uint32_t bestsme = UINT_MAX;
int sadpb = x->sadperbit16;
@@ -1935,6 +1982,11 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
// odd iterations search in the second. The predictor
// found for the 'other' reference frame is factored in.
+ // Skip further search iterations if, compared with the previous search of
+ // this ref frame (two iterations back), the other ref frame's motion vector
+ // and this ref frame's full-pixel motion vector are both unchanged.
+ if (skip_iters(iter_mvs, ite, id)) break;
+
// Initialized here because of compiler problem in Visual Studio.
ref_yv12[0] = xd->plane[0].pre[0];
ref_yv12[1] = xd->plane[0].pre[1];
@@ -2000,6 +2052,10 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
} else {
break;
}
+ if (ite < num_iters - 1) {
+ iter_mvs[ite + 1][0].as_int = frame_mv[refs[0]].as_int;
+ iter_mvs[ite + 1][1].as_int = frame_mv[refs[1]].as_int;
+ }
}
*rate_mv = 0;
@@ -2020,7 +2076,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
static int64_t rd_pick_best_sub8x8_mode(
VP9_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv,
- int_mv *second_best_ref_mv, int64_t best_rd, int *returntotrate,
+ int_mv *second_best_ref_mv, int64_t best_rd_so_far, int *returntotrate,
int *returnyrate, int64_t *returndistortion, int *skippable, int64_t *psse,
int mvthresh, int_mv seg_mvs[4][MAX_REF_FRAMES], BEST_SEG_INFO *bsi_buf,
int filter_idx, int mi_row, int mi_col) {
@@ -2053,7 +2109,7 @@ static int64_t rd_pick_best_sub8x8_mode(
vp9_zero(*bsi);
- bsi->segment_rd = best_rd;
+ bsi->segment_rd = best_rd_so_far;
bsi->ref_mv[0] = best_ref_mv;
bsi->ref_mv[1] = second_best_ref_mv;
bsi->mvp.as_int = best_ref_mv->as_int;
@@ -2079,14 +2135,14 @@ static int64_t rd_pick_best_sub8x8_mode(
int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
PREDICTION_MODE mode_selected = ZEROMV;
int64_t best_rd = INT64_MAX;
- const int i = idy * 2 + idx;
+ const int block = idy * 2 + idx;
int ref;
for (ref = 0; ref < 1 + has_second_rf; ++ref) {
const MV_REFERENCE_FRAME frame = mi->ref_frame[ref];
frame_mv[ZEROMV][frame].as_int = 0;
vp9_append_sub8x8_mvs_for_idx(
- cm, xd, i, ref, mi_row, mi_col, &frame_mv[NEARESTMV][frame],
+ cm, xd, block, ref, mi_row, mi_col, &frame_mv[NEARESTMV][frame],
&frame_mv[NEARMV][frame], mbmi_ext->mode_context);
}
@@ -2096,7 +2152,7 @@ static int64_t rd_pick_best_sub8x8_mode(
struct buf_2d orig_pre[2];
mode_idx = INTER_OFFSET(this_mode);
- bsi->rdstat[i][mode_idx].brdcost = INT64_MAX;
+ bsi->rdstat[block][mode_idx].brdcost = INT64_MAX;
if (!(inter_mode_mask & (1 << this_mode))) continue;
if (!check_best_zero_mv(cpi, mbmi_ext->mode_context, frame_mv,
@@ -2104,14 +2160,14 @@ static int64_t rd_pick_best_sub8x8_mode(
continue;
memcpy(orig_pre, pd->pre, sizeof(orig_pre));
- memcpy(bsi->rdstat[i][mode_idx].ta, t_above,
- sizeof(bsi->rdstat[i][mode_idx].ta));
- memcpy(bsi->rdstat[i][mode_idx].tl, t_left,
- sizeof(bsi->rdstat[i][mode_idx].tl));
+ memcpy(bsi->rdstat[block][mode_idx].ta, t_above,
+ sizeof(bsi->rdstat[block][mode_idx].ta));
+ memcpy(bsi->rdstat[block][mode_idx].tl, t_left,
+ sizeof(bsi->rdstat[block][mode_idx].tl));
// motion search for newmv (single predictor case only)
if (!has_second_rf && this_mode == NEWMV &&
- seg_mvs[i][mi->ref_frame[0]].as_int == INVALID_MV) {
+ seg_mvs[block][mi->ref_frame[0]].as_int == INVALID_MV) {
MV *const new_mv = &mode_mv[NEWMV][0].as_mv;
int step_param = 0;
uint32_t bestsme = UINT_MAX;
@@ -2121,18 +2177,19 @@ static int64_t rd_pick_best_sub8x8_mode(
int cost_list[5];
const MvLimits tmp_mv_limits = x->mv_limits;
- /* Is the best so far sufficiently good that we cant justify doing
+ /* Is the best so far sufficiently good that we can't justify doing
* and new motion search. */
if (best_rd < label_mv_thresh) break;
if (cpi->oxcf.mode != BEST) {
// use previous block's result as next block's MV predictor.
- if (i > 0) {
- bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int;
- if (i == 2) bsi->mvp.as_int = mi->bmi[i - 2].as_mv[0].as_int;
+ if (block > 0) {
+ bsi->mvp.as_int = mi->bmi[block - 1].as_mv[0].as_int;
+ if (block == 2)
+ bsi->mvp.as_int = mi->bmi[block - 2].as_mv[0].as_int;
}
}
- if (i == 0)
+ if (block == 0)
max_mv = x->max_mv_context[mi->ref_frame[0]];
else
max_mv =
@@ -2161,7 +2218,7 @@ static int64_t rd_pick_best_sub8x8_mode(
}
// adjust src pointer for this block
- mi_buf_shift(x, i);
+ mi_buf_shift(x, block);
vp9_set_mv_search_range(&x->mv_limits, &bsi->ref_mv[0]->as_mv);
@@ -2184,7 +2241,7 @@ static int64_t rd_pick_best_sub8x8_mode(
cpi->sf.use_accurate_subpel_search);
// save motion search result for use in compound prediction
- seg_mvs[i][mi->ref_frame[0]].as_mv = *new_mv;
+ seg_mvs[block][mi->ref_frame[0]].as_mv = *new_mv;
}
x->pred_mv[mi->ref_frame[0]] = *new_mv;
@@ -2194,40 +2251,44 @@ static int64_t rd_pick_best_sub8x8_mode(
}
if (has_second_rf) {
- if (seg_mvs[i][mi->ref_frame[1]].as_int == INVALID_MV ||
- seg_mvs[i][mi->ref_frame[0]].as_int == INVALID_MV)
+ if (seg_mvs[block][mi->ref_frame[1]].as_int == INVALID_MV ||
+ seg_mvs[block][mi->ref_frame[0]].as_int == INVALID_MV)
continue;
}
if (has_second_rf && this_mode == NEWMV &&
mi->interp_filter == EIGHTTAP) {
+ // Decide the number of joint motion search iterations
+ const int num_joint_search_iters = get_joint_search_iters(
+ cpi->sf.comp_inter_joint_search_iter_level, bsize);
// adjust src pointers
- mi_buf_shift(x, i);
- if (sf->comp_inter_joint_search_thresh <= bsize) {
+ mi_buf_shift(x, block);
+ if (num_joint_search_iters) {
int rate_mv;
joint_motion_search(cpi, x, bsize, frame_mv[this_mode], mi_row,
- mi_col, seg_mvs[i], &rate_mv);
- seg_mvs[i][mi->ref_frame[0]].as_int =
+ mi_col, seg_mvs[block], &rate_mv,
+ num_joint_search_iters);
+ seg_mvs[block][mi->ref_frame[0]].as_int =
frame_mv[this_mode][mi->ref_frame[0]].as_int;
- seg_mvs[i][mi->ref_frame[1]].as_int =
+ seg_mvs[block][mi->ref_frame[1]].as_int =
frame_mv[this_mode][mi->ref_frame[1]].as_int;
}
// restore src pointers
mi_buf_restore(x, orig_src, orig_pre);
}
- bsi->rdstat[i][mode_idx].brate = set_and_cost_bmi_mvs(
- cpi, x, xd, i, this_mode, mode_mv[this_mode], frame_mv, seg_mvs[i],
- bsi->ref_mv, x->nmvjointcost, x->mvcost);
+ bsi->rdstat[block][mode_idx].brate = set_and_cost_bmi_mvs(
+ cpi, x, xd, block, this_mode, mode_mv[this_mode], frame_mv,
+ seg_mvs[block], bsi->ref_mv, x->nmvjointcost, x->mvcost);
for (ref = 0; ref < 1 + has_second_rf; ++ref) {
- bsi->rdstat[i][mode_idx].mvs[ref].as_int =
+ bsi->rdstat[block][mode_idx].mvs[ref].as_int =
mode_mv[this_mode][ref].as_int;
if (num_4x4_blocks_wide > 1)
- bsi->rdstat[i + 1][mode_idx].mvs[ref].as_int =
+ bsi->rdstat[block + 1][mode_idx].mvs[ref].as_int =
mode_mv[this_mode][ref].as_int;
if (num_4x4_blocks_high > 1)
- bsi->rdstat[i + 2][mode_idx].mvs[ref].as_int =
+ bsi->rdstat[block + 2][mode_idx].mvs[ref].as_int =
mode_mv[this_mode][ref].as_int;
}
@@ -2245,7 +2306,7 @@ static int64_t rd_pick_best_sub8x8_mode(
for (ref = 0; ref < 1 + has_second_rf; ++ref) {
subpelmv |= mv_has_subpel(&mode_mv[this_mode][ref].as_mv);
have_ref &= mode_mv[this_mode][ref].as_int ==
- ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int;
+ ref_bsi->rdstat[block][mode_idx].mvs[ref].as_int;
}
if (filter_idx > 1 && !subpelmv && !have_ref) {
@@ -2253,53 +2314,55 @@ static int64_t rd_pick_best_sub8x8_mode(
have_ref = 1;
for (ref = 0; ref < 1 + has_second_rf; ++ref)
have_ref &= mode_mv[this_mode][ref].as_int ==
- ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int;
+ ref_bsi->rdstat[block][mode_idx].mvs[ref].as_int;
}
if (!subpelmv && have_ref &&
- ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
- memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx],
- sizeof(SEG_RDSTAT));
+ ref_bsi->rdstat[block][mode_idx].brdcost < INT64_MAX) {
+ memcpy(&bsi->rdstat[block][mode_idx],
+ &ref_bsi->rdstat[block][mode_idx], sizeof(SEG_RDSTAT));
if (num_4x4_blocks_wide > 1)
- bsi->rdstat[i + 1][mode_idx].eobs =
- ref_bsi->rdstat[i + 1][mode_idx].eobs;
+ bsi->rdstat[block + 1][mode_idx].eobs =
+ ref_bsi->rdstat[block + 1][mode_idx].eobs;
if (num_4x4_blocks_high > 1)
- bsi->rdstat[i + 2][mode_idx].eobs =
- ref_bsi->rdstat[i + 2][mode_idx].eobs;
+ bsi->rdstat[block + 2][mode_idx].eobs =
+ ref_bsi->rdstat[block + 2][mode_idx].eobs;
- if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
+ if (bsi->rdstat[block][mode_idx].brdcost < best_rd) {
mode_selected = this_mode;
- best_rd = bsi->rdstat[i][mode_idx].brdcost;
+ best_rd = bsi->rdstat[block][mode_idx].brdcost;
}
continue;
}
}
- bsi->rdstat[i][mode_idx].brdcost = encode_inter_mb_segment(
- cpi, x, bsi->segment_rd - this_segment_rd, i,
- &bsi->rdstat[i][mode_idx].byrate, &bsi->rdstat[i][mode_idx].bdist,
- &bsi->rdstat[i][mode_idx].bsse, bsi->rdstat[i][mode_idx].ta,
- bsi->rdstat[i][mode_idx].tl, mi_row, mi_col);
- if (bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
- bsi->rdstat[i][mode_idx].brdcost +=
- RDCOST(x->rdmult, x->rddiv, bsi->rdstat[i][mode_idx].brate, 0);
- bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate;
- bsi->rdstat[i][mode_idx].eobs = p->eobs[i];
+ bsi->rdstat[block][mode_idx].brdcost = encode_inter_mb_segment(
+ cpi, x, bsi->segment_rd - this_segment_rd, block,
+ &bsi->rdstat[block][mode_idx].byrate,
+ &bsi->rdstat[block][mode_idx].bdist,
+ &bsi->rdstat[block][mode_idx].bsse, bsi->rdstat[block][mode_idx].ta,
+ bsi->rdstat[block][mode_idx].tl, mi_row, mi_col);
+ if (bsi->rdstat[block][mode_idx].brdcost < INT64_MAX) {
+ bsi->rdstat[block][mode_idx].brdcost += RDCOST(
+ x->rdmult, x->rddiv, bsi->rdstat[block][mode_idx].brate, 0);
+ bsi->rdstat[block][mode_idx].brate +=
+ bsi->rdstat[block][mode_idx].byrate;
+ bsi->rdstat[block][mode_idx].eobs = p->eobs[block];
if (num_4x4_blocks_wide > 1)
- bsi->rdstat[i + 1][mode_idx].eobs = p->eobs[i + 1];
+ bsi->rdstat[block + 1][mode_idx].eobs = p->eobs[block + 1];
if (num_4x4_blocks_high > 1)
- bsi->rdstat[i + 2][mode_idx].eobs = p->eobs[i + 2];
+ bsi->rdstat[block + 2][mode_idx].eobs = p->eobs[block + 2];
}
- if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
+ if (bsi->rdstat[block][mode_idx].brdcost < best_rd) {
mode_selected = this_mode;
- best_rd = bsi->rdstat[i][mode_idx].brdcost;
+ best_rd = bsi->rdstat[block][mode_idx].brdcost;
}
} /*for each 4x4 mode*/
if (best_rd == INT64_MAX) {
int iy, midx;
- for (iy = i + 1; iy < 4; ++iy)
+ for (iy = block + 1; iy < 4; ++iy)
for (midx = 0; midx < INTER_MODES; ++midx)
bsi->rdstat[iy][midx].brdcost = INT64_MAX;
bsi->segment_rd = INT64_MAX;
@@ -2307,22 +2370,22 @@ static int64_t rd_pick_best_sub8x8_mode(
}
mode_idx = INTER_OFFSET(mode_selected);
- memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above));
- memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left));
+ memcpy(t_above, bsi->rdstat[block][mode_idx].ta, sizeof(t_above));
+ memcpy(t_left, bsi->rdstat[block][mode_idx].tl, sizeof(t_left));
- set_and_cost_bmi_mvs(cpi, x, xd, i, mode_selected, mode_mv[mode_selected],
- frame_mv, seg_mvs[i], bsi->ref_mv, x->nmvjointcost,
- x->mvcost);
+ set_and_cost_bmi_mvs(cpi, x, xd, block, mode_selected,
+ mode_mv[mode_selected], frame_mv, seg_mvs[block],
+ bsi->ref_mv, x->nmvjointcost, x->mvcost);
- br += bsi->rdstat[i][mode_idx].brate;
- bd += bsi->rdstat[i][mode_idx].bdist;
- block_sse += bsi->rdstat[i][mode_idx].bsse;
- segmentyrate += bsi->rdstat[i][mode_idx].byrate;
- this_segment_rd += bsi->rdstat[i][mode_idx].brdcost;
+ br += bsi->rdstat[block][mode_idx].brate;
+ bd += bsi->rdstat[block][mode_idx].bdist;
+ block_sse += bsi->rdstat[block][mode_idx].bsse;
+ segmentyrate += bsi->rdstat[block][mode_idx].byrate;
+ this_segment_rd += bsi->rdstat[block][mode_idx].brdcost;
if (this_segment_rd > bsi->segment_rd) {
int iy, midx;
- for (iy = i + 1; iy < 4; ++iy)
+ for (iy = block + 1; iy < 4; ++iy)
for (midx = 0; midx < INTER_MODES; ++midx)
bsi->rdstat[iy][midx].brdcost = INT64_MAX;
bsi->segment_rd = INT64_MAX;
@@ -2340,7 +2403,7 @@ static int64_t rd_pick_best_sub8x8_mode(
// update the coding decisions
for (k = 0; k < 4; ++k) bsi->modes[k] = mi->bmi[k].as_mode;
- if (bsi->segment_rd > best_rd) return INT64_MAX;
+ if (bsi->segment_rd > best_rd_so_far) return INT64_MAX;
/* set it to the best */
for (i = 0; i < 4; i++) {
mode_idx = INTER_OFFSET(bsi->modes[i]);
@@ -2585,9 +2648,9 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
tmp_mv->as_int = INVALID_MV;
if (scaled_ref_frame) {
- int i;
- for (i = 0; i < MAX_MB_PLANE; ++i)
- xd->plane[i].pre[0] = backup_yv12[i];
+ int j;
+ for (j = 0; j < MAX_MB_PLANE; ++j)
+ xd->plane[j].pre[0] = backup_yv12[j];
}
return;
}
@@ -2752,8 +2815,9 @@ static int64_t handle_inter_mode(
struct buf_2d *recon, int *disable_skip, int_mv (*mode_mv)[MAX_REF_FRAMES],
int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES],
INTERP_FILTER (*single_filter)[MAX_REF_FRAMES],
- int (*single_skippable)[MAX_REF_FRAMES], int64_t *psse,
- const int64_t ref_best_rd, int64_t *mask_filter, int64_t filter_cache[]) {
+ int (*single_skippable)[MAX_REF_FRAMES], int *single_mode_rate,
+ int64_t *psse, const int64_t ref_best_rd, int64_t *mask_filter,
+ int64_t filter_cache[], int best_mode_index) {
VP9_COMMON *cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
MODE_INFO *mi = xd->mi[0];
@@ -2771,9 +2835,8 @@ static int64_t handle_inter_mode(
#else
DECLARE_ALIGNED(16, uint8_t, tmp_buf[MAX_MB_PLANE * 64 * 64]);
#endif // CONFIG_VP9_HIGHBITDEPTH
- int pred_exists = 0;
int intpel_mv;
- int64_t rd, tmp_rd, best_rd = INT64_MAX;
+ int64_t rd, tmp_rd = INT64_MAX, best_rd = INT64_MAX;
int best_needs_copy = 0;
uint8_t *orig_dst[MAX_MB_PLANE];
int orig_dst_stride[MAX_MB_PLANE];
@@ -2782,13 +2845,12 @@ static int64_t handle_inter_mode(
uint8_t skip_txfm[MAX_MB_PLANE << 2] = { 0 };
int64_t bsse[MAX_MB_PLANE << 2] = { 0 };
- int bsl = mi_width_log2_lookup[bsize];
- int pred_filter_search =
- cpi->sf.cb_pred_filter_search
- ? (((mi_row + mi_col) >> bsl) +
- get_chessboard_index(cm->current_video_frame)) &
- 0x1
- : 0;
+ const int bsl = mi_width_log2_lookup[bsize];
+ const int blk_parity = (((mi_row + mi_col) >> bsl) +
+ get_chessboard_index(cm->current_video_frame)) &
+ 0x1;
+ const int pred_filter_search =
+ (cpi->sf.cb_pred_filter_search >= 2) && blk_parity;
int skip_txfm_sb = 0;
int64_t skip_sse_sb = INT64_MAX;
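
Editorial aside (not part of the patch): the blk_parity expression above implements a chessboard pattern. A minimal sketch of the idea, assuming get_chessboard_index() simply alternates with the frame count; sketch_* names are invented for illustration.

static int sketch_chessboard_parity(int mi_row, int mi_col, int bsize_log2,
                                    int frame_index) {
  const int chessboard_index = frame_index & 1;  // assumed to flip every frame
  // Blocks whose parity is 1 run the full interpolation-filter evaluation on
  // this frame; the others reuse a default filter. Flipping the pattern each
  // frame means every block position is evaluated on alternate frames.
  return (((mi_row + mi_col) >> bsize_log2) + chessboard_index) & 1;
}
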
@@ -2827,13 +2889,23 @@ static int64_t handle_inter_mode(
if (this_mode == NEWMV) {
int rate_mv;
if (is_comp_pred) {
+ // Decide number of joint motion search iterations
+ const int num_joint_search_iters = get_joint_search_iters(
+ cpi->sf.comp_inter_joint_search_iter_level, bsize);
+
// Initialize mv using single prediction mode result.
frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
- if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
+ if (num_joint_search_iters) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, joint_motion_search_time);
+#endif
joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col,
- single_newmv, &rate_mv);
+ single_newmv, &rate_mv, num_joint_search_iters);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, joint_motion_search_time);
+#endif
} else {
rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv,
&x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
@@ -2845,7 +2917,13 @@ static int64_t handle_inter_mode(
*rate2 += rate_mv;
} else {
int_mv tmp_mv;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, single_motion_search_time);
+#endif
single_motion_search(cpi, x, bsize, mi_row, mi_col, &tmp_mv, &rate_mv);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, single_motion_search_time);
+#endif
if (tmp_mv.as_int == INVALID_MV) return INT64_MAX;
frame_mv[refs[0]].as_int = xd->mi[0]->bmi[0].as_mv[0].as_int =
@@ -2899,23 +2977,45 @@ static int64_t handle_inter_mode(
*rate2 += cost_mv_ref(cpi, this_mode, mbmi_ext->mode_context[refs[0]]);
}
+ if (!is_comp_pred && cpi->sf.prune_single_mode_based_on_mv_diff_mode_rate) {
+ single_mode_rate[INTER_OFFSET(this_mode)] = *rate2;
+ // Prune NEARMV and ZEROMV modes based on motion vector difference and mode
+ // rate.
+ if (skip_single_mode_based_on_mode_rate(mode_mv, single_mode_rate,
+ this_mode, refs[0], *rate2,
+ best_mode_index)) {
+        // Check that when a single inter mode is pruned, the NEARESTMV or
+        // NEWMV mode has not been early terminated. This ensures that not all
+        // single modes get skipped when the speed feature is enabled.
+ assert(single_mode_rate[INTER_OFFSET(NEARESTMV)] != INT_MAX ||
+ single_mode_rate[INTER_OFFSET(NEWMV)] != INT_MAX);
+ return INT64_MAX;
+ }
+ }
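
Editorial note: skip_single_mode_based_on_mode_rate() is not shown in this hunk. The sketch below is a hypothetical illustration of the kind of check the comment describes (identical motion with no cheaper header rate cannot win the RD comparison); all sketch_* names are invented and the real function may differ.

#include <limits.h>

enum { SK_NEARESTMV, SK_NEARMV, SK_ZEROMV, SK_NEWMV, SK_INTER_MODES };
typedef struct { int row, col; } SketchMv;

static int sketch_prune_single_mode(const SketchMv mv[SK_INTER_MODES],
                                    const int mode_rate[SK_INTER_MODES],
                                    int this_mode) {
  int m;
  if (this_mode != SK_NEARMV && this_mode != SK_ZEROMV) return 0;
  for (m = 0; m < SK_INTER_MODES; ++m) {
    if (m == this_mode || mode_rate[m] == INT_MAX) continue;  // not evaluated yet
    // The same motion vector gives the same prediction (and hence the same
    // distortion and residual rate), so a higher or equal mode/mv header rate
    // can never produce a lower RD cost: prune before the expensive search.
    if (mv[m].row == mv[this_mode].row && mv[m].col == mv[this_mode].col &&
        mode_rate[m] <= mode_rate[this_mode])
      return 1;
  }
  return 0;
}
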
if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd &&
mi->mode != NEARESTMV)
return INT64_MAX;
- pred_exists = 0;
// Are all MVs integer pel for Y and UV
intpel_mv = !mv_has_subpel(&mi->mv[0].as_mv);
if (is_comp_pred) intpel_mv &= !mv_has_subpel(&mi->mv[1].as_mv);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, interp_filter_time);
+#endif
// Search for best switchable filter by checking the variance of
// pred error irrespective of whether the filter will be used
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) filter_cache[i] = INT64_MAX;
if (cm->interp_filter != BILINEAR) {
+ // Use cb pattern for filter eval when filter is not switchable
+ const int enable_interp_search =
+ (cpi->sf.cb_pred_filter_search && cm->interp_filter != SWITCHABLE)
+ ? blk_parity
+ : 1;
if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
best_filter = EIGHTTAP;
- } else if (best_filter == SWITCHABLE) {
+ } else if (best_filter == SWITCHABLE && enable_interp_search) {
int newbest;
int tmp_rate_sum = 0;
int64_t tmp_dist_sum = 0;
@@ -2925,6 +3025,9 @@ static int64_t handle_inter_mode(
int64_t rs_rd;
int tmp_skip_sb = 0;
int64_t tmp_skip_sse = INT64_MAX;
+ const int enable_earlyterm =
+ cpi->sf.early_term_interp_search_plane_rd && cm->interp_filter != i;
+ int64_t filt_best_rd;
mi->interp_filter = i;
rs = vp9_get_switchable_rate(cpi, xd);
@@ -2958,9 +3061,16 @@ static int64_t handle_inter_mode(
xd->plane[j].dst.stride = 64;
}
}
- vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
- model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum, &tmp_skip_sb,
- &tmp_skip_sse);
+
+ filt_best_rd =
+ cm->interp_filter == SWITCHABLE ? (best_rd - rs_rd) : best_rd;
+ if (build_inter_pred_model_rd_earlyterm(
+ cpi, mi_row, mi_col, bsize, x, xd, &rate_sum, &dist_sum,
+ &tmp_skip_sb, &tmp_skip_sse, enable_earlyterm,
+ filt_best_rd)) {
+ filter_cache[i] = INT64_MAX;
+ continue;
+ }
rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum);
filter_cache[i] = rd;
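
Editorial sketch of the early-termination idea controlled by early_term_interp_search_plane_rd: accumulate the model rate/distortion plane by plane and stop once the running cost already exceeds the best filter candidate. The cost formula here is simplified; the encoder uses its RDCOST macro, and the helper name below is invented.

typedef struct { int rate; long long dist; } SketchPlaneRd;

// Returns 1 if evaluation was terminated early, 0 if all planes were summed.
static int sketch_model_rd_earlyterm(const SketchPlaneRd plane[3], int num_planes,
                                     long long rdmult, long long best_rd,
                                     int *rate_sum, long long *dist_sum) {
  int p;
  *rate_sum = 0;
  *dist_sum = 0;
  for (p = 0; p < num_planes; ++p) {
    *rate_sum += plane[p].rate;
    *dist_sum += plane[p].dist;
    // Simplified rate-distortion cost; a partial sum that already exceeds the
    // best candidate cannot improve, so the remaining planes are skipped.
    if (rdmult * *rate_sum + *dist_sum > best_rd) return 1;
  }
  return 0;
}
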
@@ -2993,7 +3103,6 @@ static int64_t handle_inter_mode(
if ((cm->interp_filter == SWITCHABLE && newbest) ||
(cm->interp_filter != SWITCHABLE &&
cm->interp_filter == mi->interp_filter)) {
- pred_exists = 1;
tmp_rd = best_rd;
skip_txfm_sb = tmp_skip_sb;
@@ -3005,12 +3114,15 @@ static int64_t handle_inter_mode(
restore_dst_buf(xd, orig_dst, orig_dst_stride);
}
}
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, interp_filter_time);
+#endif
// Set the appropriate filter
mi->interp_filter =
cm->interp_filter != SWITCHABLE ? cm->interp_filter : best_filter;
rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi, xd) : 0;
- if (pred_exists) {
+ if (tmp_rd != INT64_MAX) {
if (best_needs_copy) {
// again temporarily set the buffers to local memory to prevent a memcpy
for (i = 0; i < MAX_MB_PLANE; i++) {
@@ -3025,9 +3137,9 @@ static int64_t handle_inter_mode(
// Handles the special case when a filter that is not in the
// switchable list (ex. bilinear) is indicated at the frame level, or
// skip condition holds.
- vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
- model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist, &skip_txfm_sb,
- &skip_sse_sb);
+ build_inter_pred_model_rd_earlyterm(
+ cpi, mi_row, mi_col, bsize, x, xd, &tmp_rate, &tmp_dist, &skip_txfm_sb,
+ &skip_sse_sb, 0 /*do_earlyterm*/, INT64_MAX);
rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
memcpy(bsse, x->bsse, sizeof(bsse));
@@ -3120,7 +3232,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost,
x->skip_encode = 0;
ctx->skip = 0;
xd->mi[0]->ref_frame[0] = INTRA_FRAME;
- xd->mi[0]->ref_frame[1] = NONE;
+ xd->mi[0]->ref_frame[1] = NO_REF_FRAME;
// Initialize interp_filter here so we do not have to check for inter block
// modes in get_pred_context_switchable_interp()
xd->mi[0]->interp_filter = SWITCHABLE_FILTERS;
@@ -3344,6 +3456,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } };
INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES];
int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES];
+ int single_mode_rate[MAX_REF_FRAMES][INTER_MODES];
int64_t best_rd = best_rd_so_far;
int64_t best_pred_diff[REFERENCE_MODES];
int64_t best_pred_rd[REFERENCE_MODES];
@@ -3493,7 +3606,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
mode_skip_mask[GOLDEN_FRAME] |= INTER_ALL;
}
- if (bsize > sf->max_intra_bsize) {
+ if (bsize > sf->max_intra_bsize && cpi->ref_frame_flags != 0) {
ref_frame_skip_mask[0] |= (1 << INTRA_FRAME);
ref_frame_skip_mask[1] |= (1 << INTRA_FRAME);
}
@@ -3542,6 +3655,10 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
vp9_zero(x->sum_y_eobs);
+ comp_pred = second_ref_frame > INTRA_FRAME;
+ if (!comp_pred && ref_frame != INTRA_FRAME &&
+ sf->prune_single_mode_based_on_mv_diff_mode_rate)
+ single_mode_rate[ref_frame][INTER_OFFSET(this_mode)] = INT_MAX;
if (is_rect_partition) {
if (ctx->skip_ref_frame_mask & (1 << ref_frame)) continue;
@@ -3560,7 +3677,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK;
break;
case ALTREF_FRAME: ref_frame_skip_mask[0] |= ALT_REF_MODE_MASK; break;
- case NONE:
+ case NO_REF_FRAME:
case MAX_REF_FRAMES: assert(0 && "Invalid Reference frame"); break;
}
}
@@ -3593,7 +3710,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
MODE_INFO *ref_mi;
int const_motion = 1;
int skip_ref_frame = !cb_partition_search_ctrl;
- MV_REFERENCE_FRAME rf = NONE;
+ MV_REFERENCE_FRAME rf = NO_REF_FRAME;
int_mv ref_mv;
ref_mv.as_int = INVALID_MV;
@@ -3610,7 +3727,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
if ((mi_col - 1) >= tile_info->mi_col_start) {
if (ref_mv.as_int == INVALID_MV) ref_mv = xd->mi[-1]->mv[0];
- if (rf == NONE) rf = xd->mi[-1]->ref_frame[0];
+ if (rf == NO_REF_FRAME) rf = xd->mi[-1]->ref_frame[0];
for (i = 0; i < mi_height; ++i) {
ref_mi = xd->mi[i * xd->mi_stride - 1];
const_motion &= (ref_mv.as_int == ref_mi->mv[0].as_int) &&
@@ -3627,7 +3744,6 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
if (this_mode == NEARMV || this_mode == ZEROMV) continue;
}
- comp_pred = second_ref_frame > INTRA_FRAME;
if (comp_pred) {
if (!cpi->allow_comp_inter_inter) continue;
@@ -3707,19 +3823,30 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
if (ref_frame == INTRA_FRAME) {
TX_SIZE uv_tx;
struct macroblockd_plane *const pd = &xd->plane[1];
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, intra_mode_search_time);
+#endif
memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, bsize,
best_rd, recon);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, intra_mode_search_time);
+#endif
if (rate_y == INT_MAX) continue;
uv_tx = uv_txsize_lookup[bsize][mi->tx_size][pd->subsampling_x]
[pd->subsampling_y];
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, intra_mode_search_time);
+#endif
if (rate_uv_intra[uv_tx] == INT_MAX) {
choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx],
&rate_uv_tokenonly[uv_tx], &dist_uv[uv_tx],
&skip_uv[uv_tx], &mode_uv[uv_tx]);
}
-
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, intra_mode_search_time);
+#endif
rate_uv = rate_uv_tokenonly[uv_tx];
distortion_uv = dist_uv[uv_tx];
skippable = skippable && skip_uv[uv_tx];
@@ -3730,11 +3857,18 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
rate2 += intra_cost_penalty;
distortion2 = distortion_y + distortion_uv;
} else {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, handle_inter_mode_time);
+#endif
this_rd = handle_inter_mode(
cpi, x, bsize, &rate2, &distortion2, &skippable, &rate_y, &rate_uv,
recon, &disable_skip, frame_mv, mi_row, mi_col, single_newmv,
- single_inter_filter, single_skippable, &total_sse, best_rd,
- &mask_filter, filter_cache);
+ single_inter_filter, single_skippable,
+ &single_mode_rate[ref_frame][0], &total_sse, best_rd, &mask_filter,
+ filter_cache, best_mode_index);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, handle_inter_mode_time);
+#endif
if (this_rd == INT64_MAX) continue;
compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
@@ -3970,13 +4104,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
}
if (best_mode_index < 0 || best_rd >= best_rd_so_far) {
-// If adaptive interp filter is enabled, then the current leaf node of 8x8
-// data is needed for sub8x8. Hence preserve the context.
-#if CONFIG_CONSISTENT_RECODE
+ // If adaptive interp filter is enabled, then the current leaf node of 8x8
+ // data is needed for sub8x8. Hence preserve the context.
if (bsize == BLOCK_8X8) ctx->mic = *xd->mi[0];
-#else
- if (cpi->row_mt && bsize == BLOCK_8X8) ctx->mic = *xd->mi[0];
-#endif
rd_cost->rate = INT_MAX;
rd_cost->rdcost = INT64_MAX;
return;
@@ -4091,7 +4221,7 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, TileDataEnc *tile_data,
mi->mode = ZEROMV;
mi->uv_mode = DC_PRED;
mi->ref_frame[0] = LAST_FRAME;
- mi->ref_frame[1] = NONE;
+ mi->ref_frame[1] = NO_REF_FRAME;
mi->mv[0].as_int = 0;
x->skip = 1;
@@ -4236,7 +4366,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
int rate2 = 0, rate_y = 0, rate_uv = 0;
int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
int skippable = 0;
- int i;
int this_skip2 = 0;
int64_t total_sse = INT_MAX;
int early_term = 0;
@@ -4274,7 +4403,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
case ALTREF_FRAME:
ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) | (1 << LAST_FRAME);
break;
- case NONE:
+ case NO_REF_FRAME:
case MAX_REF_FRAMES: assert(0 && "Invalid Reference frame"); break;
}
}
@@ -4397,7 +4526,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
: NULL;
if (scaled_ref_frame[ref]) {
- int i;
// Swap out the reference frame for a version that's been scaled to
// match the resolution of the current frame, allowing the existing
// motion search code to be used without additional modifications.
@@ -4534,14 +4662,13 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
if (tmp_best_rdu > 0) {
// If even the 'Y' rd value of split is higher than best so far
- // then dont bother looking at UV
+ // then don't bother looking at UV
vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col, BLOCK_8X8);
memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
if (!super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
&uv_sse, BLOCK_8X8, tmp_best_rdu)) {
for (ref = 0; ref < 2; ++ref) {
if (scaled_ref_frame[ref]) {
- int i;
for (i = 0; i < MAX_MB_PLANE; ++i)
xd->plane[i].pre[ref] = backup_yv12[ref][i];
}
@@ -4558,7 +4685,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
for (ref = 0; ref < 2; ++ref) {
if (scaled_ref_frame[ref]) {
// Restore the prediction frame pointers to their unscaled versions.
- int i;
for (i = 0; i < MAX_MB_PLANE; ++i)
xd->plane[i].pre[ref] = backup_yv12[ref][i];
}
@@ -4764,7 +4890,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
mi->mv[1].as_int = xd->mi[0]->bmi[3].as_mv[1].as_int;
}
// If the second reference does not exist, set the corresponding mv to zero.
- if (mi->ref_frame[1] == NONE) {
+ if (mi->ref_frame[1] == NO_REF_FRAME) {
mi->mv[1].as_int = 0;
for (i = 0; i < 4; ++i) {
mi->bmi[i].as_mv[1].as_int = 0;
diff --git a/vp9/encoder/vp9_resize.c b/vp9/encoder/vp9_resize.c
index 7486dee25..ca55ec988 100644
--- a/vp9/encoder/vp9_resize.c
+++ b/vp9/encoder/vp9_resize.c
@@ -360,6 +360,12 @@ static int get_down2_steps(int in_length, int out_length) {
while ((proj_in_length = get_down2_length(in_length, 1)) >= out_length) {
++steps;
in_length = proj_in_length;
+ if (in_length == 1) {
+ // Special case: we break because any further calls to get_down2_length()
+      // will be with length == 1, which returns 1, resulting in an infinite
+ // loop.
+ break;
+ }
}
return steps;
}
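
A standalone illustration of the guard added above, assuming get_down2_length() halves its input with rounding up for a single step; the sketch_* names are invented for this example.

#include <stdio.h>

static int sketch_down2_length(int length) { return (length + 1) >> 1; }

static int sketch_down2_steps(int in_length, int out_length) {
  int steps = 0;
  int proj;
  while ((proj = sketch_down2_length(in_length)) >= out_length) {
    ++steps;
    in_length = proj;
    // Without this break, in_length == 1 keeps projecting to 1, so
    // out_length == 1 would never make the loop condition false.
    if (in_length == 1) break;
  }
  return steps;
}

int main(void) {
  printf("%d\n", sketch_down2_steps(1920, 480));  // 2 halvings: 960, 480
  printf("%d\n", sketch_down2_steps(8, 1));       // terminates thanks to the guard
  return 0;
}
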
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 0431d8a45..56fb5f94f 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -16,8 +16,11 @@
#include "vpx_dsp/vpx_dsp_common.h"
 // Mesh search patterns for various speed settings
-static MESH_PATTERN best_quality_mesh_pattern[MAX_MESH_STEP] = {
- { 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 }
+// Define 2 mesh density levels, one for FC_GRAPHICS_ANIMATION content type and
+// one for non-FC_GRAPHICS_ANIMATION content type.
+static MESH_PATTERN best_quality_mesh_pattern[2][MAX_MESH_STEP] = {
+ { { 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 } },
+ { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },
};
#if !CONFIG_REALTIME_ONLY
@@ -39,7 +42,7 @@ static int frame_is_boosted(const VP9_COMP *cpi) {
// Sets a partition size down to which the auto partition code will always
// search (can go lower), based on the image dimensions. The logic here
// is that the extent to which ringing artefacts are offensive, depends
-// partly on the screen area that over which they propogate. Propogation is
+// partly on the screen area over which they propagate. Propagation is
 // limited by transform block size but the screen area taken up by a given block
// size will be larger for a small image format stretched to full screen.
static BLOCK_SIZE set_partition_min_limit(VP9_COMMON *const cm) {
@@ -67,6 +70,7 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi,
const int is_720p_or_larger = min_frame_size >= 720;
const int is_1080p_or_larger = min_frame_size >= 1080;
const int is_2160p_or_larger = min_frame_size >= 2160;
+ const int boosted = frame_is_boosted(cpi);
// speed 0 features
sf->partition_search_breakout_thr.dist = (1 << 20);
@@ -78,9 +82,13 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi,
// Currently, the machine-learning based partition search early termination
// is only used while VPXMIN(cm->width, cm->height) >= 480 and speed = 0.
sf->rd_ml_partition.search_early_termination = 1;
+ sf->recode_tolerance_high = 45;
} else {
sf->use_square_only_thresh_high = BLOCK_32X32;
}
+ if (is_720p_or_larger) {
+ sf->alt_ref_search_fp = 1;
+ }
if (!is_1080p_or_larger) {
sf->rd_ml_partition.search_breakout = 1;
@@ -95,6 +103,13 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi,
}
}
+ if (!is_720p_or_larger) {
+ if (is_480p_or_larger)
+ sf->prune_single_mode_based_on_mv_diff_mode_rate = boosted ? 0 : 1;
+ else
+ sf->prune_single_mode_based_on_mv_diff_mode_rate = 1;
+ }
+
if (speed >= 1) {
sf->rd_ml_partition.search_early_termination = 0;
sf->rd_ml_partition.search_breakout = 1;
@@ -152,7 +167,7 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi,
sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC;
sf->alt_ref_search_fp = 1;
- sf->cb_pred_filter_search = 1;
+ sf->cb_pred_filter_search = 2;
sf->adaptive_interp_filter_search = 1;
sf->disable_split_mask = DISABLE_ALL_SPLIT;
}
@@ -209,15 +224,32 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
const int boosted = frame_is_boosted(cpi);
int i;
- sf->tx_size_search_breakout = 1;
+ sf->adaptive_interp_filter_search = 1;
+ sf->adaptive_pred_interp_filter = 1;
sf->adaptive_rd_thresh = 1;
sf->adaptive_rd_thresh_row_mt = 0;
sf->allow_skip_recode = 1;
sf->less_rectangular_check = 1;
- sf->use_square_partition_only = !boosted;
+ sf->mv.auto_mv_step_size = 1;
+ sf->mv.use_downsampled_sad = 1;
sf->prune_ref_frame_for_rect_partitions = 1;
- sf->rd_ml_partition.var_pruning = 1;
+ sf->temporal_filter_search_method = NSTEP;
+ sf->tx_size_search_breakout = 1;
+ sf->use_square_partition_only = !boosted;
+ sf->early_term_interp_search_plane_rd = 1;
+ sf->cb_pred_filter_search = 1;
+ sf->trellis_opt_tx_rd.method = sf->optimize_coefficients
+ ? ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE
+ : DISABLE_TRELLIS_OPT;
+ sf->trellis_opt_tx_rd.thresh = boosted ? 4.0 : 3.0;
+
+ sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ sf->comp_inter_joint_search_iter_level = 1;
+
+ // Reference masking is not supported in dynamic scaling mode.
+ sf->reference_masking = oxcf->resize_mode != RESIZE_DYNAMIC;
+ sf->rd_ml_partition.var_pruning = 1;
sf->rd_ml_partition.prune_rect_thresh[0] = -1;
sf->rd_ml_partition.prune_rect_thresh[1] = 350;
sf->rd_ml_partition.prune_rect_thresh[2] = 325;
@@ -238,7 +270,6 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
}
if (speed >= 1) {
- sf->temporal_filter_search_method = NSTEP;
sf->rd_ml_partition.var_pruning = !boosted;
sf->rd_ml_partition.prune_rect_thresh[1] = 225;
sf->rd_ml_partition.prune_rect_thresh[2] = 225;
@@ -258,19 +289,18 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
sf->allow_txfm_domain_distortion = 1;
sf->tx_domain_thresh = tx_dom_thresholds[(speed < 6) ? speed : 5];
- sf->allow_quant_coeff_opt = sf->optimize_coefficients;
- sf->quant_opt_thresh = qopt_thresholds[(speed < 6) ? speed : 5];
+ sf->trellis_opt_tx_rd.method = sf->optimize_coefficients
+ ? ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR
+ : DISABLE_TRELLIS_OPT;
+ sf->trellis_opt_tx_rd.thresh = qopt_thresholds[(speed < 6) ? speed : 5];
sf->less_rectangular_check = 1;
sf->use_rd_breakout = 1;
sf->adaptive_motion_search = 1;
- sf->mv.auto_mv_step_size = 1;
sf->adaptive_rd_thresh = 2;
sf->mv.subpel_search_level = 1;
if (cpi->oxcf.content != VP9E_CONTENT_FILM) sf->mode_skip_start = 10;
- sf->adaptive_pred_interp_filter = 1;
sf->allow_acl = 0;
- sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
if (cpi->oxcf.content != VP9E_CONTENT_FILM) {
sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
@@ -296,18 +326,14 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
sf->tx_size_search_method =
frame_is_boosted(cpi) ? USE_FULL_RD : USE_LARGESTALL;
- // Reference masking is not supported in dynamic scaling mode.
- sf->reference_masking = oxcf->resize_mode != RESIZE_DYNAMIC ? 1 : 0;
-
sf->mode_search_skip_flags =
(cm->frame_type == KEY_FRAME)
? 0
: FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER |
FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR;
sf->disable_filter_search_var_thresh = 100;
- sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+ sf->comp_inter_joint_search_iter_level = 2;
sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
- sf->recode_tolerance_low = 15;
sf->recode_tolerance_high = 45;
sf->enhanced_full_pixel_motion_search = 0;
sf->prune_ref_frame_for_rect_partitions = 0;
@@ -337,14 +363,13 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
sf->adaptive_pred_interp_filter = 0;
sf->adaptive_mode_search = 1;
sf->cb_partition_search = !boosted;
- sf->cb_pred_filter_search = 1;
+ sf->cb_pred_filter_search = 2;
sf->alt_ref_search_fp = 1;
sf->recode_loop = ALLOW_RECODE_KFMAXBW;
sf->adaptive_rd_thresh = 3;
sf->mode_skip_start = 6;
sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC;
- sf->adaptive_interp_filter_search = 1;
if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) {
for (i = 0; i < MAX_MESH_STEP; ++i) {
@@ -373,7 +398,6 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
}
if (speed >= 5) {
- int i;
sf->optimize_coefficients = 0;
sf->mv.search_method = HEX;
sf->disable_filter_search_var_thresh = 500;
@@ -461,8 +485,8 @@ static void set_rt_speed_feature_framesize_independent(
if (speed >= 1) {
sf->allow_txfm_domain_distortion = 1;
sf->tx_domain_thresh = 0.0;
- sf->allow_quant_coeff_opt = 0;
- sf->quant_opt_thresh = 0.0;
+ sf->trellis_opt_tx_rd.method = DISABLE_TRELLIS_OPT;
+ sf->trellis_opt_tx_rd.thresh = 0.0;
sf->use_square_partition_only = !frame_is_intra_only(cm);
sf->less_rectangular_check = 1;
sf->tx_size_search_method =
@@ -507,7 +531,7 @@ static void set_rt_speed_feature_framesize_independent(
}
sf->disable_filter_search_var_thresh = 50;
- sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+ sf->comp_inter_joint_search_iter_level = 2;
sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
sf->lf_motion_threshold = LOW_MOTION_THRESHOLD;
sf->adjust_partitioning_from_last_frame = 1;
@@ -631,7 +655,7 @@ static void set_rt_speed_feature_framesize_independent(
sf->use_altref_onepass = 1;
sf->use_compound_nonrd_pickmode = 1;
}
- if (cm->width * cm->height > 1280 * 720) sf->cb_pred_filter_search = 1;
+ if (cm->width * cm->height > 1280 * 720) sf->cb_pred_filter_search = 2;
if (!cpi->external_resize) sf->use_source_sad = 1;
}
@@ -652,7 +676,7 @@ static void set_rt_speed_feature_framesize_independent(
if (cpi->content_state_sb_fd == NULL &&
(!cpi->use_svc ||
svc->spatial_layer_id == svc->number_spatial_layers - 1)) {
- CHECK_MEM_ERROR(cm, cpi->content_state_sb_fd,
+ CHECK_MEM_ERROR(&cm->error, cpi->content_state_sb_fd,
(uint8_t *)vpx_calloc(
(cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
sizeof(uint8_t)));
@@ -721,7 +745,7 @@ static void set_rt_speed_feature_framesize_independent(
if (cpi->use_svc && svc->use_gf_temporal_ref_current_layer &&
svc->temporal_layer_id > 0)
cpi->ref_frame_flags &= (~VP9_GOLD_FLAG);
- if (cm->width * cm->height > 640 * 480) sf->cb_pred_filter_search = 1;
+ if (cm->width * cm->height > 640 * 480) sf->cb_pred_filter_search = 2;
}
if (speed >= 8) {
@@ -765,7 +789,7 @@ static void set_rt_speed_feature_framesize_independent(
}
sf->limit_newmv_early_exit = 0;
sf->use_simple_block_yrd = 1;
- if (cm->width * cm->height > 352 * 288) sf->cb_pred_filter_search = 1;
+ if (cm->width * cm->height > 352 * 288) sf->cb_pred_filter_search = 2;
}
if (speed >= 9) {
@@ -775,7 +799,7 @@ static void set_rt_speed_feature_framesize_independent(
for (i = 0; i < BLOCK_SIZES; ++i)
sf->intra_y_mode_bsize_mask[i] = INTRA_DC;
}
- sf->cb_pred_filter_search = 1;
+ sf->cb_pred_filter_search = 2;
sf->mv.enable_adaptive_subpel_force_stop = 1;
sf->mv.adapt_subpel_force_stop.mv_thresh = 1;
sf->mv.adapt_subpel_force_stop.force_stop_below = QUARTER_PEL;
@@ -808,13 +832,13 @@ static void set_rt_speed_feature_framesize_independent(
}
if (cpi->count_arf_frame_usage == NULL) {
CHECK_MEM_ERROR(
- cm, cpi->count_arf_frame_usage,
+ &cm->error, cpi->count_arf_frame_usage,
(uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
sizeof(*cpi->count_arf_frame_usage)));
}
if (cpi->count_lastgolden_frame_usage == NULL)
CHECK_MEM_ERROR(
- cm, cpi->count_lastgolden_frame_usage,
+ &cm->error, cpi->count_lastgolden_frame_usage,
(uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
sizeof(*cpi->count_lastgolden_frame_usage)));
}
@@ -835,6 +859,11 @@ static void set_rt_speed_feature_framesize_independent(
// off for now.
if (speed <= 3 && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
cpi->oxcf.aq_mode = 0;
+  // For all speeds in rt mode: if the deadline mode changed (was good/best
+  // quality on the previous frame and is now realtime), set nonrd_keyframe to 1
+  // to avoid entering rd pickmode, which causes issues such as b/310663186.
+ if (cpi->oxcf.mode != cpi->deadline_mode_previous_frame)
+ sf->nonrd_keyframe = 1;
}
void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi, int speed) {
@@ -904,14 +933,17 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) {
sf->coeff_prob_appx_step = 1;
sf->mv.auto_mv_step_size = 0;
sf->mv.fullpel_search_step_param = 6;
- sf->comp_inter_joint_search_thresh = BLOCK_4X4;
+ sf->mv.use_downsampled_sad = 0;
+ sf->comp_inter_joint_search_iter_level = 0;
sf->tx_size_search_method = USE_FULL_RD;
sf->use_lp32x32fdct = 0;
sf->adaptive_motion_search = 0;
sf->enhanced_full_pixel_motion_search = 1;
sf->adaptive_pred_interp_filter = 0;
sf->adaptive_mode_search = 0;
+ sf->prune_single_mode_based_on_mv_diff_mode_rate = 0;
sf->cb_pred_filter_search = 0;
+ sf->early_term_interp_search_plane_rd = 0;
sf->cb_partition_search = 0;
sf->motion_field_mode_search = 0;
sf->alt_ref_search_fp = 0;
@@ -936,8 +968,9 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) {
sf->adaptive_interp_filter_search = 0;
sf->allow_txfm_domain_distortion = 0;
sf->tx_domain_thresh = 99.0;
- sf->allow_quant_coeff_opt = sf->optimize_coefficients;
- sf->quant_opt_thresh = 99.0;
+ sf->trellis_opt_tx_rd.method =
+ sf->optimize_coefficients ? ENABLE_TRELLIS_OPT : DISABLE_TRELLIS_OPT;
+ sf->trellis_opt_tx_rd.thresh = 99.0;
sf->allow_acl = 1;
sf->enable_tpl_model = oxcf->enable_tpl_model;
sf->prune_ref_frame_for_rect_partitions = 0;
@@ -991,10 +1024,14 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) {
sf->exhaustive_searches_thresh =
(cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 20)
: INT_MAX;
- if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) {
+ {
+ const int mesh_density_level =
+ (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? 0 : 1;
for (i = 0; i < MAX_MESH_STEP; ++i) {
- sf->mesh_patterns[i].range = best_quality_mesh_pattern[i].range;
- sf->mesh_patterns[i].interval = best_quality_mesh_pattern[i].interval;
+ sf->mesh_patterns[i].range =
+ best_quality_mesh_pattern[mesh_density_level][i].range;
+ sf->mesh_patterns[i].interval =
+ best_quality_mesh_pattern[mesh_density_level][i].interval;
}
}
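
Illustrative sketch of what the two mesh density levels above mean in practice: each MESH_PATTERN stage samples a +/-range window with candidates spaced interval apart, so the second (non-animation) level visits far fewer positions in the coarse stages. The arithmetic is approximate and the names are invented.

#include <stdio.h>

typedef struct { int range, interval; } SketchMeshPattern;

static const SketchMeshPattern kSketchMesh[2][4] = {
  { { 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 } },  // FC_GRAPHICS_ANIMATION
  { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },  // other content (sparser)
};

int main(void) {
  int level, i;
  for (level = 0; level < 2; ++level) {
    long total = 0;
    for (i = 0; i < 4; ++i) {
      const int side =
          2 * kSketchMesh[level][i].range / kSketchMesh[level][i].interval + 1;
      total += (long)side * side;  // rough grid-point count for this stage
    }
    printf("density level %d: ~%ld candidate positions\n", level, total);
  }
  return 0;
}
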
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index c2ae970b7..941de639a 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -210,6 +210,10 @@ typedef struct MV_SPEED_FEATURES {
// This variable sets the step_param used in full pel motion search.
int fullpel_search_step_param;
+
+ // Whether to downsample the rows in sad calculation during motion search.
+ // This is only active when there are at least 8 rows.
+ int use_downsampled_sad;
} MV_SPEED_FEATURES;
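
A minimal sketch of what row-downsampled SAD could look like (an assumption about the behaviour behind use_downsampled_sad, not the library's optimized SAD kernels): sum every other row and double the result to keep the magnitude comparable.

#include <stdint.h>
#include <stdlib.h>

static unsigned int sketch_sad_downsampled(const uint8_t *src, int src_stride,
                                           const uint8_t *ref, int ref_stride,
                                           int width, int height) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < height; r += 2) {  // visit only even rows
    for (c = 0; c < width; ++c) sad += (unsigned int)abs(src[c] - ref[c]);
    src += 2 * src_stride;
    ref += 2 * ref_stride;
  }
  return 2 * sad;  // compensate for the skipped rows
}
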
typedef struct PARTITION_SEARCH_BREAKOUT_THR {
@@ -246,6 +250,24 @@ typedef enum {
USE_8_TAPS_SHARP,
} SUBPEL_SEARCH_TYPE;
+typedef enum {
+ // Disable trellis coefficient optimization
+ DISABLE_TRELLIS_OPT,
+ // Enable trellis coefficient optimization
+ ENABLE_TRELLIS_OPT,
+ // Enable trellis coefficient optimization based on source variance of the
+ // prediction block during transform RD
+ ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR,
+ // Enable trellis coefficient optimization based on residual mse of the
+ // transform block during transform RD
+ ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE,
+} ENABLE_TRELLIS_OPT_METHOD;
+
+typedef struct TRELLIS_OPT_CONTROL {
+ ENABLE_TRELLIS_OPT_METHOD method;
+ double thresh;
+} TRELLIS_OPT_CONTROL;
+
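Editorial sketch of how a method/threshold pair like TRELLIS_OPT_CONTROL might gate trellis optimization during transform RD. The comparison directions below are illustrative assumptions, not the encoder's actual thresholds, and the function name is invented.

static int sketch_enable_trellis_opt(int method, double thresh, double src_var,
                                     double residual_mse) {
  switch (method) {
    case 0: return 0;                       // DISABLE_TRELLIS_OPT
    case 1: return 1;                       // ENABLE_TRELLIS_OPT: always on
    case 2: return src_var <= thresh;       // gate on source variance
                                            // (direction is illustrative only)
    case 3: return residual_mse <= thresh;  // gate on residual MSE (illustrative)
    default: return 0;
  }
}
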
typedef struct SPEED_FEATURES {
MV_SPEED_FEATURES mv;
@@ -264,11 +286,20 @@ typedef struct SPEED_FEATURES {
// adds overhead.
int static_segmentation;
- // If 1 we iterate finding a best reference for 2 ref frames together - via
- // a log search that iterates 4 times (check around mv for last for best
- // error of combined predictor then check around mv for alt). If 0 we
- // we just use the best motion vector found for each frame by itself.
- BLOCK_SIZE comp_inter_joint_search_thresh;
+ // The best compound predictor is found using an iterative log search process
+ // that searches for best ref0 mv using error of combined predictor and then
+ // searches for best ref1 mv. This sf determines the number of iterations of
+ // this process based on block size. The sf becomes more aggressive from level
+ // 0 to 2. The following table indicates the number of iterations w.r.t bsize:
+ // -----------------------------------------------
+ // |sf (level)|bsize < 8X8| [8X8, 16X16] | > 16X16 |
+ // | 0 | 4 | 4 | 4 |
+ // | 1 | 0 | 2 | 4 |
+ // | 2 | 0 | 0 | 0 |
+ // -----------------------------------------------
+ // Here, 0 iterations indicate using the best single motion vector selected
+ // for each ref frame without any iterative refinement.
+ int comp_inter_joint_search_iter_level;
// This variable is used to cap the maximum number of times we skip testing a
// mode to be evaluated. A high value means we will be faster.
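
A hypothetical lookup consistent with the iteration table above; the real get_joint_search_iters() is defined elsewhere in the encoder and may differ in detail.

typedef enum {
  SK_BSIZE_BELOW_8X8,
  SK_BSIZE_8X8_TO_16X16,
  SK_BSIZE_ABOVE_16X16
} SketchBsizeGroup;

static int sketch_joint_search_iters(int level, SketchBsizeGroup group) {
  if (level == 0) return 4;  // always refine jointly
  if (level == 2) return 0;  // never refine; reuse the single-ref MVs
  // level 1: skip sub-8x8 blocks, halve the work for mid-size blocks.
  if (group == SK_BSIZE_BELOW_8X8) return 0;
  if (group == SK_BSIZE_8X8_TO_16X16) return 2;
  return 4;
}
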
@@ -292,8 +323,8 @@ typedef struct SPEED_FEATURES {
int coeff_prob_appx_step;
// Enable uniform quantizer followed by trellis coefficient optimization
- int allow_quant_coeff_opt;
- double quant_opt_thresh;
+ // during transform RD
+ TRELLIS_OPT_CONTROL trellis_opt_tx_rd;
// Enable asymptotic closed-loop encoding decision for key frame and
// alternate reference frames.
@@ -399,9 +430,21 @@ typedef struct SPEED_FEATURES {
// Adaptive prediction mode search
int adaptive_mode_search;
- // Chessboard pattern prediction filter type search
+  // Prune NEARMV and ZEROMV single reference modes based on motion vector
+ // difference and mode rate
+ int prune_single_mode_based_on_mv_diff_mode_rate;
+
+ // Chessboard pattern prediction for interp filter. Aggressiveness increases
+ // with levels.
+ // 0: disable
+ // 1: cb pattern in eval when filter is not switchable
+ // 2: cb pattern prediction for filter search
int cb_pred_filter_search;
+ // This variable enables an early termination of interpolation filter eval
+ // based on the current rd cost after processing each plane
+ int early_term_interp_search_plane_rd;
+
int cb_partition_search;
int motion_field_mode_search;
@@ -600,7 +643,7 @@ typedef struct SPEED_FEATURES {
// Use machine learning based partition search.
int nonrd_use_ml_partition;
- // Multiplier for base thresold for variance partitioning.
+ // Multiplier for base threshold for variance partitioning.
int variance_part_thresh_mult;
// Force subpel motion filter to always use SMOOTH_FILTER.
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index 7e9435fb5..fff6d25de 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -107,7 +107,6 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
int layer = LAYER_IDS_TO_IDX(sl, tl, oxcf->ts_number_layers);
LAYER_CONTEXT *const lc = &svc->layer_context[layer];
RATE_CONTROL *const lrc = &lc->rc;
- int i;
lc->current_video_frame_in_layer = 0;
lc->layer_size = 0;
lc->frames_from_key_frame = 0;
@@ -164,17 +163,17 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
lc->actual_num_seg1_blocks = 0;
lc->actual_num_seg2_blocks = 0;
lc->counter_encode_maxq_scene_change = 0;
- CHECK_MEM_ERROR(cm, lc->map,
+ CHECK_MEM_ERROR(&cm->error, lc->map,
vpx_malloc(mi_rows * mi_cols * sizeof(*lc->map)));
memset(lc->map, 0, mi_rows * mi_cols);
last_coded_q_map_size =
mi_rows * mi_cols * sizeof(*lc->last_coded_q_map);
- CHECK_MEM_ERROR(cm, lc->last_coded_q_map,
+ CHECK_MEM_ERROR(&cm->error, lc->last_coded_q_map,
vpx_malloc(last_coded_q_map_size));
assert(MAXQ <= 255);
memset(lc->last_coded_q_map, MAXQ, last_coded_q_map_size);
consec_zero_mv_size = mi_rows * mi_cols * sizeof(*lc->consec_zero_mv);
- CHECK_MEM_ERROR(cm, lc->consec_zero_mv,
+ CHECK_MEM_ERROR(&cm->error, lc->consec_zero_mv,
vpx_malloc(consec_zero_mv_size));
memset(lc->consec_zero_mv, 0, consec_zero_mv_size);
}
@@ -220,18 +219,21 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
RATE_CONTROL *const lrc = &lc->rc;
lc->spatial_layer_target_bandwidth = spatial_layer_target;
- bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
+ if (target_bandwidth != 0) {
+ bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
+ }
lrc->starting_buffer_level =
- (int64_t)(rc->starting_buffer_level * bitrate_alloc);
+ (int64_t)(rc->starting_buffer_level * bitrate_alloc + 0.5);
lrc->optimal_buffer_level =
- (int64_t)(rc->optimal_buffer_level * bitrate_alloc);
+ (int64_t)(rc->optimal_buffer_level * bitrate_alloc + 0.5);
lrc->maximum_buffer_size =
- (int64_t)(rc->maximum_buffer_size * bitrate_alloc);
+ (int64_t)(rc->maximum_buffer_size * bitrate_alloc + 0.5);
lrc->bits_off_target =
VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size);
lrc->buffer_level = VPXMIN(lrc->buffer_level, lrc->maximum_buffer_size);
lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[tl];
- lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
+ lrc->avg_frame_bandwidth =
+ (int)VPXMIN(lc->target_bandwidth / lc->framerate, INT_MAX);
lrc->max_frame_bandwidth = rc->max_frame_bandwidth;
lrc->worst_quality = rc->worst_quality;
lrc->best_quality = rc->best_quality;
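
Standalone illustration of why the VPXMIN(..., INT_MAX) clamp in this hunk matters: the per-layer target bandwidth divided by a small frame rate can exceed INT_MAX, and converting an out-of-range floating-point value to int is undefined behaviour in C. (The + 0.5 additions above similarly switch the buffer-level scaling from truncation to round-to-nearest.) The helper name is invented.

#include <limits.h>
#include <stdio.h>

static int sketch_avg_frame_bandwidth(long long target_bandwidth,
                                      double framerate) {
  const double avg = (double)target_bandwidth / framerate;
  // Clamp before the int conversion, mirroring the VPXMIN in the patch.
  return (int)(avg < (double)INT_MAX ? avg : (double)INT_MAX);
}

int main(void) {
  printf("%d\n", sketch_avg_frame_bandwidth(4000000, 30.0));   // 133333
  printf("%d\n", sketch_avg_frame_bandwidth(1LL << 60, 0.5));  // clamped to INT_MAX
  return 0;
}
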
@@ -252,7 +254,9 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
lc->target_bandwidth = oxcf->layer_target_bitrate[layer];
- bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
+ if (target_bandwidth != 0) {
+ bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
+ }
// Update buffer-related quantities.
lrc->starting_buffer_level =
(int64_t)(rc->starting_buffer_level * bitrate_alloc);
@@ -269,7 +273,8 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
} else {
lc->framerate = cpi->framerate;
}
- lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
+ lrc->avg_frame_bandwidth =
+ (int)VPXMIN(lc->target_bandwidth / lc->framerate, INT_MAX);
lrc->max_frame_bandwidth = rc->max_frame_bandwidth;
// Update qp-related quantities.
lrc->worst_quality = rc->worst_quality;
@@ -311,7 +316,8 @@ void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) {
const int tl = svc->temporal_layer_id;
lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[tl];
- lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
+ lrc->avg_frame_bandwidth =
+ (int)VPXMIN(lc->target_bandwidth / lc->framerate, INT_MAX);
lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth;
// Update the average layer frame size (non-cumulative per-frame-bw).
if (tl == 0) {
@@ -333,7 +339,8 @@ void vp9_update_spatial_layer_framerate(VP9_COMP *const cpi, double framerate) {
RATE_CONTROL *const lrc = &lc->rc;
lc->framerate = framerate;
- lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
+ lrc->avg_frame_bandwidth =
+ (int)VPXMIN(lc->target_bandwidth / lc->framerate, INT_MAX);
lrc->min_frame_bandwidth =
(int)(lrc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100);
lrc->max_frame_bandwidth = (int)(((int64_t)lrc->avg_frame_bandwidth *
@@ -389,6 +396,8 @@ void vp9_save_layer_context(VP9_COMP *const cpi) {
lc->twopass = cpi->twopass;
lc->target_bandwidth = (int)oxcf->target_bandwidth;
lc->alt_ref_source = cpi->alt_ref_source;
+ lc->frame_qp = cpi->common.base_qindex;
+ lc->MBs = cpi->common.MBs;
// For spatial-svc, allow cyclic-refresh to be applied on the spatial layers,
// for the base temporal layer.
@@ -408,6 +417,9 @@ void vp9_save_layer_context(VP9_COMP *const cpi) {
lc->actual_num_seg1_blocks = cr->actual_num_seg1_blocks;
lc->actual_num_seg2_blocks = cr->actual_num_seg2_blocks;
lc->counter_encode_maxq_scene_change = cr->counter_encode_maxq_scene_change;
+ lc->qindex_delta[0] = cr->qindex_delta[0];
+ lc->qindex_delta[1] = cr->qindex_delta[1];
+ lc->qindex_delta[2] = cr->qindex_delta[2];
}
}
@@ -790,9 +802,9 @@ int vp9_one_pass_svc_start_layer(VP9_COMP *const cpi) {
for (sl = svc->number_spatial_layers - 1;
sl >= svc->first_spatial_layer_to_encode; sl--) {
int layer = sl * svc->number_temporal_layers + svc->temporal_layer_id;
- LAYER_CONTEXT *const lc = &svc->layer_context[layer];
- cpi->rc = lc->rc;
- cpi->oxcf.target_bandwidth = lc->target_bandwidth;
+ LAYER_CONTEXT *const sl_lc = &svc->layer_context[layer];
+ cpi->rc = sl_lc->rc;
+ cpi->oxcf.target_bandwidth = sl_lc->target_bandwidth;
if (vp9_test_drop(cpi)) {
int sl2;
// Set flag to force drop in encoding for this mode.
@@ -1041,17 +1053,17 @@ void vp9_svc_check_reset_layer_rc_flag(VP9_COMP *const cpi) {
int sl, tl;
for (sl = 0; sl < svc->number_spatial_layers; ++sl) {
// Check for reset based on avg_frame_bandwidth for spatial layer sl.
- int layer = LAYER_IDS_TO_IDX(sl, svc->number_temporal_layers - 1,
- svc->number_temporal_layers);
- LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ const int spatial_layer_idx = LAYER_IDS_TO_IDX(
+ sl, svc->number_temporal_layers - 1, svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[spatial_layer_idx];
RATE_CONTROL *lrc = &lc->rc;
if (lrc->avg_frame_bandwidth > (3 * lrc->last_avg_frame_bandwidth >> 1) ||
lrc->avg_frame_bandwidth < (lrc->last_avg_frame_bandwidth >> 1)) {
// Reset for all temporal layers with spatial layer sl.
for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
- int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
- LAYER_CONTEXT *lc = &svc->layer_context[layer];
- RATE_CONTROL *lrc = &lc->rc;
+ int temporal_layer_idx =
+ LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+ lrc = &svc->layer_context[temporal_layer_idx].rc;
lrc->rc_1_frame = 0;
lrc->rc_2_frame = 0;
lrc->bits_off_target = lrc->optimal_buffer_level;
@@ -1137,7 +1149,7 @@ void vp9_svc_constrain_inter_layer_pred(VP9_COMP *const cpi) {
void vp9_svc_assert_constraints_pattern(VP9_COMP *const cpi) {
SVC *const svc = &cpi->svc;
   // For fixed/non-flexible mode, the following constraints are expected,
- // when inter-layer prediciton is on (default).
+ // when inter-layer prediction is on (default).
if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS &&
svc->disable_inter_layer_pred == INTER_LAYER_PRED_ON &&
svc->framedrop_mode != LAYER_DROP) {
@@ -1338,3 +1350,27 @@ void vp9_svc_adjust_avg_frame_qindex(VP9_COMP *const cpi) {
}
}
}
+
+// SVC: skip encoding of enhancement layer if the layer target bandwidth = 0.
+// No need to set svc.skip_enhancement_layer if whole superframe will be
+// dropped.
+int vp9_svc_check_skip_enhancement_layer(VP9_COMP *const cpi) {
+ if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 &&
+ cpi->oxcf.target_bandwidth == 0 &&
+ !(cpi->svc.framedrop_mode != LAYER_DROP &&
+ (cpi->svc.framedrop_mode != CONSTRAINED_FROM_ABOVE_DROP ||
+ cpi->svc
+ .force_drop_constrained_from_above[cpi->svc.number_spatial_layers -
+ 1]) &&
+ cpi->svc.drop_spatial_layer[0])) {
+ cpi->svc.skip_enhancement_layer = 1;
+ vp9_rc_postencode_update_drop_frame(cpi);
+ cpi->ext_refresh_frame_flags_pending = 0;
+ cpi->last_frame_dropped = 1;
+ cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 1;
+ cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id] = 1;
+ vp9_inc_frame_in_layer(cpi);
+ return 1;
+ }
+ return 0;
+}
diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h
index c7328cf57..388a02789 100644
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -70,8 +70,11 @@ typedef struct {
int actual_num_seg1_blocks;
int actual_num_seg2_blocks;
int counter_encode_maxq_scene_change;
+ int qindex_delta[3];
uint8_t speed;
int loopfilter_ctrl;
+ int frame_qp;
+ int MBs;
} LAYER_CONTEXT;
typedef struct SVC {
@@ -278,6 +281,8 @@ void vp9_svc_update_ref_frame(struct VP9_COMP *const cpi);
void vp9_svc_adjust_frame_rate(struct VP9_COMP *const cpi);
void vp9_svc_adjust_avg_frame_qindex(struct VP9_COMP *const cpi);
+
+int vp9_svc_check_skip_enhancement_layer(struct VP9_COMP *const cpi);
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 8af30c42a..986553a4a 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -450,8 +450,6 @@ void vp9_highbd_apply_temporal_filter_c(
// Apply the filter to luma
for (row = 0; row < (int)block_height; row++) {
for (col = 0; col < (int)block_width; col++) {
- const int uv_row = row >> ss_y;
- const int uv_col = col >> ss_x;
const int filter_weight = get_filter_weight(
row, col, block_height, block_width, blk_fw, use_32x32);
@@ -476,6 +474,8 @@ void vp9_highbd_apply_temporal_filter_c(
// Sum the corresponding uv pixels to the current y modifier
// Note we are rounding down instead of rounding to the nearest pixel.
+ uv_row = row >> ss_y;
+ uv_col = col >> ss_x;
y_mod += u_diff_sse[uv_row * uv_diff_stride + uv_col];
y_mod += v_diff_sse[uv_row * uv_diff_stride + uv_col];
diff --git a/vp9/encoder/x86/temporal_filter_constants.h b/vp9/encoder/vp9_temporal_filter_constants.h
index 7dcedda19..8776dfc06 100644
--- a/vp9/encoder/x86/temporal_filter_constants.h
+++ b/vp9/encoder/vp9_temporal_filter_constants.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef VPX_VP9_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_
-#define VPX_VP9_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_
+#ifndef VPX_VP9_ENCODER_TEMPORAL_FILTER_CONSTANTS_H_
+#define VPX_VP9_ENCODER_TEMPORAL_FILTER_CONSTANTS_H_
#include "./vpx_config.h"
// Division using multiplication and shifting. The C implementation does:
@@ -407,4 +407,4 @@ static const uint32_t
#define DIST_STRIDE ((BW) + 2)
-#endif // VPX_VP9_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_
+#endif // VPX_VP9_ENCODER_TEMPORAL_FILTER_CONSTANTS_H_
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 814d769be..6c6c04493 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -364,7 +364,7 @@ static void tokenize_b(int plane, int block, int row, int col,
const PLANE_TYPE type = get_plane_type(plane);
const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
const int16_t *scan, *nb;
- const scan_order *so;
+ const ScanOrder *so;
const int ref = is_inter_block(mi);
unsigned int(*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
td->rd_counts.coef_counts[tx_size][type][ref];
diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c
new file mode 100644
index 000000000..b8910370e
--- /dev/null
+++ b/vp9/encoder/vp9_tpl_model.c
@@ -0,0 +1,1541 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+
+#include "./vpx_dsp_rtcd.h"
+#if CONFIG_NON_GREEDY_MV
+#include "vp9/common/vp9_mvref_common.h"
+#endif
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_tpl_model.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx/vpx_codec.h"
+
+static int init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture,
+ const GF_GROUP *gf_group, int *tpl_group_frames) {
+ VP9_COMMON *cm = &cpi->common;
+ int frame_idx = 0;
+ int i;
+ int gld_index = -1;
+ int alt_index = -1;
+ int lst_index = -1;
+ int arf_index_stack[MAX_ARF_LAYERS];
+ int arf_stack_size = 0;
+ int extend_frame_count = 0;
+ int pframe_qindex = cpi->tpl_stats[2].base_qindex;
+ int frame_gop_offset = 0;
+
+ RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs;
+ int8_t recon_frame_index[REFS_PER_FRAME + MAX_ARF_LAYERS];
+
+ memset(recon_frame_index, -1, sizeof(recon_frame_index));
+ stack_init(arf_index_stack, MAX_ARF_LAYERS);
+
+ for (i = 0; i < FRAME_BUFFERS; ++i) {
+ if (frame_bufs[i].ref_count == 0) {
+ alloc_frame_mvs(cm, i);
+ if (vpx_realloc_frame_buffer(&frame_bufs[i].buf, cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL))
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+
+ recon_frame_index[frame_idx] = i;
+ ++frame_idx;
+
+ if (frame_idx >= REFS_PER_FRAME + cpi->oxcf.enable_auto_arf) break;
+ }
+ }
+
+ for (i = 0; i < REFS_PER_FRAME + 1; ++i) {
+ assert(recon_frame_index[i] >= 0);
+ cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf;
+ }
+
+ *tpl_group_frames = 0;
+
+ // Initialize Golden reference frame.
+ gf_picture[0].frame = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+ for (i = 0; i < 3; ++i) gf_picture[0].ref_frame[i] = -1;
+ gf_picture[0].update_type = gf_group->update_type[0];
+ gld_index = 0;
+ ++*tpl_group_frames;
+
+ // Initialize base layer ARF frame
+ gf_picture[1].frame = cpi->Source;
+ gf_picture[1].ref_frame[0] = gld_index;
+ gf_picture[1].ref_frame[1] = lst_index;
+ gf_picture[1].ref_frame[2] = alt_index;
+ gf_picture[1].update_type = gf_group->update_type[1];
+ alt_index = 1;
+ ++*tpl_group_frames;
+
+ // Initialize P frames
+ for (frame_idx = 2; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) {
+ struct lookahead_entry *buf;
+ frame_gop_offset = gf_group->frame_gop_index[frame_idx];
+ buf = vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1);
+
+ if (buf == NULL) break;
+
+ gf_picture[frame_idx].frame = &buf->img;
+ gf_picture[frame_idx].ref_frame[0] = gld_index;
+ gf_picture[frame_idx].ref_frame[1] = lst_index;
+ gf_picture[frame_idx].ref_frame[2] = alt_index;
+ gf_picture[frame_idx].update_type = gf_group->update_type[frame_idx];
+
+ switch (gf_group->update_type[frame_idx]) {
+ case ARF_UPDATE:
+ stack_push(arf_index_stack, alt_index, arf_stack_size);
+ ++arf_stack_size;
+ alt_index = frame_idx;
+ break;
+ case LF_UPDATE: lst_index = frame_idx; break;
+ case OVERLAY_UPDATE:
+ gld_index = frame_idx;
+ alt_index = stack_pop(arf_index_stack, arf_stack_size);
+ --arf_stack_size;
+ break;
+ case USE_BUF_FRAME:
+ lst_index = alt_index;
+ alt_index = stack_pop(arf_index_stack, arf_stack_size);
+ --arf_stack_size;
+ break;
+ default: break;
+ }
+
+ ++*tpl_group_frames;
+
+    // The length of the group of pictures is baseline_gf_interval, plus the
+ // beginning golden frame from last GOP, plus the last overlay frame in
+ // the same GOP.
+ if (frame_idx == gf_group->gf_group_size) break;
+ }
+
+ alt_index = -1;
+ ++frame_idx;
+ ++frame_gop_offset;
+
+ // Extend two frames outside the current gf group.
+ for (; frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; ++frame_idx) {
+ struct lookahead_entry *buf =
+ vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1);
+
+ if (buf == NULL) break;
+
+ cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex;
+
+ gf_picture[frame_idx].frame = &buf->img;
+ gf_picture[frame_idx].ref_frame[0] = gld_index;
+ gf_picture[frame_idx].ref_frame[1] = lst_index;
+ gf_picture[frame_idx].ref_frame[2] = alt_index;
+ gf_picture[frame_idx].update_type = LF_UPDATE;
+ lst_index = frame_idx;
+ ++*tpl_group_frames;
+ ++extend_frame_count;
+ ++frame_gop_offset;
+ }
+
+ return extend_frame_count;
+}
+
+static void init_tpl_stats(VP9_COMP *cpi) {
+ int frame_idx;
+ for (frame_idx = 0; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) {
+ TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
+ memset(tpl_frame->tpl_stats_ptr, 0,
+ tpl_frame->height * tpl_frame->width *
+ sizeof(*tpl_frame->tpl_stats_ptr));
+ tpl_frame->is_valid = 0;
+ }
+}
+
+static void free_tpl_frame_stats_list(VpxTplGopStats *tpl_gop_stats) {
+ int frame_idx;
+ for (frame_idx = 0; frame_idx < tpl_gop_stats->size; ++frame_idx) {
+ vpx_free(tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list);
+ }
+ vpx_free(tpl_gop_stats->frame_stats_list);
+}
+
+static void init_tpl_stats_before_propagation(
+ struct vpx_internal_error_info *error_info, VpxTplGopStats *tpl_gop_stats,
+ TplDepFrame *tpl_stats, int tpl_gop_frames, int frame_width,
+ int frame_height) {
+ int frame_idx;
+ free_tpl_frame_stats_list(tpl_gop_stats);
+ CHECK_MEM_ERROR(
+ error_info, tpl_gop_stats->frame_stats_list,
+ vpx_calloc(tpl_gop_frames, sizeof(*tpl_gop_stats->frame_stats_list)));
+ tpl_gop_stats->size = tpl_gop_frames;
+ for (frame_idx = 0; frame_idx < tpl_gop_frames; ++frame_idx) {
+ const int mi_rows = tpl_stats[frame_idx].height;
+ const int mi_cols = tpl_stats[frame_idx].width;
+ CHECK_MEM_ERROR(
+ error_info, tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list,
+ vpx_calloc(
+ mi_rows * mi_cols,
+ sizeof(
+ *tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list)));
+ tpl_gop_stats->frame_stats_list[frame_idx].num_blocks = mi_rows * mi_cols;
+ tpl_gop_stats->frame_stats_list[frame_idx].frame_width = frame_width;
+ tpl_gop_stats->frame_stats_list[frame_idx].frame_height = frame_height;
+ }
+}
+
+#if CONFIG_NON_GREEDY_MV
+static uint32_t full_pixel_motion_search(VP9_COMP *cpi, ThreadData *td,
+ MotionField *motion_field,
+ int frame_idx, uint8_t *cur_frame_buf,
+ uint8_t *ref_frame_buf, int stride,
+ BLOCK_SIZE bsize, int mi_row,
+ int mi_col, MV *mv) {
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+ int step_param;
+ uint32_t bestsme = UINT_MAX;
+ const MvLimits tmp_mv_limits = x->mv_limits;
+ // lambda is used to adjust the importance of motion vector consistency.
+ // TODO(angiebird): Figure out lambda's proper value.
+ const int lambda = cpi->tpl_stats[frame_idx].lambda;
+ int_mv nb_full_mvs[NB_MVS_NUM];
+ int nb_full_mv_num;
+
+ MV best_ref_mv1 = { 0, 0 };
+ MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
+
+ best_ref_mv1_full.col = best_ref_mv1.col >> 3;
+ best_ref_mv1_full.row = best_ref_mv1.row >> 3;
+
+ // Setup frame pointers
+ x->plane[0].src.buf = cur_frame_buf;
+ x->plane[0].src.stride = stride;
+ xd->plane[0].pre[0].buf = ref_frame_buf;
+ xd->plane[0].pre[0].stride = stride;
+
+ step_param = mv_sf->reduce_first_step_size;
+ step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2);
+
+ vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
+
+ nb_full_mv_num =
+ vp9_prepare_nb_full_mvs(motion_field, mi_row, mi_col, nb_full_mvs);
+ vp9_full_pixel_diamond_new(cpi, x, bsize, &best_ref_mv1_full, step_param,
+ lambda, 1, nb_full_mvs, nb_full_mv_num, mv);
+
+ /* restore UMV window */
+ x->mv_limits = tmp_mv_limits;
+
+ return bestsme;
+}
+
+static uint32_t sub_pixel_motion_search(VP9_COMP *cpi, ThreadData *td,
+ uint8_t *cur_frame_buf,
+ uint8_t *ref_frame_buf, int stride,
+ BLOCK_SIZE bsize, MV *mv) {
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+ uint32_t bestsme = UINT_MAX;
+ uint32_t distortion;
+ uint32_t sse;
+ int cost_list[5];
+
+ MV best_ref_mv1 = { 0, 0 };
+
+ // Setup frame pointers
+ x->plane[0].src.buf = cur_frame_buf;
+ x->plane[0].src.stride = stride;
+ xd->plane[0].pre[0].buf = ref_frame_buf;
+ xd->plane[0].pre[0].stride = stride;
+
+ // TODO(yunqing): may use higher tap interp filter than 2 taps.
+ // Ignore mv costing by sending NULL pointer instead of cost array
+ bestsme = cpi->find_fractional_mv_step(
+ x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit,
+ &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level,
+ cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0,
+ USE_2_TAPS);
+
+ return bestsme;
+}
+
+#else // CONFIG_NON_GREEDY_MV
+static uint32_t motion_compensated_prediction(VP9_COMP *cpi, ThreadData *td,
+ uint8_t *cur_frame_buf,
+ uint8_t *ref_frame_buf,
+ int stride, BLOCK_SIZE bsize,
+ MV *mv) {
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+ const SEARCH_METHODS search_method = NSTEP;
+ int step_param;
+ int sadpb = x->sadperbit16;
+ uint32_t bestsme = UINT_MAX;
+ uint32_t distortion;
+ uint32_t sse;
+ int cost_list[5];
+ const MvLimits tmp_mv_limits = x->mv_limits;
+
+ MV best_ref_mv1 = { 0, 0 };
+ MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
+
+ best_ref_mv1_full.col = best_ref_mv1.col >> 3;
+ best_ref_mv1_full.row = best_ref_mv1.row >> 3;
+
+ // Setup frame pointers
+ x->plane[0].src.buf = cur_frame_buf;
+ x->plane[0].src.stride = stride;
+ xd->plane[0].pre[0].buf = ref_frame_buf;
+ xd->plane[0].pre[0].stride = stride;
+
+ step_param = mv_sf->reduce_first_step_size;
+ step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2);
+
+ vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
+
+ vp9_full_pixel_search(cpi, x, bsize, &best_ref_mv1_full, step_param,
+ search_method, sadpb, cond_cost_list(cpi, cost_list),
+ &best_ref_mv1, mv, 0, 0);
+
+ /* restore UMV window */
+ x->mv_limits = tmp_mv_limits;
+
+ // TODO(yunqing): may use higher tap interp filter than 2 taps.
+ // Ignore mv costing by sending NULL pointer instead of cost array
+ bestsme = cpi->find_fractional_mv_step(
+ x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit,
+ &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level,
+ cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0,
+ USE_2_TAPS);
+
+ return bestsme;
+}
+#endif
+
+static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row,
+ int ref_pos_col, int block, BLOCK_SIZE bsize) {
+ int width = 0, height = 0;
+ int bw = 4 << b_width_log2_lookup[bsize];
+ int bh = 4 << b_height_log2_lookup[bsize];
+
+ switch (block) {
+ case 0:
+ width = grid_pos_col + bw - ref_pos_col;
+ height = grid_pos_row + bh - ref_pos_row;
+ break;
+ case 1:
+ width = ref_pos_col + bw - grid_pos_col;
+ height = grid_pos_row + bh - ref_pos_row;
+ break;
+ case 2:
+ width = grid_pos_col + bw - ref_pos_col;
+ height = ref_pos_row + bh - grid_pos_row;
+ break;
+ case 3:
+ width = ref_pos_col + bw - grid_pos_col;
+ height = ref_pos_row + bh - grid_pos_row;
+ break;
+ default: assert(0);
+ }
+
+ return width * height;
+}
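A motion-compensated reference block is generally not aligned to the block grid, so it can overlap up to four grid-aligned blocks; the four block cases above return the pixel area shared with each of them (case 0 is the top-left of the four). A minimal standalone illustration with hypothetical positions, not part of the patch:

#include <stdio.h>

/* Toy illustration of the block == 0 case: the reference block starts at
 * (ref_row, ref_col) and partially covers the grid-aligned block whose
 * top-left corner is (grid_row, grid_col). */
int main(void) {
  const int bw = 16, bh = 16;
  const int ref_row = 5, ref_col = 9;    /* displaced reference block origin */
  const int grid_row = 0, grid_col = 0;  /* grid-aligned block origin */
  const int width = grid_col + bw - ref_col;   /* 0 + 16 - 9 = 7 columns */
  const int height = grid_row + bh - ref_row;  /* 0 + 16 - 5 = 11 rows */
  printf("overlap area = %d of %d pixels\n", width * height, bw * bh); /* 77 */
  return 0;
}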
+
+static int round_floor(int ref_pos, int bsize_pix) {
+ int round;
+ if (ref_pos < 0)
+ round = -(1 + (-ref_pos - 1) / bsize_pix);
+ else
+ round = ref_pos / bsize_pix;
+
+ return round;
+}
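round_floor() implements floor division so that negative reference positions map to the correct (negative) grid index; plain C integer division truncates toward zero and would put them in block 0 instead. A small sketch, not part of the patch:

#include <assert.h>

static int floor_div(int ref_pos, int bsize_pix) {
  return (ref_pos < 0) ? -(1 + (-ref_pos - 1) / bsize_pix)
                       : ref_pos / bsize_pix;
}

int main(void) {
  assert(-5 / 16 == 0);            /* truncation toward zero */
  assert(floor_div(-5, 16) == -1); /* floor(-5 / 16), what round_floor returns */
  assert(floor_div(5, 16) == 0);
  assert(floor_div(37, 16) == 2);
  return 0;
}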
+
+static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int stride) {
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col];
+ int idx, idy;
+
+ for (idy = 0; idy < mi_height; ++idy) {
+ for (idx = 0; idx < mi_width; ++idx) {
+ TplDepStats *tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col + idx];
+ const int64_t mc_flow = tpl_ptr->mc_flow;
+ const int64_t mc_ref_cost = tpl_ptr->mc_ref_cost;
+ *tpl_ptr = *src_stats;
+ tpl_ptr->mc_flow = mc_flow;
+ tpl_ptr->mc_ref_cost = mc_ref_cost;
+ tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow;
+ }
+ }
+}
+
+static void tpl_store_before_propagation(VpxTplBlockStats *tpl_block_stats,
+ TplDepStats *tpl_stats, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ int stride, int64_t recon_error,
+ int64_t rate_cost, int ref_frame_idx) {
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col];
+ int idx, idy;
+
+ for (idy = 0; idy < mi_height; ++idy) {
+ for (idx = 0; idx < mi_width; ++idx) {
+ VpxTplBlockStats *tpl_block_stats_ptr =
+ &tpl_block_stats[(mi_row + idy) * stride + mi_col + idx];
+ tpl_block_stats_ptr->row = mi_row * 8;
+ tpl_block_stats_ptr->col = mi_col * 8;
+ tpl_block_stats_ptr->inter_cost = src_stats->inter_cost;
+ tpl_block_stats_ptr->intra_cost = src_stats->intra_cost;
+ tpl_block_stats_ptr->recrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_block_stats_ptr->recrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
+ tpl_block_stats_ptr->mv_r = src_stats->mv.as_mv.row;
+ tpl_block_stats_ptr->mv_c = src_stats->mv.as_mv.col;
+ tpl_block_stats_ptr->ref_frame_index = ref_frame_idx;
+ }
+ }
+}
+
+static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats,
+ int mi_row, int mi_col, const BLOCK_SIZE bsize) {
+ TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index];
+ TplDepStats *ref_stats = ref_tpl_frame->tpl_stats_ptr;
+ MV mv = tpl_stats->mv.as_mv;
+ int mv_row = mv.row >> 3;
+ int mv_col = mv.col >> 3;
+
+ int ref_pos_row = mi_row * MI_SIZE + mv_row;
+ int ref_pos_col = mi_col * MI_SIZE + mv_col;
+
+ const int bw = 4 << b_width_log2_lookup[bsize];
+ const int bh = 4 << b_height_log2_lookup[bsize];
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ const int pix_num = bw * bh;
+
+  // Top-left location of the on-grid block, in pixels.
+ int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh;
+ int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw;
+ int block;
+
+ for (block = 0; block < 4; ++block) {
+ int grid_pos_row = grid_pos_row_base + bh * (block >> 1);
+ int grid_pos_col = grid_pos_col_base + bw * (block & 0x01);
+
+ if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE &&
+ grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) {
+ int overlap_area = get_overlap_area(
+ grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize);
+ int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height;
+ int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width;
+
+ int64_t mc_flow = tpl_stats->mc_dep_cost -
+ (tpl_stats->mc_dep_cost * tpl_stats->inter_cost) /
+ tpl_stats->intra_cost;
+
+ int idx, idy;
+
+ for (idy = 0; idy < mi_height; ++idy) {
+ for (idx = 0; idx < mi_width; ++idx) {
+ TplDepStats *des_stats =
+ &ref_stats[(ref_mi_row + idy) * ref_tpl_frame->stride +
+ (ref_mi_col + idx)];
+
+ des_stats->mc_flow += (mc_flow * overlap_area) / pix_num;
+ des_stats->mc_ref_cost +=
+ ((tpl_stats->intra_cost - tpl_stats->inter_cost) * overlap_area) /
+ pix_num;
+ assert(overlap_area >= 0);
+ }
+ }
+ }
+ }
+}
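The update above is the core of the TPL model: the part of this block's accumulated dependency cost that is explained by inter prediction, mc_dep_cost * (1 - inter_cost / intra_cost), flows back to the reference blocks it predicts from, weighted by the pixel overlap with each grid-aligned block. A tiny numeric sketch with hypothetical costs:

#include <inttypes.h>
#include <stdio.h>

int main(void) {
  /* Hypothetical per-block costs, already in the scaled TPL domain. */
  const int64_t intra_cost = 1000, inter_cost = 400, mc_dep_cost = 2400;
  const int overlap_area = 16 * 8;  /* this grid block covers half the pixels */
  const int pix_num = 16 * 16;
  /* Dependency cost attributed to the reference frame: 2400 - 960 = 1440. */
  const int64_t mc_flow =
      mc_dep_cost - (mc_dep_cost * inter_cost) / intra_cost;
  /* Overlap-weighted share added to the destination block: 1440 * 128 / 256. */
  printf("mc_flow contribution = %" PRId64 "\n",
         (mc_flow * overlap_area) / pix_num);  /* 720 */
  return 0;
}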
+
+static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats,
+ int mi_row, int mi_col, const BLOCK_SIZE bsize) {
+ int idx, idy;
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+
+ for (idy = 0; idy < mi_height; ++idy) {
+ for (idx = 0; idx < mi_width; ++idx) {
+ TplDepStats *tpl_ptr =
+ &tpl_stats[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)];
+ tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx,
+ BLOCK_8X8);
+ }
+ }
+}
+
+static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ TX_SIZE tx_size, int64_t *recon_error,
+ int64_t *sse, uint16_t *eob) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size];
+ int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]];
+ const int shift = tx_size == TX_32X32 ? 0 : 2;
+
+  // The skip-block condition should be handled before this function is called.
+ assert(!x->skip_block);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_highbd_quantize_fp_32x32(coeff, pix_num, p, qcoeff, dqcoeff,
+ pd->dequant, eob, scan_order);
+ } else {
+ vp9_quantize_fp_32x32(coeff, pix_num, p, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order);
+ }
+#else
+ vp9_quantize_fp_32x32(coeff, pix_num, p, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ *recon_error = vp9_block_error(coeff, dqcoeff, pix_num, sse) >> shift;
+ *recon_error = VPXMAX(*recon_error, 1);
+
+ *sse = (*sse) >> shift;
+ *sse = VPXMAX(*sse, 1);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
+ TX_SIZE tx_size) {
+ // TODO(sdeng): Implement SIMD based high bit-depth Hadamard transforms.
+ switch (tx_size) {
+ case TX_8X8: vpx_highbd_hadamard_8x8(src_diff, bw, coeff); break;
+ case TX_16X16: vpx_highbd_hadamard_16x16(src_diff, bw, coeff); break;
+ case TX_32X32: vpx_highbd_hadamard_32x32(src_diff, bw, coeff); break;
+ default: assert(0);
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
+ TX_SIZE tx_size) {
+ switch (tx_size) {
+ case TX_8X8: vpx_hadamard_8x8(src_diff, bw, coeff); break;
+ case TX_16X16: vpx_hadamard_16x16(src_diff, bw, coeff); break;
+ case TX_32X32: vpx_hadamard_32x32(src_diff, bw, coeff); break;
+ default: assert(0);
+ }
+}
+
+static void set_mv_limits(const VP9_COMMON *cm, MACROBLOCK *x, int mi_row,
+ int mi_col) {
+ x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND));
+ x->mv_limits.row_max =
+ (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * VP9_INTERP_EXTEND);
+ x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND));
+ x->mv_limits.col_max =
+ ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND);
+}
+
+static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) {
+ const ScanOrder *const scan_order = &vp9_scan_orders[tx_size][DCT_DCT];
+ int rate_cost = 1;
+ int idx;
+ assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob);
+ for (idx = 0; idx < eob; ++idx) {
+ unsigned int abs_level = abs(qcoeff[scan_order->scan[idx]]);
+ rate_cost += get_msb(abs_level + 1) + 1 + (abs_level > 0);
+ }
+
+ return (rate_cost << VP9_PROB_COST_SHIFT);
+}
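rate_estimator() is a cheap bit-count proxy rather than a real entropy-coder cost: every coefficient up to the EOB contributes roughly log2(|level| + 1) magnitude bits plus a sign/significance bit, and the total is shifted onto the probability-cost scale so it can be mixed with other rate terms. A standalone sketch with made-up levels, using 9 as a stand-in for VP9_PROB_COST_SHIFT:

#include <stdio.h>

/* Index of the most significant set bit, mirroring get_msb(). */
static int msb(unsigned int v) {
  int n = 0;
  while (v >>= 1) ++n;
  return n;
}

int main(void) {
  const int qcoeff[4] = { 7, -3, 0, 1 };  /* hypothetical levels in scan order */
  const int eob = 4;
  const int prob_cost_shift = 9;          /* stand-in for VP9_PROB_COST_SHIFT */
  int rate_cost = 1;
  int i;
  for (i = 0; i < eob; ++i) {
    const unsigned int abs_level =
        (unsigned int)(qcoeff[i] < 0 ? -qcoeff[i] : qcoeff[i]);
    rate_cost += msb(abs_level + 1) + 1 + (abs_level > 0);
  }
  /* 1 + 5 + 4 + 1 + 3 = 14 "bits" before scaling. */
  printf("estimated rate = %d\n", rate_cost << prob_cost_shift);
  return 0;
}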
+
+static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
+ struct scale_factors *sf, GF_PICTURE *gf_picture,
+ int frame_idx, TplDepFrame *tpl_frame,
+ int16_t *src_diff, tran_low_t *coeff,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size,
+ YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor,
+ int64_t *recon_error, int64_t *rate_cost,
+ int64_t *sse, int *ref_frame_idx) {
+ VP9_COMMON *cm = &cpi->common;
+ ThreadData *td = &cpi->td;
+
+ const int bw = 4 << b_width_log2_lookup[bsize];
+ const int bh = 4 << b_height_log2_lookup[bsize];
+ const int pix_num = bw * bh;
+ int best_rf_idx = -1;
+ int_mv best_mv;
+ int64_t best_inter_cost = INT64_MAX;
+ int64_t inter_cost;
+ int rf_idx;
+ const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP];
+
+ int64_t best_intra_cost = INT64_MAX;
+ int64_t intra_cost;
+ PREDICTION_MODE mode;
+ int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
+ MODE_INFO mi_above, mi_left;
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ TplDepStats *tpl_stats =
+ &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col];
+
+ xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
+ xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8;
+ xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
+ xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * MI_SIZE) * 8;
+ xd->above_mi = (mi_row > 0) ? &mi_above : NULL;
+ xd->left_mi = (mi_col > 0) ? &mi_left : NULL;
+
+ // Intra prediction search
+ for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
+ uint8_t *src, *dst;
+ int src_stride, dst_stride;
+
+ src = xd->cur_buf->y_buffer + mb_y_offset;
+ src_stride = xd->cur_buf->y_stride;
+
+ dst = &predictor[0];
+ dst_stride = bw;
+
+ xd->mi[0]->sb_type = bsize;
+ xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+
+ vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode, src,
+ src_stride, dst, dst_stride, 0, 0, 0);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vpx_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
+ dst_stride, xd->bd);
+ vp9_highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+ intra_cost = vpx_highbd_satd(coeff, pix_num);
+ } else {
+ vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
+ dst_stride);
+ vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+ intra_cost = vpx_satd(coeff, pix_num);
+ }
+#else
+ vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, dst_stride);
+ vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+ intra_cost = vpx_satd(coeff, pix_num);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ if (intra_cost < best_intra_cost) best_intra_cost = intra_cost;
+ }
+
+ // Motion compensated prediction
+ best_mv.as_int = 0;
+
+ set_mv_limits(cm, x, mi_row, mi_col);
+
+ for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+ int_mv mv;
+#if CONFIG_NON_GREEDY_MV
+ MotionField *motion_field;
+#endif
+ if (ref_frame[rf_idx] == NULL) continue;
+
+#if CONFIG_NON_GREEDY_MV
+ (void)td;
+ motion_field = vp9_motion_field_info_get_motion_field(
+ &cpi->motion_field_info, frame_idx, rf_idx, bsize);
+ mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col);
+#else
+ motion_compensated_prediction(cpi, td, xd->cur_buf->y_buffer + mb_y_offset,
+ ref_frame[rf_idx]->y_buffer + mb_y_offset,
+ xd->cur_buf->y_stride, bsize, &mv.as_mv);
+#endif
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_highbd_build_inter_predictor(
+ CONVERT_TO_SHORTPTR(ref_frame[rf_idx]->y_buffer + mb_y_offset),
+ ref_frame[rf_idx]->y_stride, CONVERT_TO_SHORTPTR(&predictor[0]), bw,
+ &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE,
+ mi_row * MI_SIZE, xd->bd);
+ vpx_highbd_subtract_block(
+ bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset,
+ xd->cur_buf->y_stride, &predictor[0], bw, xd->bd);
+ vp9_highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+ inter_cost = vpx_highbd_satd(coeff, pix_num);
+ } else {
+ vp9_build_inter_predictor(
+ ref_frame[rf_idx]->y_buffer + mb_y_offset,
+ ref_frame[rf_idx]->y_stride, &predictor[0], bw, &mv.as_mv, sf, bw, bh,
+ 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE);
+ vpx_subtract_block(bh, bw, src_diff, bw,
+ xd->cur_buf->y_buffer + mb_y_offset,
+ xd->cur_buf->y_stride, &predictor[0], bw);
+ vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+ inter_cost = vpx_satd(coeff, pix_num);
+ }
+#else
+ vp9_build_inter_predictor(ref_frame[rf_idx]->y_buffer + mb_y_offset,
+ ref_frame[rf_idx]->y_stride, &predictor[0], bw,
+ &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3,
+ mi_col * MI_SIZE, mi_row * MI_SIZE);
+ vpx_subtract_block(bh, bw, src_diff, bw,
+ xd->cur_buf->y_buffer + mb_y_offset,
+ xd->cur_buf->y_stride, &predictor[0], bw);
+ vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+ inter_cost = vpx_satd(coeff, pix_num);
+#endif
+
+ if (inter_cost < best_inter_cost) {
+ uint16_t eob = 0;
+ best_rf_idx = rf_idx;
+ best_inter_cost = inter_cost;
+ best_mv.as_int = mv.as_int;
+      // Since best_inter_cost is initialized to INT64_MAX, the first valid
+      // reference frame always takes this branch, so recon_error and
+      // rate_cost are computed for the best reference frame found so far.
+ get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error,
+ sse, &eob);
+ *rate_cost = rate_estimator(qcoeff, eob, tx_size);
+ }
+ }
+ best_intra_cost = VPXMAX(best_intra_cost, 1);
+ best_inter_cost = VPXMIN(best_intra_cost, best_inter_cost);
+ tpl_stats->inter_cost = VPXMAX(
+ 1, (best_inter_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width));
+ tpl_stats->intra_cost = VPXMAX(
+ 1, (best_intra_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width));
+ tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx];
+ tpl_stats->mv.as_int = best_mv.as_int;
+ *ref_frame_idx = best_rf_idx;
+}
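Note how the winning SATD costs are stored: they are scaled into the fixed-point TPL domain (TPL_DEP_COST_SCALE_LOG2 is 4 in the new header) and normalized per 8x8 unit, so blocks of different sizes propagate comparable quantities. A short sketch of the arithmetic with a hypothetical cost:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int tpl_dep_cost_scale_log2 = 4;  /* TPL_DEP_COST_SCALE_LOG2 */
  const int64_t best_inter_cost = 5000;   /* hypothetical SATD for a 16x16 */
  const int mi_height = 2, mi_width = 2;  /* 16x16 block = 2x2 8x8 units */
  const int64_t stored =
      (best_inter_cost << tpl_dep_cost_scale_log2) / (mi_height * mi_width);
  printf("per-8x8 scaled inter cost = %lld\n", (long long)stored); /* 20000 */
  return 0;
}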
+
+#if CONFIG_NON_GREEDY_MV
+static int get_block_src_pred_buf(MACROBLOCKD *xd, GF_PICTURE *gf_picture,
+ int frame_idx, int rf_idx, int mi_row,
+ int mi_col, struct buf_2d *src,
+ struct buf_2d *pre) {
+ const int mb_y_offset =
+ mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
+ YV12_BUFFER_CONFIG *ref_frame = NULL;
+ int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx];
+ if (ref_frame_idx != -1) {
+ ref_frame = gf_picture[ref_frame_idx].frame;
+ src->buf = xd->cur_buf->y_buffer + mb_y_offset;
+ src->stride = xd->cur_buf->y_stride;
+ pre->buf = ref_frame->y_buffer + mb_y_offset;
+ pre->stride = ref_frame->y_stride;
+ assert(src->stride == pre->stride);
+ return 1;
+ } else {
+ printf("invalid ref_frame_idx");
+ assert(ref_frame_idx != -1);
+ return 0;
+ }
+}
+
+#define kMvPreCheckLines 5
+#define kMvPreCheckSize 15
+
+#define MV_REF_POS_NUM 3
+POSITION mv_ref_pos[MV_REF_POS_NUM] = {
+ { -1, 0 },
+ { 0, -1 },
+ { -1, -1 },
+};
+
+static int_mv *get_select_mv(VP9_COMP *cpi, TplDepFrame *tpl_frame, int mi_row,
+ int mi_col) {
+ return &cpi->select_mv_arr[mi_row * tpl_frame->stride + mi_col];
+}
+
+static int_mv find_ref_mv(int mv_mode, VP9_COMP *cpi, TplDepFrame *tpl_frame,
+ BLOCK_SIZE bsize, int mi_row, int mi_col) {
+ int i;
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ int_mv nearest_mv, near_mv, invalid_mv;
+ nearest_mv.as_int = INVALID_MV;
+ near_mv.as_int = INVALID_MV;
+ invalid_mv.as_int = INVALID_MV;
+ for (i = 0; i < MV_REF_POS_NUM; ++i) {
+ int nb_row = mi_row + mv_ref_pos[i].row * mi_height;
+ int nb_col = mi_col + mv_ref_pos[i].col * mi_width;
+ assert(mv_ref_pos[i].row <= 0);
+ assert(mv_ref_pos[i].col <= 0);
+ if (nb_row >= 0 && nb_col >= 0) {
+ if (nearest_mv.as_int == INVALID_MV) {
+ nearest_mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col);
+ } else {
+ int_mv mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col);
+ if (mv.as_int == nearest_mv.as_int) {
+ continue;
+ } else {
+ near_mv = mv;
+ break;
+ }
+ }
+ }
+ }
+ if (nearest_mv.as_int == INVALID_MV) {
+ nearest_mv.as_mv.row = 0;
+ nearest_mv.as_mv.col = 0;
+ }
+ if (near_mv.as_int == INVALID_MV) {
+ near_mv.as_mv.row = 0;
+ near_mv.as_mv.col = 0;
+ }
+ if (mv_mode == NEAREST_MV_MODE) {
+ return nearest_mv;
+ }
+ if (mv_mode == NEAR_MV_MODE) {
+ return near_mv;
+ }
+ assert(0);
+ return invalid_mv;
+}
+
+static int_mv get_mv_from_mv_mode(int mv_mode, VP9_COMP *cpi,
+ MotionField *motion_field,
+ TplDepFrame *tpl_frame, BLOCK_SIZE bsize,
+ int mi_row, int mi_col) {
+ int_mv mv;
+ switch (mv_mode) {
+ case ZERO_MV_MODE:
+ mv.as_mv.row = 0;
+ mv.as_mv.col = 0;
+ break;
+ case NEW_MV_MODE:
+ mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col);
+ break;
+ case NEAREST_MV_MODE:
+ mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col);
+ break;
+ case NEAR_MV_MODE:
+ mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col);
+ break;
+ default:
+ mv.as_int = INVALID_MV;
+ assert(0);
+ break;
+ }
+ return mv;
+}
+
+static double get_mv_dist(int mv_mode, VP9_COMP *cpi, MACROBLOCKD *xd,
+ GF_PICTURE *gf_picture, MotionField *motion_field,
+ int frame_idx, TplDepFrame *tpl_frame, int rf_idx,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int_mv *mv) {
+ uint32_t sse;
+ struct buf_2d src;
+ struct buf_2d pre;
+ MV full_mv;
+ *mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame, bsize,
+ mi_row, mi_col);
+ full_mv = get_full_mv(&mv->as_mv);
+ if (get_block_src_pred_buf(xd, gf_picture, frame_idx, rf_idx, mi_row, mi_col,
+ &src, &pre)) {
+ // TODO(angiebird): Consider subpixel when computing the sse.
+ cpi->fn_ptr[bsize].vf(src.buf, src.stride, get_buf_from_mv(&pre, &full_mv),
+ pre.stride, &sse);
+ return (double)(sse << VP9_DIST_SCALE_LOG2);
+ } else {
+ assert(0);
+ return 0;
+ }
+}
+
+static int get_mv_mode_cost(int mv_mode) {
+ // TODO(angiebird): The probabilities are roughly inferred from
+ // default_inter_mode_probs. Check if there is a better way to set the
+ // probabilities.
+ const int zero_mv_prob = 16;
+ const int new_mv_prob = 24 * 1;
+ const int ref_mv_prob = 256 - zero_mv_prob - new_mv_prob;
+ assert(zero_mv_prob + new_mv_prob + ref_mv_prob == 256);
+ switch (mv_mode) {
+ case ZERO_MV_MODE: return vp9_prob_cost[zero_mv_prob]; break;
+ case NEW_MV_MODE: return vp9_prob_cost[new_mv_prob]; break;
+ case NEAREST_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break;
+ case NEAR_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break;
+ default: assert(0); return -1;
+ }
+}
+
+static INLINE double get_mv_diff_cost(MV *new_mv, MV *ref_mv) {
+ double mv_diff_cost = log2(1 + abs(new_mv->row - ref_mv->row)) +
+ log2(1 + abs(new_mv->col - ref_mv->col));
+ mv_diff_cost *= (1 << VP9_PROB_COST_SHIFT);
+ return mv_diff_cost;
+}
+static double get_mv_cost(int mv_mode, VP9_COMP *cpi, MotionField *motion_field,
+ TplDepFrame *tpl_frame, BLOCK_SIZE bsize, int mi_row,
+ int mi_col) {
+ double mv_cost = get_mv_mode_cost(mv_mode);
+ if (mv_mode == NEW_MV_MODE) {
+ MV new_mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame,
+ bsize, mi_row, mi_col)
+ .as_mv;
+ MV nearest_mv = get_mv_from_mv_mode(NEAREST_MV_MODE, cpi, motion_field,
+ tpl_frame, bsize, mi_row, mi_col)
+ .as_mv;
+ MV near_mv = get_mv_from_mv_mode(NEAR_MV_MODE, cpi, motion_field, tpl_frame,
+ bsize, mi_row, mi_col)
+ .as_mv;
+ double nearest_cost = get_mv_diff_cost(&new_mv, &nearest_mv);
+ double near_cost = get_mv_diff_cost(&new_mv, &near_mv);
+ mv_cost += nearest_cost < near_cost ? nearest_cost : near_cost;
+ }
+ return mv_cost;
+}
+
+static double eval_mv_mode(int mv_mode, VP9_COMP *cpi, MACROBLOCK *x,
+ GF_PICTURE *gf_picture, MotionField *motion_field,
+ int frame_idx, TplDepFrame *tpl_frame, int rf_idx,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int_mv *mv) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ double mv_dist =
+ get_mv_dist(mv_mode, cpi, xd, gf_picture, motion_field, frame_idx,
+ tpl_frame, rf_idx, bsize, mi_row, mi_col, mv);
+ double mv_cost =
+ get_mv_cost(mv_mode, cpi, motion_field, tpl_frame, bsize, mi_row, mi_col);
+ double mult = 180;
+
+ return mv_cost + mult * log2f(1 + mv_dist);
+}
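eval_mv_mode() scores each candidate with a simple rate-distortion proxy: the mode cost (plus, for NEW_MV_MODE, the cheaper of the log2-scaled differences to the NEAREST and NEAR references) plus 180 * log2(1 + mv_dist), where mv_dist is the prediction SSE scaled by VP9_DIST_SCALE_LOG2. A small standalone sketch of the cost arithmetic with hypothetical vectors, again using 9 as a stand-in for VP9_PROB_COST_SHIFT:

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

/* Log2-based penalty for coding the difference to a reference mv,
 * mirroring get_mv_diff_cost() above. */
static double diff_cost(int dr, int dc) {
  return (log2(1 + abs(dr)) + log2(1 + abs(dc))) * (1 << 9);
}

int main(void) {
  /* Hypothetical NEW mv and its two spatial reference mvs. */
  const int new_row = 12, new_col = -7;
  const double vs_nearest = diff_cost(new_row - 8, new_col - (-4));
  const double vs_near = diff_cost(new_row - 0, new_col - 0);
  const double mode_cost = 512.0;  /* stand-in for vp9_prob_cost[new_mv_prob] */
  const double mv_dist = 1800.0;   /* stand-in for the scaled prediction SSE */
  const double mv_cost =
      mode_cost + (vs_nearest < vs_near ? vs_nearest : vs_near);
  printf("rd = %f\n", mv_cost + 180.0 * log2(1 + mv_dist));
  return 0;
}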
+
+static int find_best_ref_mv_mode(VP9_COMP *cpi, MACROBLOCK *x,
+ GF_PICTURE *gf_picture,
+ MotionField *motion_field, int frame_idx,
+ TplDepFrame *tpl_frame, int rf_idx,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ double *rd, int_mv *mv) {
+ int best_mv_mode = ZERO_MV_MODE;
+ int update = 0;
+ int mv_mode;
+ *rd = 0;
+ for (mv_mode = 0; mv_mode < MAX_MV_MODE; ++mv_mode) {
+ double this_rd;
+ int_mv this_mv;
+ if (mv_mode == NEW_MV_MODE) {
+ continue;
+ }
+ this_rd = eval_mv_mode(mv_mode, cpi, x, gf_picture, motion_field, frame_idx,
+ tpl_frame, rf_idx, bsize, mi_row, mi_col, &this_mv);
+ if (update == 0) {
+ *rd = this_rd;
+ *mv = this_mv;
+ best_mv_mode = mv_mode;
+ update = 1;
+ } else {
+ if (this_rd < *rd) {
+ *rd = this_rd;
+ *mv = this_mv;
+ best_mv_mode = mv_mode;
+ }
+ }
+ }
+ return best_mv_mode;
+}
+
+static void predict_mv_mode(VP9_COMP *cpi, MACROBLOCK *x,
+ GF_PICTURE *gf_picture, MotionField *motion_field,
+ int frame_idx, TplDepFrame *tpl_frame, int rf_idx,
+ BLOCK_SIZE bsize, int mi_row, int mi_col) {
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ int tmp_mv_mode_arr[kMvPreCheckSize];
+ int *mv_mode_arr = tpl_frame->mv_mode_arr[rf_idx];
+ double *rd_diff_arr = tpl_frame->rd_diff_arr[rf_idx];
+ int_mv *select_mv_arr = cpi->select_mv_arr;
+ int_mv tmp_select_mv_arr[kMvPreCheckSize];
+ int stride = tpl_frame->stride;
+ double new_mv_rd = 0;
+ double no_new_mv_rd = 0;
+ double this_new_mv_rd = 0;
+ double this_no_new_mv_rd = 0;
+ int idx;
+ int tmp_idx;
+ assert(kMvPreCheckSize == (kMvPreCheckLines * (kMvPreCheckLines + 1)) >> 1);
+
+ // no new mv
+ // diagonal scan order
+ tmp_idx = 0;
+ for (idx = 0; idx < kMvPreCheckLines; ++idx) {
+ int r;
+ for (r = 0; r <= idx; ++r) {
+ int c = idx - r;
+ int nb_row = mi_row + r * mi_height;
+ int nb_col = mi_col + c * mi_width;
+ if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) {
+ double this_rd;
+ int_mv *mv = &select_mv_arr[nb_row * stride + nb_col];
+ mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode(
+ cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx,
+ bsize, nb_row, nb_col, &this_rd, mv);
+ if (r == 0 && c == 0) {
+ this_no_new_mv_rd = this_rd;
+ }
+ no_new_mv_rd += this_rd;
+ tmp_mv_mode_arr[tmp_idx] = mv_mode_arr[nb_row * stride + nb_col];
+ tmp_select_mv_arr[tmp_idx] = select_mv_arr[nb_row * stride + nb_col];
+ ++tmp_idx;
+ }
+ }
+ }
+
+ // new mv
+ mv_mode_arr[mi_row * stride + mi_col] = NEW_MV_MODE;
+ this_new_mv_rd = eval_mv_mode(
+ NEW_MV_MODE, cpi, x, gf_picture, motion_field, frame_idx, tpl_frame,
+ rf_idx, bsize, mi_row, mi_col, &select_mv_arr[mi_row * stride + mi_col]);
+ new_mv_rd = this_new_mv_rd;
+ // We start from idx = 1 because idx = 0 is evaluated as NEW_MV_MODE
+ // beforehand.
+ for (idx = 1; idx < kMvPreCheckLines; ++idx) {
+ int r;
+ for (r = 0; r <= idx; ++r) {
+ int c = idx - r;
+ int nb_row = mi_row + r * mi_height;
+ int nb_col = mi_col + c * mi_width;
+ if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) {
+ double this_rd;
+ int_mv *mv = &select_mv_arr[nb_row * stride + nb_col];
+ mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode(
+ cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx,
+ bsize, nb_row, nb_col, &this_rd, mv);
+ new_mv_rd += this_rd;
+ }
+ }
+ }
+
+ // update best_mv_mode
+ tmp_idx = 0;
+ if (no_new_mv_rd < new_mv_rd) {
+ for (idx = 0; idx < kMvPreCheckLines; ++idx) {
+ int r;
+ for (r = 0; r <= idx; ++r) {
+ int c = idx - r;
+ int nb_row = mi_row + r * mi_height;
+ int nb_col = mi_col + c * mi_width;
+ if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) {
+ mv_mode_arr[nb_row * stride + nb_col] = tmp_mv_mode_arr[tmp_idx];
+ select_mv_arr[nb_row * stride + nb_col] = tmp_select_mv_arr[tmp_idx];
+ ++tmp_idx;
+ }
+ }
+ }
+ rd_diff_arr[mi_row * stride + mi_col] = 0;
+ } else {
+ rd_diff_arr[mi_row * stride + mi_col] =
+ (no_new_mv_rd - this_no_new_mv_rd) - (new_mv_rd - this_new_mv_rd);
+ }
+}
+
+static void predict_mv_mode_arr(VP9_COMP *cpi, MACROBLOCK *x,
+ GF_PICTURE *gf_picture,
+ MotionField *motion_field, int frame_idx,
+ TplDepFrame *tpl_frame, int rf_idx,
+ BLOCK_SIZE bsize) {
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ const int unit_rows = tpl_frame->mi_rows / mi_height;
+ const int unit_cols = tpl_frame->mi_cols / mi_width;
+ const int max_diagonal_lines = unit_rows + unit_cols - 1;
+ int idx;
+ for (idx = 0; idx < max_diagonal_lines; ++idx) {
+ int r;
+ for (r = VPXMAX(idx - unit_cols + 1, 0); r <= VPXMIN(idx, unit_rows - 1);
+ ++r) {
+ int c = idx - r;
+ int mi_row = r * mi_height;
+ int mi_col = c * mi_width;
+ assert(c >= 0 && c < unit_cols);
+ assert(mi_row >= 0 && mi_row < tpl_frame->mi_rows);
+ assert(mi_col >= 0 && mi_col < tpl_frame->mi_cols);
+ predict_mv_mode(cpi, x, gf_picture, motion_field, frame_idx, tpl_frame,
+ rf_idx, bsize, mi_row, mi_col);
+ }
+ }
+}
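predict_mv_mode_arr() walks the prediction units in anti-diagonal order (all units with the same r + c in one pass), which guarantees that the above and left neighbours consulted by find_ref_mv() are already decided when a unit is evaluated. A standalone sketch of the traversal order for a hypothetical 3x4 grid of units:

#include <stdio.h>

int main(void) {
  const int unit_rows = 3, unit_cols = 4;
  const int max_diagonal_lines = unit_rows + unit_cols - 1;
  int idx, r;
  for (idx = 0; idx < max_diagonal_lines; ++idx) {
    for (r = (idx - unit_cols + 1 > 0) ? idx - unit_cols + 1 : 0;
         r <= ((idx < unit_rows - 1) ? idx : unit_rows - 1); ++r) {
      const int c = idx - r;
      printf("(%d,%d) ", r, c);
    }
    printf("\n");  /* prints (0,0) / (0,1) (1,0) / (0,2) (1,1) (2,0) / ... */
  }
  return 0;
}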
+
+static void do_motion_search(VP9_COMP *cpi, ThreadData *td,
+ MotionField *motion_field, int frame_idx,
+ YV12_BUFFER_CONFIG *ref_frame, BLOCK_SIZE bsize,
+ int mi_row, int mi_col) {
+ VP9_COMMON *cm = &cpi->common;
+ MACROBLOCK *x = &td->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int mb_y_offset =
+ mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
+ assert(ref_frame != NULL);
+ set_mv_limits(cm, x, mi_row, mi_col);
+ {
+ int_mv mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col);
+ uint8_t *cur_frame_buf = xd->cur_buf->y_buffer + mb_y_offset;
+ uint8_t *ref_frame_buf = ref_frame->y_buffer + mb_y_offset;
+ const int stride = xd->cur_buf->y_stride;
+ full_pixel_motion_search(cpi, td, motion_field, frame_idx, cur_frame_buf,
+ ref_frame_buf, stride, bsize, mi_row, mi_col,
+ &mv.as_mv);
+ sub_pixel_motion_search(cpi, td, cur_frame_buf, ref_frame_buf, stride,
+ bsize, &mv.as_mv);
+ vp9_motion_field_mi_set_mv(motion_field, mi_row, mi_col, mv);
+ }
+}
+
+static void build_motion_field(
+ VP9_COMP *cpi, int frame_idx,
+ YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES], BLOCK_SIZE bsize) {
+ VP9_COMMON *cm = &cpi->common;
+ ThreadData *td = &cpi->td;
+ TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ const int pw = num_4x4_blocks_wide_lookup[bsize] << 2;
+ const int ph = num_4x4_blocks_high_lookup[bsize] << 2;
+ int mi_row, mi_col;
+ int rf_idx;
+
+ tpl_frame->lambda = (pw * ph) >> 2;
+ assert(pw * ph == tpl_frame->lambda << 2);
+
+ for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+ MotionField *motion_field = vp9_motion_field_info_get_motion_field(
+ &cpi->motion_field_info, frame_idx, rf_idx, bsize);
+ if (ref_frame[rf_idx] == NULL) {
+ continue;
+ }
+ vp9_motion_field_reset_mvs(motion_field);
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
+ do_motion_search(cpi, td, motion_field, frame_idx, ref_frame[rf_idx],
+ bsize, mi_row, mi_col);
+ }
+ }
+ }
+}
+#endif // CONFIG_NON_GREEDY_MV
+
+static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture,
+ int frame_idx, BLOCK_SIZE bsize) {
+ TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
+ VpxTplFrameStats *tpl_frame_stats_before_propagation =
+ &cpi->tpl_gop_stats.frame_stats_list[frame_idx];
+ YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame;
+ YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES] = { NULL, NULL, NULL };
+
+ VP9_COMMON *cm = &cpi->common;
+ struct scale_factors sf;
+ int rdmult, idx;
+ ThreadData *td = &cpi->td;
+ MACROBLOCK *x = &td->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ int mi_row, mi_col;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, uint16_t, predictor16[32 * 32 * 3]);
+ DECLARE_ALIGNED(16, uint8_t, predictor8[32 * 32 * 3]);
+ uint8_t *predictor;
+#else
+ DECLARE_ALIGNED(16, uint8_t, predictor[32 * 32 * 3]);
+#endif
+ DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]);
+ DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]);
+ DECLARE_ALIGNED(16, tran_low_t, qcoeff[32 * 32]);
+ DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
+
+ const TX_SIZE tx_size = max_txsize_lookup[bsize];
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+
+ tpl_frame_stats_before_propagation->frame_width = cm->width;
+ tpl_frame_stats_before_propagation->frame_height = cm->height;
+ // Setup scaling factor
+#if CONFIG_VP9_HIGHBITDEPTH
+ vp9_setup_scale_factors_for_frame(
+ &sf, this_frame->y_crop_width, this_frame->y_crop_height,
+ this_frame->y_crop_width, this_frame->y_crop_height,
+ cpi->common.use_highbitdepth);
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ predictor = CONVERT_TO_BYTEPTR(predictor16);
+ else
+ predictor = predictor8;
+#else
+ vp9_setup_scale_factors_for_frame(
+ &sf, this_frame->y_crop_width, this_frame->y_crop_height,
+ this_frame->y_crop_width, this_frame->y_crop_height);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+  // Prepare reference frame pointers. If any reference frame slot is
+  // unavailable, the corresponding pointer is set to NULL.
+ for (idx = 0; idx < MAX_INTER_REF_FRAMES; ++idx) {
+ int rf_idx = gf_picture[frame_idx].ref_frame[idx];
+ if (rf_idx != -1) ref_frame[idx] = gf_picture[rf_idx].frame;
+ }
+
+ xd->mi = cm->mi_grid_visible;
+ xd->mi[0] = cm->mi;
+ xd->cur_buf = this_frame;
+
+ // Get rd multiplier set up.
+ rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, tpl_frame->base_qindex);
+ set_error_per_bit(&cpi->td.mb, rdmult);
+ vp9_initialize_me_consts(cpi, &cpi->td.mb, tpl_frame->base_qindex);
+
+ tpl_frame->is_valid = 1;
+
+ cm->base_qindex = tpl_frame->base_qindex;
+ vp9_frame_init_quantizer(cpi);
+
+#if CONFIG_NON_GREEDY_MV
+ {
+ int square_block_idx;
+ int rf_idx;
+ for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES;
+ ++square_block_idx) {
+ BLOCK_SIZE square_bsize = square_block_idx_to_bsize(square_block_idx);
+ build_motion_field(cpi, frame_idx, ref_frame, square_bsize);
+ }
+ for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+ int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx];
+ if (ref_frame_idx != -1) {
+ MotionField *motion_field = vp9_motion_field_info_get_motion_field(
+ &cpi->motion_field_info, frame_idx, rf_idx, bsize);
+ predict_mv_mode_arr(cpi, x, gf_picture, motion_field, frame_idx,
+ tpl_frame, rf_idx, bsize);
+ }
+ }
+ }
+#endif // CONFIG_NON_GREEDY_MV
+
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
+ int64_t recon_error = 0;
+ int64_t rate_cost = 0;
+ int64_t sse = 0;
+ // Ref frame index in the ref frame buffer.
+ int ref_frame_idx = -1;
+ mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame,
+ src_diff, coeff, qcoeff, dqcoeff, mi_row, mi_col, bsize,
+ tx_size, ref_frame, predictor, &recon_error, &rate_cost,
+ &sse, &ref_frame_idx);
+ // Motion flow dependency dispenser.
+ tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize,
+ tpl_frame->stride);
+
+ tpl_store_before_propagation(
+ tpl_frame_stats_before_propagation->block_stats_list,
+ tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride,
+ recon_error, rate_cost, ref_frame_idx);
+
+ tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col,
+ bsize);
+ }
+ }
+}
+
+static void trim_tpl_stats(struct vpx_internal_error_info *error_info,
+ VpxTplGopStats *tpl_gop_stats, int extra_frames) {
+ int i;
+ VpxTplFrameStats *new_frame_stats;
+ const int new_size = tpl_gop_stats->size - extra_frames;
+ if (tpl_gop_stats->size <= extra_frames)
+ vpx_internal_error(
+ error_info, VPX_CODEC_ERROR,
+ "The number of frames in VpxTplGopStats is fewer than expected.");
+ CHECK_MEM_ERROR(error_info, new_frame_stats,
+ vpx_calloc(new_size, sizeof(*new_frame_stats)));
+ for (i = 0; i < new_size; i++) {
+ VpxTplFrameStats *frame_stats = &tpl_gop_stats->frame_stats_list[i];
+    const int num_blocks = frame_stats->num_blocks;
+    new_frame_stats[i].num_blocks = num_blocks;
+    new_frame_stats[i].frame_width = frame_stats->frame_width;
+    new_frame_stats[i].frame_height = frame_stats->frame_height;
+ CHECK_MEM_ERROR(
+ error_info, new_frame_stats[i].block_stats_list,
+ vpx_calloc(num_blocks, sizeof(*new_frame_stats[i].block_stats_list)));
+ memcpy(new_frame_stats[i].block_stats_list, frame_stats->block_stats_list,
+ num_blocks * sizeof(*new_frame_stats[i].block_stats_list));
+ }
+ free_tpl_frame_stats_list(tpl_gop_stats);
+ tpl_gop_stats->size = new_size;
+ tpl_gop_stats->frame_stats_list = new_frame_stats;
+}
+
+#if CONFIG_NON_GREEDY_MV
+#define DUMP_TPL_STATS 0
+#if DUMP_TPL_STATS
+static void dump_buf(uint8_t *buf, int stride, int row, int col, int h, int w) {
+ int i, j;
+ printf("%d %d\n", h, w);
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ printf("%d ", buf[(row + i) * stride + col + j]);
+ }
+ }
+ printf("\n");
+}
+
+static void dump_frame_buf(const YV12_BUFFER_CONFIG *frame_buf) {
+ dump_buf(frame_buf->y_buffer, frame_buf->y_stride, 0, 0, frame_buf->y_height,
+ frame_buf->y_width);
+ dump_buf(frame_buf->u_buffer, frame_buf->uv_stride, 0, 0,
+ frame_buf->uv_height, frame_buf->uv_width);
+ dump_buf(frame_buf->v_buffer, frame_buf->uv_stride, 0, 0,
+ frame_buf->uv_height, frame_buf->uv_width);
+}
+
+static void dump_tpl_stats(const VP9_COMP *cpi, int tpl_group_frames,
+ const GF_GROUP *gf_group,
+ const GF_PICTURE *gf_picture, BLOCK_SIZE bsize) {
+ int frame_idx;
+ const VP9_COMMON *cm = &cpi->common;
+ int rf_idx;
+ for (frame_idx = 1; frame_idx < tpl_group_frames; ++frame_idx) {
+ for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+ const TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
+ int mi_row, mi_col;
+ int ref_frame_idx;
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx];
+ if (ref_frame_idx != -1) {
+ YV12_BUFFER_CONFIG *ref_frame_buf = gf_picture[ref_frame_idx].frame;
+ const int gf_frame_offset = gf_group->frame_gop_index[frame_idx];
+ const int ref_gf_frame_offset =
+ gf_group->frame_gop_index[ref_frame_idx];
+ printf("=\n");
+ printf(
+ "frame_idx %d mi_rows %d mi_cols %d bsize %d ref_frame_idx %d "
+ "rf_idx %d gf_frame_offset %d ref_gf_frame_offset %d\n",
+ frame_idx, cm->mi_rows, cm->mi_cols, mi_width * MI_SIZE,
+ ref_frame_idx, rf_idx, gf_frame_offset, ref_gf_frame_offset);
+ for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) {
+ for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) {
+ if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) {
+ int_mv mv = vp9_motion_field_info_get_mv(&cpi->motion_field_info,
+ frame_idx, rf_idx, bsize,
+ mi_row, mi_col);
+ printf("%d %d %d %d\n", mi_row, mi_col, mv.as_mv.row,
+ mv.as_mv.col);
+ }
+ }
+ }
+ for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) {
+ for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) {
+ if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) {
+ const TplDepStats *tpl_ptr =
+ &tpl_frame
+ ->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col];
+ printf("%f ", tpl_ptr->feature_score);
+ }
+ }
+ }
+ printf("\n");
+
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
+ const int mv_mode =
+ tpl_frame
+ ->mv_mode_arr[rf_idx][mi_row * tpl_frame->stride + mi_col];
+ printf("%d ", mv_mode);
+ }
+ }
+ printf("\n");
+
+ dump_frame_buf(gf_picture[frame_idx].frame);
+ dump_frame_buf(ref_frame_buf);
+ }
+ }
+ }
+}
+#endif // DUMP_TPL_STATS
+#endif // CONFIG_NON_GREEDY_MV
+
+void vp9_init_tpl_buffer(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+ int frame;
+
+ const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+ const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows);
+#if CONFIG_NON_GREEDY_MV
+ int rf_idx;
+
+ vpx_free(cpi->select_mv_arr);
+ CHECK_MEM_ERROR(
+ &cm->error, cpi->select_mv_arr,
+ vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->select_mv_arr)));
+#endif
+
+ // TODO(jingning): Reduce the actual memory use for tpl model build up.
+ for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) {
+ if (cpi->tpl_stats[frame].width >= mi_cols &&
+ cpi->tpl_stats[frame].height >= mi_rows &&
+ cpi->tpl_stats[frame].tpl_stats_ptr)
+ continue;
+
+#if CONFIG_NON_GREEDY_MV
+ for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+ vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]);
+ CHECK_MEM_ERROR(
+ &cm->error, cpi->tpl_stats[frame].mv_mode_arr[rf_idx],
+ vpx_calloc(mi_rows * mi_cols * 4,
+ sizeof(*cpi->tpl_stats[frame].mv_mode_arr[rf_idx])));
+ vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]);
+ CHECK_MEM_ERROR(
+ &cm->error, cpi->tpl_stats[frame].rd_diff_arr[rf_idx],
+ vpx_calloc(mi_rows * mi_cols * 4,
+ sizeof(*cpi->tpl_stats[frame].rd_diff_arr[rf_idx])));
+ }
+#endif
+ vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr);
+ CHECK_MEM_ERROR(&cm->error, cpi->tpl_stats[frame].tpl_stats_ptr,
+ vpx_calloc(mi_rows * mi_cols,
+ sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr)));
+ cpi->tpl_stats[frame].is_valid = 0;
+ cpi->tpl_stats[frame].width = mi_cols;
+ cpi->tpl_stats[frame].height = mi_rows;
+ cpi->tpl_stats[frame].stride = mi_cols;
+ cpi->tpl_stats[frame].mi_rows = cm->mi_rows;
+ cpi->tpl_stats[frame].mi_cols = cm->mi_cols;
+ }
+
+ for (frame = 0; frame < REF_FRAMES; ++frame) {
+ cpi->enc_frame_buf[frame].mem_valid = 0;
+ cpi->enc_frame_buf[frame].released = 1;
+ }
+}
+
+void vp9_free_tpl_buffer(VP9_COMP *cpi) {
+ int frame;
+#if CONFIG_NON_GREEDY_MV
+ vp9_free_motion_field_info(&cpi->motion_field_info);
+ vpx_free(cpi->select_mv_arr);
+#endif
+ for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) {
+#if CONFIG_NON_GREEDY_MV
+ int rf_idx;
+ for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+ vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]);
+ vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]);
+ }
+#endif
+ vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr);
+ cpi->tpl_stats[frame].is_valid = 0;
+ }
+ free_tpl_frame_stats_list(&cpi->tpl_gop_stats);
+}
+
+#if CONFIG_RATE_CTRL
+static void accumulate_frame_tpl_stats(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+ int show_frame_count = 0;
+ int frame_idx;
+ // Accumulate tpl stats for each frame in the current group of picture.
+ for (frame_idx = 1; frame_idx < gf_group->gf_group_size; ++frame_idx) {
+ TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
+ TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+ const int tpl_stride = tpl_frame->stride;
+ int64_t intra_cost_base = 0;
+ int64_t inter_cost_base = 0;
+ int64_t mc_dep_cost_base = 0;
+ int64_t mc_ref_cost_base = 0;
+ int64_t mc_flow_base = 0;
+ int row, col;
+
+ if (!tpl_frame->is_valid) continue;
+
+ for (row = 0; row < cm->mi_rows && tpl_frame->is_valid; ++row) {
+ for (col = 0; col < cm->mi_cols; ++col) {
+ TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col];
+ intra_cost_base += this_stats->intra_cost;
+ inter_cost_base += this_stats->inter_cost;
+ mc_dep_cost_base += this_stats->mc_dep_cost;
+ mc_ref_cost_base += this_stats->mc_ref_cost;
+ mc_flow_base += this_stats->mc_flow;
+ }
+ }
+
+ cpi->tpl_stats_info[show_frame_count].intra_cost = intra_cost_base;
+ cpi->tpl_stats_info[show_frame_count].inter_cost = inter_cost_base;
+ cpi->tpl_stats_info[show_frame_count].mc_dep_cost = mc_dep_cost_base;
+ cpi->tpl_stats_info[show_frame_count].mc_ref_cost = mc_ref_cost_base;
+ cpi->tpl_stats_info[show_frame_count].mc_flow = mc_flow_base;
+
+ ++show_frame_count;
+ }
+}
+#endif // CONFIG_RATE_CTRL
+
+void vp9_setup_tpl_stats(VP9_COMP *cpi) {
+ GF_PICTURE gf_picture[MAX_ARF_GOP_SIZE];
+ const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+ int tpl_group_frames = 0;
+ int frame_idx;
+ int extended_frame_count;
+ cpi->tpl_bsize = BLOCK_32X32;
+
+ extended_frame_count =
+ init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames);
+
+ init_tpl_stats(cpi);
+
+ init_tpl_stats_before_propagation(&cpi->common.error, &cpi->tpl_gop_stats,
+ cpi->tpl_stats, tpl_group_frames,
+ cpi->common.width, cpi->common.height);
+
+ // Backward propagation from tpl_group_frames to 1.
+ for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx) {
+ if (gf_picture[frame_idx].update_type == USE_BUF_FRAME) continue;
+ mc_flow_dispenser(cpi, gf_picture, frame_idx, cpi->tpl_bsize);
+ }
+
+  // The TPL stats include extra frames from the next GOP. Trim those extra
+  // frames for Qmode.
+ trim_tpl_stats(&cpi->common.error, &cpi->tpl_gop_stats, extended_frame_count);
+
+ if (cpi->ext_ratectrl.ready &&
+ cpi->ext_ratectrl.funcs.send_tpl_gop_stats != NULL) {
+ const vpx_codec_err_t codec_status =
+ vp9_extrc_send_tpl_stats(&cpi->ext_ratectrl, &cpi->tpl_gop_stats);
+ if (codec_status != VPX_CODEC_OK) {
+ vpx_internal_error(&cpi->common.error, codec_status,
+ "vp9_extrc_send_tpl_stats() failed");
+ }
+ }
+
+#if CONFIG_NON_GREEDY_MV
+ cpi->tpl_ready = 1;
+#if DUMP_TPL_STATS
+ dump_tpl_stats(cpi, tpl_group_frames, gf_group, gf_picture, cpi->tpl_bsize);
+#endif // DUMP_TPL_STATS
+#endif // CONFIG_NON_GREEDY_MV
+
+#if CONFIG_RATE_CTRL
+ if (cpi->oxcf.use_simple_encode_api) {
+ accumulate_frame_tpl_stats(cpi);
+ }
+#endif // CONFIG_RATE_CTRL
+}
diff --git a/vp9/encoder/vp9_tpl_model.h b/vp9/encoder/vp9_tpl_model.h
new file mode 100644
index 000000000..04beb2261
--- /dev/null
+++ b/vp9/encoder/vp9_tpl_model.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_TPL_MODEL_H_
+#define VPX_VP9_ENCODER_VP9_TPL_MODEL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef M_LOG2_E
+#define M_LOG2_E 0.693147180559945309417
+#endif
+#define log2f(x) (log(x) / (float)M_LOG2_E)
+
+#define TPL_DEP_COST_SCALE_LOG2 4
+
+typedef struct GF_PICTURE {
+ YV12_BUFFER_CONFIG *frame;
+ int ref_frame[3];
+ FRAME_UPDATE_TYPE update_type;
+} GF_PICTURE;
+
+void vp9_init_tpl_buffer(VP9_COMP *cpi);
+void vp9_setup_tpl_stats(VP9_COMP *cpi);
+void vp9_free_tpl_buffer(VP9_COMP *cpi);
+
+void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
+ TX_SIZE tx_size);
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
+ TX_SIZE tx_size);
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_TPL_MODEL_H_
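Taken together, the new header gives the encoder a small TPL lifecycle: size the buffers when the frame dimensions are known, build the stats once per ARF group, and release everything on teardown. A hedged sketch of the intended call order (the wrapper function below is illustrative only; in the tree these calls are expected to live in vp9_encoder.c):

#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_tpl_model.h"

/* Illustrative only, not part of the patch. */
static void example_tpl_lifecycle(VP9_COMP *cpi) {
  vp9_init_tpl_buffer(cpi);  /* (re)allocate per-frame TplDepFrame buffers */
  vp9_setup_tpl_stats(cpi);  /* run the mc_flow dispenser over the GOP */
  /* ... encode the group, reading cpi->tpl_stats / cpi->tpl_gop_stats ... */
  vp9_free_tpl_buffer(cpi);  /* release TPL buffers when the encoder is freed */
}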
diff --git a/vp9/encoder/x86/highbd_temporal_filter_sse4.c b/vp9/encoder/x86/highbd_temporal_filter_sse4.c
index a7f5117cf..97f182c66 100644
--- a/vp9/encoder/x86/highbd_temporal_filter_sse4.c
+++ b/vp9/encoder/x86/highbd_temporal_filter_sse4.c
@@ -16,7 +16,7 @@
#include "vpx/vpx_integer.h"
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_temporal_filter.h"
-#include "vp9/encoder/x86/temporal_filter_constants.h"
+#include "vp9/encoder/vp9_temporal_filter_constants.h"
// Compute (a-b)**2 for 8 pixels with size 16-bit
static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b,
@@ -141,11 +141,12 @@ static INLINE void highbd_accumulate_and_store_8(const __m128i sum_first_u32,
count_u16 = _mm_adds_epu16(count_u16, sum_u16);
_mm_storeu_si128((__m128i *)count, count_u16);
- pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16);
-
pred_0_u32 = _mm_cvtepu16_epi32(pred_u16);
pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero);
+ pred_0_u32 = _mm_mullo_epi32(sum_first_u32, pred_0_u32);
+ pred_1_u32 = _mm_mullo_epi32(sum_second_u32, pred_1_u32);
+
accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
diff --git a/vp9/encoder/x86/temporal_filter_sse4.c b/vp9/encoder/x86/temporal_filter_sse4.c
index 87e68fb43..7571bfcca 100644
--- a/vp9/encoder/x86/temporal_filter_sse4.c
+++ b/vp9/encoder/x86/temporal_filter_sse4.c
@@ -16,7 +16,7 @@
#include "vpx/vpx_integer.h"
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_temporal_filter.h"
-#include "vp9/encoder/x86/temporal_filter_constants.h"
+#include "vp9/encoder/vp9_temporal_filter_constants.h"
// Read in 8 pixels from a and b as 8-bit unsigned integers, compute the
// difference squared, and store as unsigned 16-bit integer to dst.
diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
deleted file mode 100644
index 0e04a2f41..000000000
--- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
+++ /dev/null
@@ -1,317 +0,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#if defined(_MSC_VER)
-#include <intrin.h>
-#endif
-#include <emmintrin.h>
-#include <smmintrin.h>
-
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vp9/encoder/vp9_encoder.h"
-#include "vpx_ports/mem.h"
-
-#ifdef __GNUC__
-#define LIKELY(v) __builtin_expect(v, 1)
-#define UNLIKELY(v) __builtin_expect(v, 0)
-#else
-#define LIKELY(v) (v)
-#define UNLIKELY(v) (v)
-#endif
-
-static INLINE int_mv pack_int_mv(int16_t row, int16_t col) {
- int_mv result;
- result.as_mv.row = row;
- result.as_mv.col = col;
- return result;
-}
-
-static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) {
- // This is simplified from the C implementation to utilise that
- // x->nmvjointsadcost[1] == x->nmvjointsadcost[2] and
- // x->nmvjointsadcost[1] == x->nmvjointsadcost[3]
- return mv.as_int == 0 ? 0 : 1;
-}
-
-static INLINE int mv_cost(const int_mv mv, const int *joint_cost,
- int *const comp_cost[2]) {
- return joint_cost[get_mv_joint(mv)] + comp_cost[0][mv.as_mv.row] +
- comp_cost[1][mv.as_mv.col];
-}
-
-static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref,
- int sad_per_bit) {
- const int_mv diff =
- pack_int_mv(mv.as_mv.row - ref->row, mv.as_mv.col - ref->col);
- return ROUND_POWER_OF_TWO(
- (unsigned)mv_cost(diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit,
- VP9_PROB_COST_SHIFT);
-}
-
-/*****************************************************************************
- * This function utilizes 3 properties of the cost function lookup tables, *
- * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in *
- * vp9_encoder.c. *
- * For the joint cost: *
- * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] *
- * For the component costs: *
- * - For all i: mvsadcost[0][i] == mvsadcost[1][i] *
- * (Equal costs for both components) *
- * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] *
- * (Cost function is even) *
- * If these do not hold, then this function cannot be used without *
- * modification, in which case you can revert to using the C implementation, *
- * which does not rely on these properties. *
- *****************************************************************************/
-int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
- const search_site_config *cfg, MV *ref_mv,
- MV *best_mv, int search_param, int sad_per_bit,
- int *num00, const vp9_variance_fn_ptr_t *fn_ptr,
- const MV *center_mv) {
- const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max);
- const __m128i v_max_mv_w = _mm_set1_epi32((int)maxmv.as_int);
- const int_mv minmv = pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min);
- const __m128i v_min_mv_w = _mm_set1_epi32((int)minmv.as_int);
-
- const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit);
-
- const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]);
- const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]);
-
- // search_param determines the length of the initial step and hence the number
- // of iterations.
- // 0 = initial step (MAX_FIRST_STEP) pel
- // 1 = (MAX_FIRST_STEP/2) pel,
- // 2 = (MAX_FIRST_STEP/4) pel...
- const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
- const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param];
- const int tot_steps = cfg->total_steps - search_param;
-
- const int_mv fcenter_mv =
- pack_int_mv(center_mv->row >> 3, center_mv->col >> 3);
- const __m128i vfcmv = _mm_set1_epi32((int)fcenter_mv.as_int);
-
- const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row);
- const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col);
-
- int_mv bmv = pack_int_mv(ref_row, ref_col);
- int_mv new_bmv = bmv;
- __m128i v_bmv_w = _mm_set1_epi32((int)bmv.as_int);
-
- const int what_stride = x->plane[0].src.stride;
- const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
- const uint8_t *const what = x->plane[0].src.buf;
- const uint8_t *const in_what =
- x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col;
-
- // Work out the start point for the search
- const uint8_t *best_address = in_what;
- const uint8_t *new_best_address = best_address;
-#if VPX_ARCH_X86_64
- __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
-#else
- __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address);
-#endif
-
- unsigned int best_sad;
- int i, j, step;
-
- // Check the prerequisite cost function properties that are easy to check
- // in an assert. See the function-level documentation for details on all
- // prerequisites.
- assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
- assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);
-
- // Check the starting position
- best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride);
- best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit);
-
- *num00 = 0;
-
- for (i = 0, step = 0; step < tot_steps; step++) {
- for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) {
- __m128i v_sad_d, v_cost_d, v_outside_d, v_inside_d, v_diff_mv_w;
-#if VPX_ARCH_X86_64
- __m128i v_blocka[2];
-#else
- __m128i v_blocka[1];
-#endif
-
- // Compute the candidate motion vectors
- const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i *)&ss_mv[i]);
- const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w);
- // Clamp them to the search bounds
- __m128i v_these_mv_clamp_w = v_these_mv_w;
- v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w);
- v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w);
- // The ones that did not change are inside the search area
- v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w);
-
- // If none of them are inside, then move on
- if (LIKELY(_mm_test_all_zeros(v_inside_d, v_inside_d))) {
- continue;
- }
-
- // The inverse mask indicates which of the MVs are outside
- v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8((int8_t)0xff));
- // Shift right to keep the sign bit clear, we will use this later
- // to set the cost to the maximum value.
- v_outside_d = _mm_srli_epi32(v_outside_d, 1);
-
- // Compute the difference MV
- v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv);
- // We utilise the fact that the cost function is even, and use the
- // absolute difference. This allows us to use unsigned indexes later
- // and reduces cache pressure somewhat as only a half of the table
- // is ever referenced.
- v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w);
-
- // Compute the SIMD pointer offsets.
- {
-#if VPX_ARCH_X86_64 // sizeof(intptr_t) == 8
- // Load the offsets
- __m128i v_bo10_q = _mm_loadu_si128((const __m128i *)&ss_os[i + 0]);
- __m128i v_bo32_q = _mm_loadu_si128((const __m128i *)&ss_os[i + 2]);
- // Set the ones falling outside to zero
- v_bo10_q = _mm_and_si128(v_bo10_q, _mm_cvtepi32_epi64(v_inside_d));
- v_bo32_q =
- _mm_and_si128(v_bo32_q, _mm_unpackhi_epi32(v_inside_d, v_inside_d));
- // Compute the candidate addresses
- v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q);
- v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q);
-#else // VPX_ARCH_X86 // sizeof(intptr_t) == 4
- __m128i v_bo_d = _mm_loadu_si128((const __m128i *)&ss_os[i]);
- v_bo_d = _mm_and_si128(v_bo_d, v_inside_d);
- v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d);
-#endif
- }
-
- fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0],
- in_what_stride, (uint32_t *)&v_sad_d);
-
- // Look up the component cost of the residual motion vector
- {
- const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0);
- const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1);
- const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2);
- const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3);
- const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4);
- const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5);
- const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6);
- const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7);
-
- // Note: This is a use case for vpgather in AVX2
- const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0];
- const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1];
- const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2];
- const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3];
-
- __m128i v_cost_10_d, v_cost_32_d;
- v_cost_10_d = _mm_cvtsi32_si128(cost0);
- v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1);
- v_cost_32_d = _mm_cvtsi32_si128(cost2);
- v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1);
- v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d);
- }
-
- // Now add in the joint cost
- {
- const __m128i v_sel_d =
- _mm_cmpeq_epi32(v_diff_mv_w, _mm_setzero_si128());
- const __m128i v_joint_cost_d =
- _mm_blendv_epi8(v_joint_cost_1_d, v_joint_cost_0_d, v_sel_d);
- v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d);
- }
-
- // Multiply by sad_per_bit
- v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d);
- // ROUND_POWER_OF_TWO(v_cost_d, VP9_PROB_COST_SHIFT)
- v_cost_d = _mm_add_epi32(v_cost_d,
- _mm_set1_epi32(1 << (VP9_PROB_COST_SHIFT - 1)));
- v_cost_d = _mm_srai_epi32(v_cost_d, VP9_PROB_COST_SHIFT);
- // Add the cost to the sad
- v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d);
-
- // Make the motion vectors outside the search area have max cost
- // by or'ing in the comparison mask, this way the minimum search won't
- // pick them.
- v_sad_d = _mm_or_si128(v_sad_d, v_outside_d);
-
- // Find the minimum value and index horizontally in v_sad_d
- {
- // Try speculatively on 16 bits, so we can use the minpos intrinsic
- const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d);
- const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w);
-
- uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0);
- uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1);
-
- // If the local best value is not saturated, just use it, otherwise
- // find the horizontal minimum again the hard way on 32 bits.
- // This is executed rarely.
- if (UNLIKELY(local_best_sad == 0xffff)) {
- __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d;
-
- v_loval_d = v_sad_d;
- v_loidx_d = _mm_set_epi32(3, 2, 1, 0);
- v_hival_d = _mm_srli_si128(v_loval_d, 8);
- v_hiidx_d = _mm_srli_si128(v_loidx_d, 8);
-
- v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);
-
- v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
- v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
- v_hival_d = _mm_srli_si128(v_loval_d, 4);
- v_hiidx_d = _mm_srli_si128(v_loidx_d, 4);
-
- v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);
-
- v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
- v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
-
- local_best_sad = _mm_extract_epi32(v_loval_d, 0);
- local_best_idx = _mm_extract_epi32(v_loidx_d, 0);
- }
-
- // Update the global minimum if the local minimum is smaller
- if (LIKELY(local_best_sad < best_sad)) {
-#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
-#endif
- new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
-#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
-#pragma GCC diagnostic pop
-#endif
- new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];
-
- best_sad = local_best_sad;
- }
- }
- }
-
- bmv = new_bmv;
- best_address = new_best_address;
-
- v_bmv_w = _mm_set1_epi32((int)bmv.as_int);
-#if VPX_ARCH_X86_64
- v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
-#else
- v_ba_d = _mm_set1_epi32((intptr_t)best_address);
-#endif
-
- if (UNLIKELY(best_address == in_what)) {
- (*num00)++;
- }
- }
-
- *best_mv = bmv.as_mv;
- return best_sad;
-}
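
For reference, the cost term the deleted SIMD loop computes per candidate can be written as a scalar sketch. This is an illustration only (the helper name and the two separate cost-table pointers are not part of the library); it mirrors the absolute-difference table lookup, joint-cost addition, sad_per_bit scaling and VP9_PROB_COST_SHIFT rounding seen above.

#include <stdlib.h>

// Illustrative scalar equivalent of the per-candidate cost above; the real
// code evaluates four candidates at a time with SSE4.1.
static unsigned int candidate_mvsad_cost(const int *row_cost_table,
                                         const int *col_cost_table,
                                         int joint_cost, int diff_row,
                                         int diff_col, int sad_per_bit,
                                         int prob_cost_shift) {
  // The cost tables are even functions of the residual MV, so the absolute
  // value suffices and only half of each table is ever referenced.
  const unsigned int comp_cost =
      (unsigned int)(row_cost_table[abs(diff_row)] +
                     col_cost_table[abs(diff_col)] + joint_cost);
  // ROUND_POWER_OF_TWO(comp_cost * sad_per_bit, prob_cost_shift)
  return (comp_cost * sad_per_bit + (1u << (prob_cost_shift - 1))) >>
         prob_cost_shift;
}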
diff --git a/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/vp9/encoder/x86/vp9_frame_scale_ssse3.c
index bf0e8b121..94506aad0 100644
--- a/vp9/encoder/x86/vp9_frame_scale_ssse3.c
+++ b/vp9/encoder/x86/vp9_frame_scale_ssse3.c
@@ -469,18 +469,18 @@ static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
// It's used to choose the src offset and filter coefficient offset.
const int offset_idx1 = (offset1_q4 >> 4) & 1;
const int offset_idx2 = (offset2_q4 >> 4) & 1;
- static const shuffle_filter_funcs shuffle_filter_funcs[2] = {
+ static const shuffle_filter_funcs kShuffleFilterFuncs[2] = {
shuffle_filter_ssse3, shuffle_filter_odd_ssse3
};
- static const convolve8_funcs convolve8_funcs[2] = {
+ static const convolve8_funcs kConvolve8Funcs[2] = {
convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3
};
assert(w && h);
shuffle_filter_ssse3(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK], f0);
- shuffle_filter_funcs[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1);
- shuffle_filter_funcs[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2);
+ kShuffleFilterFuncs[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1);
+ kShuffleFilterFuncs[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2);
// Sub 64 to avoid overflow.
// Coef 128 would be treated as -128 in PMADDUBSW. Sub 64 here.
@@ -522,11 +522,11 @@ static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
// 04 14 24 34 44 54 64 74
// 05 15 25 35 45 55 65 75
d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
- d[1] = convolve8_funcs[offset_idx1](&s[offset1_q4 >> 5], f1);
- d[2] = convolve8_funcs[offset_idx2](&s[offset2_q4 >> 5], f2);
+ d[1] = kConvolve8Funcs[offset_idx1](&s[offset1_q4 >> 5], f1);
+ d[2] = kConvolve8Funcs[offset_idx2](&s[offset2_q4 >> 5], f2);
d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
- d[4] = convolve8_funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
- d[5] = convolve8_funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);
+ d[4] = kConvolve8Funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
+ d[5] = kConvolve8Funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);
// 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72
// 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73
@@ -598,11 +598,11 @@ static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
loadu_8bit_16x4(t, stride_hor, &s[4]);
d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
- d[1] = convolve8_funcs[offset_idx1](&s[offset1_q4 >> 5], f1);
- d[2] = convolve8_funcs[offset_idx2](&s[offset2_q4 >> 5], f2);
+ d[1] = kConvolve8Funcs[offset_idx1](&s[offset1_q4 >> 5], f1);
+ d[2] = kConvolve8Funcs[offset_idx2](&s[offset2_q4 >> 5], f2);
d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
- d[4] = convolve8_funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
- d[5] = convolve8_funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);
+ d[4] = kConvolve8Funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
+ d[5] = kConvolve8Funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);
// 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
// 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37
diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c
index da285be8e..bf44b0867 100644
--- a/vp9/encoder/x86/vp9_quantize_avx2.c
+++ b/vp9/encoder/x86/vp9_quantize_avx2.c
@@ -16,6 +16,8 @@
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/x86/bitdepth_conversion_avx2.h"
#include "vpx_dsp/x86/quantize_sse2.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
// Zero fill 8 positions in the output buffer.
static VPX_FORCE_INLINE void store_zero_tran_low(tran_low_t *a) {
@@ -29,11 +31,13 @@ static VPX_FORCE_INLINE void store_zero_tran_low(tran_low_t *a) {
}
static VPX_FORCE_INLINE void load_fp_values_avx2(
- const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr,
- __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant) {
- *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr));
+ const struct macroblock_plane *mb_plane, __m256i *round, __m256i *quant,
+ const int16_t *dequant_ptr, __m256i *dequant) {
+ *round = _mm256_castsi128_si256(
+ _mm_load_si128((const __m128i *)mb_plane->round_fp));
*round = _mm256_permute4x64_epi64(*round, 0x54);
- *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr));
+ *quant = _mm256_castsi128_si256(
+ _mm_load_si128((const __m128i *)mb_plane->quant_fp));
*quant = _mm256_permute4x64_epi64(*quant, 0x54);
*dequant =
_mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr));
@@ -98,13 +102,13 @@ static VPX_FORCE_INLINE void quantize_fp_16(
}
void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *round_ptr, const int16_t *quant_ptr,
+ const struct macroblock_plane *const mb_plane,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
+ const struct ScanOrder *const scan_order) {
__m256i round, quant, dequant, thr;
__m256i eob_max = _mm256_setzero_si256();
- (void)scan;
+ const int16_t *iscan = scan_order->iscan;
coeff_ptr += n_coeffs;
iscan += n_coeffs;
@@ -113,8 +117,7 @@ void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
n_coeffs = -n_coeffs;
// Setup global values
- load_fp_values_avx2(round_ptr, &round, quant_ptr, &quant, dequant_ptr,
- &dequant);
+ load_fp_values_avx2(mb_plane, &round, &quant, dequant_ptr, &dequant);
thr = _mm256_setzero_si256();
quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
@@ -203,14 +206,13 @@ static VPX_FORCE_INLINE void quantize_fp_32x32_16(
}
void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *round_ptr,
- const int16_t *quant_ptr,
+ const struct macroblock_plane *const mb_plane,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
+ const struct ScanOrder *const scan_order) {
__m256i round, quant, dequant, thr;
__m256i eob_max = _mm256_setzero_si256();
- (void)scan;
+ const int16_t *iscan = scan_order->iscan;
coeff_ptr += n_coeffs;
iscan += n_coeffs;
@@ -219,8 +221,7 @@ void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
n_coeffs = -n_coeffs;
// Setup global values
- load_fp_values_avx2(round_ptr, &round, quant_ptr, &quant, dequant_ptr,
- &dequant);
+ load_fp_values_avx2(mb_plane, &round, &quant, dequant_ptr, &dequant);
thr = _mm256_srli_epi16(dequant, 2);
quant = _mm256_slli_epi16(quant, 1);
{
@@ -286,16 +287,17 @@ static VPX_FORCE_INLINE __m256i highbd_init_256(const int16_t *val_ptr) {
}
static VPX_FORCE_INLINE void highbd_load_fp_values(
- const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr,
- __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant) {
- *round = highbd_init_256(round_ptr);
- *quant = highbd_init_256(quant_ptr);
+ const struct macroblock_plane *mb_plane, __m256i *round, __m256i *quant,
+ const int16_t *dequant_ptr, __m256i *dequant) {
+ *round = highbd_init_256(mb_plane->round_fp);
+ *quant = highbd_init_256(mb_plane->quant_fp);
*dequant = highbd_init_256(dequant_ptr);
}
static VPX_FORCE_INLINE __m256i highbd_get_max_lane_eob(
const int16_t *iscan_ptr, __m256i eobmax, __m256i nz_mask) {
- const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask);
+ const __m256i packed_nz_mask =
+ _mm256_packs_epi32(nz_mask, _mm256_setzero_si256());
const __m256i packed_nz_mask_perm =
_mm256_permute4x64_epi64(packed_nz_mask, 0xD8);
const __m256i iscan =
@@ -324,16 +326,15 @@ static VPX_FORCE_INLINE void highbd_quantize_fp(
}
void vp9_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *round_ptr,
- const int16_t *quant_ptr,
+ const struct macroblock_plane *const mb_plane,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
+ const struct ScanOrder *const scan_order) {
const int step = 8;
__m256i round, quant, dequant;
__m256i eob_max = _mm256_setzero_si256();
- (void)scan;
+ const int16_t *iscan = scan_order->iscan;
coeff_ptr += n_coeffs;
iscan += n_coeffs;
@@ -342,8 +343,7 @@ void vp9_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
n_coeffs = -n_coeffs;
// Setup global values
- highbd_load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr,
- &dequant);
+ highbd_load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant);
highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs,
iscan + n_coeffs, qcoeff_ptr + n_coeffs,
@@ -390,14 +390,14 @@ static VPX_FORCE_INLINE void highbd_quantize_fp_32x32(
}
void vp9_highbd_quantize_fp_32x32_avx2(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr,
- const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
- const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan,
- const int16_t *iscan) {
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const struct ScanOrder *const scan_order) {
const int step = 8;
__m256i round, quant, dequant, thr;
__m256i eob_max = _mm256_setzero_si256();
- (void)scan;
+ const int16_t *iscan = scan_order->iscan;
coeff_ptr += n_coeffs;
iscan += n_coeffs;
@@ -406,8 +406,7 @@ void vp9_highbd_quantize_fp_32x32_avx2(
n_coeffs = -n_coeffs;
// Setup global values
- highbd_load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr,
- &dequant);
+ highbd_load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant);
thr = _mm256_srli_epi32(dequant, 2);
// Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when
// calculating the zbin mask.
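
The AVX2 quantize_fp kernels (and the SSE2/SSSE3 variants below) now take the macroblock plane and scan order as structs instead of raw round/quant/scan pointers. A minimal call-site sketch under the new signature follows; the wrapper name is illustrative, the includes mirror the ones added above, and vp9_rtcd.h is assumed to declare the AVX2 variant as usual.

#include "./vp9_rtcd.h"
#include "vp9/common/vp9_scan.h"
#include "vp9/encoder/vp9_block.h"

// Illustrative wrapper: the round_fp/quant_fp tables now travel inside the
// macroblock_plane, and both scan tables inside the ScanOrder.
static void quantize_block_fp_avx2(const tran_low_t *coeff, intptr_t n_coeffs,
                                   const struct macroblock_plane *p,
                                   const struct ScanOrder *so,
                                   tran_low_t *qcoeff, tran_low_t *dqcoeff,
                                   const int16_t *dequant, uint16_t *eob) {
  vp9_quantize_fp_avx2(coeff, n_coeffs, p, qcoeff, dqcoeff, dequant, eob, so);
}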
diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c
index c87723443..2481eb366 100644
--- a/vp9/encoder/x86/vp9_quantize_sse2.c
+++ b/vp9/encoder/x86/vp9_quantize_sse2.c
@@ -17,12 +17,14 @@
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
#include "vpx_dsp/x86/quantize_sse2.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *round_ptr, const int16_t *quant_ptr,
+ const struct macroblock_plane *const mb_plane,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
+ const struct ScanOrder *const scan_order) {
const __m128i zero = _mm_setzero_si128();
__m128i thr;
int nzflag;
@@ -31,11 +33,10 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
__m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
__m128i qcoeff0, qcoeff1;
__m128i eob;
-
- (void)scan;
+ const int16_t *iscan = scan_order->iscan;
// Setup global values.
- load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant);
+ load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant);
// Do DC and first 15 AC.
coeff0 = load_tran_low(coeff_ptr);
diff --git a/vp9/encoder/x86/vp9_quantize_ssse3.c b/vp9/encoder/x86/vp9_quantize_ssse3.c
index d35004e37..98decae74 100644
--- a/vp9/encoder/x86/vp9_quantize_ssse3.c
+++ b/vp9/encoder/x86/vp9_quantize_ssse3.c
@@ -17,12 +17,14 @@
#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
#include "vpx_dsp/x86/quantize_sse2.h"
#include "vpx_dsp/x86/quantize_ssse3.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *round_ptr, const int16_t *quant_ptr,
+ const struct macroblock_plane *const mb_plane,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
+ const struct ScanOrder *const scan_order) {
const __m128i zero = _mm_setzero_si128();
__m128i thr;
int nzflag;
@@ -31,11 +33,10 @@ void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
__m128i coeff0, coeff1;
__m128i qcoeff0, qcoeff1;
__m128i eob;
-
- (void)scan;
+ const int16_t *iscan = scan_order->iscan;
// Setup global values.
- load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant);
+ load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant);
// Do DC and first 15 AC.
coeff0 = load_tran_low(coeff_ptr);
@@ -119,12 +120,11 @@ void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
}
void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *round_ptr,
- const int16_t *quant_ptr,
+ const struct macroblock_plane *const mb_plane,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
+ const struct ScanOrder *const scan_order) {
const __m128i zero = _mm_setzero_si128();
const __m128i one_s16 = _mm_set1_epi16(1);
__m128i thr;
@@ -134,11 +134,10 @@ void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
__m128i coeff0, coeff1;
__m128i qcoeff0, qcoeff1;
__m128i eob;
-
- (void)scan;
+ const int16_t *iscan = scan_order->iscan;
// Setup global values.
- load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant);
+ load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant);
// The 32x32 halves round.
round = _mm_add_epi16(round, one_s16);
round = _mm_srli_epi16(round, 1);
diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc
index 02e50a857..fd81bce7b 100644
--- a/vp9/ratectrl_rtc.cc
+++ b/vp9/ratectrl_rtc.cc
@@ -25,22 +25,16 @@ std::unique_ptr<VP9RateControlRTC> VP9RateControlRTC::Create(
VP9RateControlRTC());
if (!rc_api) return nullptr;
rc_api->cpi_ = static_cast<VP9_COMP *>(vpx_memalign(32, sizeof(*cpi_)));
- if (!rc_api->cpi_) {
- rc_api.reset();
- return nullptr;
- }
+ if (!rc_api->cpi_) return nullptr;
vp9_zero(*rc_api->cpi_);
- rc_api->InitRateControl(cfg);
+ if (!rc_api->InitRateControl(cfg)) return nullptr;
if (cfg.aq_mode) {
VP9_COMP *const cpi = rc_api->cpi_;
cpi->segmentation_map = static_cast<uint8_t *>(
vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols,
sizeof(*cpi->segmentation_map)));
- if (!cpi->segmentation_map) {
- rc_api.reset();
- return nullptr;
- }
+ if (!cpi->segmentation_map) return nullptr;
cpi->cyclic_refresh =
vp9_cyclic_refresh_alloc(cpi->common.mi_rows, cpi->common.mi_cols);
cpi->cyclic_refresh->content_mode = 0;
@@ -48,7 +42,30 @@ std::unique_ptr<VP9RateControlRTC> VP9RateControlRTC::Create(
return rc_api;
}
-void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) {
+VP9RateControlRTC::~VP9RateControlRTC() {
+ if (cpi_) {
+ if (cpi_->svc.number_spatial_layers > 1 ||
+ cpi_->svc.number_temporal_layers > 1) {
+ for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) {
+ for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) {
+ int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->oxcf.ts_number_layers);
+ LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer];
+ vpx_free(lc->map);
+ vpx_free(lc->last_coded_q_map);
+ vpx_free(lc->consec_zero_mv);
+ }
+ }
+ }
+ if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+ vpx_free(cpi_->segmentation_map);
+ cpi_->segmentation_map = NULL;
+ vp9_cyclic_refresh_free(cpi_->cyclic_refresh);
+ }
+ vpx_free(cpi_);
+ }
+}
+
+bool VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) {
VP9_COMMON *cm = &cpi_->common;
VP9EncoderConfig *oxcf = &cpi_->oxcf;
RATE_CONTROL *const rc = &cpi_->rc;
@@ -65,7 +82,7 @@ void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) {
cm->current_video_frame = 0;
rc->kf_boost = DEFAULT_KF_BOOST;
- UpdateRateControl(rc_cfg);
+ if (!UpdateRateControl(rc_cfg)) return false;
vp9_set_mb_mi(cm, cm->width, cm->height);
cpi_->use_svc = (cpi_->svc.number_spatial_layers > 1 ||
@@ -79,10 +96,21 @@ void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) {
vp9_rc_init(oxcf, 0, rc);
rc->constrain_gf_key_freq_onepass_vbr = 0;
cpi_->sf.use_nonrd_pick_mode = 1;
+ return true;
}
-void VP9RateControlRTC::UpdateRateControl(
+bool VP9RateControlRTC::UpdateRateControl(
const VP9RateControlRtcConfig &rc_cfg) {
+ // Since VPX_MAX_LAYERS (12) is less than the product of VPX_SS_MAX_LAYERS (5)
+ // and VPX_TS_MAX_LAYERS (5), check all three.
+ if (rc_cfg.ss_number_layers < 1 ||
+ rc_cfg.ss_number_layers > VPX_SS_MAX_LAYERS ||
+ rc_cfg.ts_number_layers < 1 ||
+ rc_cfg.ts_number_layers > VPX_TS_MAX_LAYERS ||
+ rc_cfg.ss_number_layers * rc_cfg.ts_number_layers > VPX_MAX_LAYERS) {
+ return false;
+ }
+
VP9_COMMON *cm = &cpi_->common;
VP9EncoderConfig *oxcf = &cpi_->oxcf;
RATE_CONTROL *const rc = &cpi_->rc;
@@ -102,6 +130,8 @@ void VP9RateControlRTC::UpdateRateControl(
oxcf->maximum_buffer_size_ms = rc_cfg.buf_sz;
oxcf->under_shoot_pct = rc_cfg.undershoot_pct;
oxcf->over_shoot_pct = rc_cfg.overshoot_pct;
+ oxcf->drop_frames_water_mark = rc_cfg.frame_drop_thresh;
+ oxcf->content = rc_cfg.is_screen ? VP9E_CONTENT_SCREEN : VP9E_CONTENT_DEFAULT;
oxcf->ss_number_layers = rc_cfg.ss_number_layers;
oxcf->ts_number_layers = rc_cfg.ts_number_layers;
oxcf->temporal_layering_mode = (VP9E_TEMPORAL_LAYERING_MODE)(
@@ -112,7 +142,19 @@ void VP9RateControlRTC::UpdateRateControl(
cpi_->framerate = rc_cfg.framerate;
cpi_->svc.number_spatial_layers = rc_cfg.ss_number_layers;
cpi_->svc.number_temporal_layers = rc_cfg.ts_number_layers;
+
vp9_set_mb_mi(cm, cm->width, cm->height);
+
+ if (setjmp(cpi_->common.error.jmp)) {
+ cpi_->common.error.setjmp = 0;
+ vpx_clear_system_state();
+ return false;
+ }
+ cpi_->common.error.setjmp = 1;
+
+ for (int tl = 0; tl < cpi_->svc.number_temporal_layers; ++tl) {
+ oxcf->ts_rate_decimator[tl] = rc_cfg.ts_rate_decimator[tl];
+ }
for (int sl = 0; sl < cpi_->svc.number_spatial_layers; ++sl) {
for (int tl = 0; tl < cpi_->svc.number_temporal_layers; ++tl) {
const int layer =
@@ -126,21 +168,33 @@ void VP9RateControlRTC::UpdateRateControl(
lrc->best_quality = vp9_quantizer_to_qindex(rc_cfg.min_quantizers[layer]);
lc->scaling_factor_num = rc_cfg.scaling_factor_num[sl];
lc->scaling_factor_den = rc_cfg.scaling_factor_den[sl];
- oxcf->ts_rate_decimator[tl] = rc_cfg.ts_rate_decimator[tl];
}
}
vp9_set_rc_buffer_sizes(cpi_);
vp9_new_framerate(cpi_, cpi_->framerate);
if (cpi_->svc.number_temporal_layers > 1 ||
cpi_->svc.number_spatial_layers > 1) {
- if (cm->current_video_frame == 0) vp9_init_layer_context(cpi_);
+ if (cm->current_video_frame == 0) {
+ vp9_init_layer_context(cpi_);
+        // svc->framedrop_mode is not currently exposed, so only full
+        // superframe drop is allowed for now.
+ cpi_->svc.framedrop_mode = FULL_SUPERFRAME_DROP;
+ }
vp9_update_layer_context_change_config(cpi_,
(int)cpi_->oxcf.target_bandwidth);
+ cpi_->svc.max_consec_drop = rc_cfg.max_consec_drop;
}
vp9_check_reset_rc_flag(cpi_);
+
+ cpi_->common.error.setjmp = 0;
+ return true;
}
-void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) {
+// Compute the QP for the frame. If the frame is dropped, this function
+// returns kDrop and no QP is computed. If the frame is encoded (not dropped),
+// the QP is computed and kOk is returned.
+FrameDropDecision VP9RateControlRTC::ComputeQP(
+ const VP9FrameParamsQpRTC &frame_params) {
VP9_COMMON *const cm = &cpi_->common;
int width, height;
cpi_->svc.spatial_layer_id = frame_params.spatial_layer_id;
@@ -157,7 +211,7 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) {
cm->height = height;
}
vp9_set_mb_mi(cm, cm->width, cm->height);
- cm->frame_type = frame_params.frame_type;
+ cm->frame_type = static_cast<FRAME_TYPE>(frame_params.frame_type);
// This is needed to ensure key frame does not get unset in rc_get_svc_params.
cpi_->frame_flags = (cm->frame_type == KEY_FRAME) ? FRAMEFLAGS_KEY : 0;
cpi_->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0;
@@ -192,11 +246,51 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) {
vp9_restore_layer_context(cpi_);
vp9_rc_get_svc_params(cpi_);
}
+ if (cpi_->svc.spatial_layer_id == 0) vp9_zero(cpi_->svc.drop_spatial_layer);
+ // SVC: check for skip encoding of enhancement layer if the
+ // layer target bandwidth = 0.
+ if (vp9_svc_check_skip_enhancement_layer(cpi_))
+ return FrameDropDecision::kDrop;
+ // Check for dropping this frame based on buffer level.
+  // Never drop on a key frame, or if the base layer is a key frame for SVC.
+ if (!frame_is_intra_only(cm) &&
+ (!cpi_->use_svc ||
+ !cpi_->svc.layer_context[cpi_->svc.temporal_layer_id].is_key_frame)) {
+ if (vp9_rc_drop_frame(cpi_)) {
+      // For FULL_SUPERFRAME_DROP mode (the only mode considered here):
+      // once the superframe drop is decided, save the layer context and call
+      // update_buffer_level and postencode_drop for every spatial layer.
+ if (cpi_->svc.number_spatial_layers > 1 ||
+ cpi_->svc.number_temporal_layers > 1) {
+ vp9_save_layer_context(cpi_);
+ for (int sl = 1; sl < cpi_->svc.number_spatial_layers; sl++) {
+ cpi_->svc.spatial_layer_id = sl;
+ vp9_restore_layer_context(cpi_);
+ vp9_update_buffer_level_svc_preencode(cpi_);
+ vp9_rc_postencode_update_drop_frame(cpi_);
+ vp9_save_layer_context(cpi_);
+ }
+ }
+ return FrameDropDecision::kDrop;
+ }
+ }
+ // Compute the QP for the frame.
int bottom_index, top_index;
cpi_->common.base_qindex =
vp9_rc_pick_q_and_bounds(cpi_, &bottom_index, &top_index);
if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_setup(cpi_);
+ if (cpi_->svc.number_spatial_layers > 1 ||
+ cpi_->svc.number_temporal_layers > 1)
+ vp9_save_layer_context(cpi_);
+
+ cpi_->last_frame_dropped = 0;
+ cpi_->svc.last_layer_dropped[cpi_->svc.spatial_layer_id] = 0;
+ if (cpi_->svc.spatial_layer_id == cpi_->svc.number_spatial_layers - 1)
+ cpi_->svc.num_encoded_top_layer++;
+
+ return FrameDropDecision::kOk;
}
int VP9RateControlRTC::GetQP() const { return cpi_->common.base_qindex; }
@@ -219,7 +313,31 @@ bool VP9RateControlRTC::GetSegmentationData(
return true;
}
-void VP9RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) {
+void VP9RateControlRTC::PostEncodeUpdate(
+ uint64_t encoded_frame_size, const VP9FrameParamsQpRTC &frame_params) {
+ cpi_->common.frame_type = static_cast<FRAME_TYPE>(frame_params.frame_type);
+ cpi_->svc.spatial_layer_id = frame_params.spatial_layer_id;
+ cpi_->svc.temporal_layer_id = frame_params.temporal_layer_id;
+ if (cpi_->svc.number_spatial_layers > 1 ||
+ cpi_->svc.number_temporal_layers > 1) {
+ vp9_restore_layer_context(cpi_);
+ const int layer = LAYER_IDS_TO_IDX(cpi_->svc.spatial_layer_id,
+ cpi_->svc.temporal_layer_id,
+ cpi_->svc.number_temporal_layers);
+ LAYER_CONTEXT *lc = &cpi_->svc.layer_context[layer];
+ cpi_->common.base_qindex = lc->frame_qp;
+ cpi_->common.MBs = lc->MBs;
+    // For spatial SVC, allow cyclic refresh to be applied to the spatial
+    // layers of the base temporal layer.
+ if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+ cpi_->svc.number_spatial_layers > 1 &&
+ cpi_->svc.temporal_layer_id == 0) {
+ CYCLIC_REFRESH *const cr = cpi_->cyclic_refresh;
+ cr->qindex_delta[0] = lc->qindex_delta[0];
+ cr->qindex_delta[1] = lc->qindex_delta[1];
+ cr->qindex_delta[2] = lc->qindex_delta[2];
+ }
+ }
vp9_rc_postencode_update(cpi_, encoded_frame_size);
if (cpi_->svc.number_spatial_layers > 1 ||
cpi_->svc.number_temporal_layers > 1)
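
The destructor and PostEncodeUpdate above address per-layer state through LAYER_IDS_TO_IDX. As an orientation aid, here is a sketch of the equivalent arithmetic, assuming the usual row-major linearization of (spatial, temporal) layer pairs used in these loops.

// Illustration only: with 3 temporal layers, (sl = 1, tl = 2) maps to
// index 1 * 3 + 2 = 5 in svc.layer_context[].
static int layer_index(int sl, int tl, int num_temporal_layers) {
  return sl * num_temporal_layers + tl;
}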
diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h
index b209e4db6..85005c547 100644
--- a/vp9/ratectrl_rtc.h
+++ b/vp9/ratectrl_rtc.h
@@ -14,22 +14,20 @@
#include <cstdint>
#include <memory>
-#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_enums.h"
-#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/vp9_iface_common.h"
#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
-#include "vp9/encoder/vp9_encoder.h"
-#include "vp9/encoder/vp9_firstpass.h"
#include "vp9/vp9_cx_iface.h"
#include "vpx/internal/vpx_ratectrl_rtc.h"
#include "vpx_mem/vpx_mem.h"
-namespace libvpx {
+struct VP9_COMP;
+namespace libvpx {
struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig {
public:
VP9RateControlRtcConfig() {
+ ss_number_layers = 1;
vp9_zero(max_quantizers);
vp9_zero(min_quantizers);
vp9_zero(scaling_factor_den);
@@ -40,20 +38,21 @@ struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig {
scaling_factor_den[0] = 1;
max_quantizers[0] = max_quantizer;
min_quantizers[0] = min_quantizer;
+ max_consec_drop = INT_MAX;
}
// Number of spatial layers
int ss_number_layers;
- // Number of temporal layers
- int ts_number_layers;
int max_quantizers[VPX_MAX_LAYERS];
int min_quantizers[VPX_MAX_LAYERS];
int scaling_factor_num[VPX_SS_MAX_LAYERS];
int scaling_factor_den[VPX_SS_MAX_LAYERS];
+ // This is only for SVC for now.
+ int max_consec_drop;
};
struct VP9FrameParamsQpRTC {
- FRAME_TYPE frame_type;
+ RcFrameType frame_type;
int spatial_layer_id;
int temporal_layer_id;
};
@@ -69,63 +68,46 @@ struct VP9SegmentationData {
// the encoder. To use this interface, you need to link with libvpxrc.a.
//
// #include "vp9/ratectrl_rtc.h"
-// VP9RateControlRTC rc_api;
// VP9RateControlRtcConfig cfg;
// VP9FrameParamsQpRTC frame_params;
//
// YourFunctionToInitializeConfig(cfg);
-// rc_api.InitRateControl(cfg);
+// std::unique_ptr<VP9RateControlRTC> rc_api = VP9RateControlRTC::Create(cfg);
// // start encoding
// while (frame_to_encode) {
// if (config_changed)
-// rc_api.UpdateRateControl(cfg);
+// rc_api->UpdateRateControl(cfg);
// YourFunctionToFillFrameParams(frame_params);
-// rc_api.ComputeQP(frame_params);
-// YourFunctionToUseQP(rc_api.GetQP());
-// YourFunctionToUseLoopfilter(rc_api.GetLoopfilterLevel());
+// rc_api->ComputeQP(frame_params);
+// YourFunctionToUseQP(rc_api->GetQP());
+// YourFunctionToUseLoopfilter(rc_api->GetLoopfilterLevel());
// // After encoding
-// rc_api.PostEncode(encoded_frame_size);
+// rc_api->PostEncode(encoded_frame_size, frame_params);
// }
class VP9RateControlRTC {
public:
static std::unique_ptr<VP9RateControlRTC> Create(
const VP9RateControlRtcConfig &cfg);
- ~VP9RateControlRTC() {
- if (cpi_) {
- if (cpi_->svc.number_spatial_layers > 1 ||
- cpi_->svc.number_temporal_layers > 1) {
- for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) {
- for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) {
- int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->oxcf.ts_number_layers);
- LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer];
- vpx_free(lc->map);
- vpx_free(lc->last_coded_q_map);
- vpx_free(lc->consec_zero_mv);
- }
- }
- }
- if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
- vpx_free(cpi_->segmentation_map);
- cpi_->segmentation_map = NULL;
- vp9_cyclic_refresh_free(cpi_->cyclic_refresh);
- }
- vpx_free(cpi_);
- }
- }
+ ~VP9RateControlRTC();
- void UpdateRateControl(const VP9RateControlRtcConfig &rc_cfg);
+ bool UpdateRateControl(const VP9RateControlRtcConfig &rc_cfg);
// GetQP() needs to be called after ComputeQP() to get the latest QP
int GetQP() const;
int GetLoopfilterLevel() const;
bool GetSegmentationData(VP9SegmentationData *segmentation_data) const;
- void ComputeQP(const VP9FrameParamsQpRTC &frame_params);
+  // ComputeQP computes the QP if the frame is not dropped (kOk return);
+  // otherwise it returns kDrop, and GetQP and PostEncodeUpdate must not be
+  // called for that frame (vp9_rc_postencode_update_drop_frame has already
+  // been called from within ComputeQP when the drop is decided).
+ FrameDropDecision ComputeQP(const VP9FrameParamsQpRTC &frame_params);
// Feedback to rate control with the size of current encoded frame
- void PostEncodeUpdate(uint64_t encoded_frame_size);
+ void PostEncodeUpdate(uint64_t encoded_frame_size,
+ const VP9FrameParamsQpRTC &frame_params);
private:
VP9RateControlRTC() {}
- void InitRateControl(const VP9RateControlRtcConfig &cfg);
- VP9_COMP *cpi_;
+ bool InitRateControl(const VP9RateControlRtcConfig &cfg);
+ struct VP9_COMP *cpi_;
};
} // namespace libvpx
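
Putting the new ComputeQP contract together, a hedged caller-side sketch follows. EncodeWithQP is a stand-in for the caller's own encode step, and FrameDropDecision is assumed to be visible through the internal ratectrl header included above.

#include <cstdint>
#include "vp9/ratectrl_rtc.h"

using namespace libvpx;

uint64_t EncodeWithQP(int qp);  // hypothetical, supplied by the caller

void EncodeOneFrame(VP9RateControlRTC *rc_api,
                    const VP9FrameParamsQpRTC &frame_params) {
  if (rc_api->ComputeQP(frame_params) == FrameDropDecision::kDrop) {
    // ComputeQP has already done the post-drop rate-control bookkeeping,
    // so GetQP() and PostEncodeUpdate() are skipped for this frame.
    return;
  }
  const int qp = rc_api->GetQP();
  rc_api->PostEncodeUpdate(EncodeWithQP(qp), frame_params);
}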
diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc
index f42912d35..2e6f9a451 100644
--- a/vp9/simple_encode.cc
+++ b/vp9/simple_encode.cc
@@ -143,7 +143,6 @@ get_frame_type_from_update_type(FRAME_UPDATE_TYPE update_type) {
default:
fprintf(stderr, "Unsupported update_type %d\n", update_type);
abort();
- return kFrameTypeInter;
}
}
@@ -183,10 +182,11 @@ static void update_motion_vector_info(
const MV_REFERENCE_FRAME *in_ref_frame =
input_motion_vector_info[i].ref_frame;
output_motion_vector_info[i].mv_count =
- (in_ref_frame[0] == INTRA_FRAME) ? 0
- : ((in_ref_frame[1] == NONE) ? 1 : 2);
- if (in_ref_frame[0] == NONE) {
- fprintf(stderr, "in_ref_frame[0] shouldn't be NONE\n");
+ (in_ref_frame[0] == INTRA_FRAME)
+ ? 0
+ : ((in_ref_frame[1] == NO_REF_FRAME) ? 1 : 2);
+ if (in_ref_frame[0] == NO_REF_FRAME) {
+ fprintf(stderr, "in_ref_frame[0] shouldn't be NO_REF_FRAME\n");
abort();
}
output_motion_vector_info[i].ref_frame[0] =
diff --git a/vp9/simple_encode.h b/vp9/simple_encode.h
index 7920e95ee..d610a5e15 100644
--- a/vp9/simple_encode.h
+++ b/vp9/simple_encode.h
@@ -309,7 +309,7 @@ struct EncodeFrameResult {
// The tpl stats stored in the vector is according to the encoding order.
// For example, suppose there are N show frames for the current GOP.
// Then tpl_stats_info[0] stores the information of the first frame to be
- // encoded for this GOP, i.e, the AltRef frame.
+ // encoded for this GOP, i.e., the AltRef frame.
std::vector<TplStatsInfo> tpl_stats_info;
ImageBuffer coded_frame;
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index dee175dc0..e738feda0 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -29,6 +29,8 @@
#include "vp9/vp9_cx_iface.h"
#include "vp9/vp9_iface_common.h"
+#include "vpx/vpx_tpl.h"
+
typedef struct vp9_extracfg {
int cpu_used; // available cpu percentage in 1/16
unsigned int enable_auto_alt_ref;
@@ -129,6 +131,8 @@ struct vpx_codec_alg_priv {
BufferPool *buffer_pool;
};
+// Called by encoder_set_config() and encoder_encode() only. Must not be called
+// by encoder_init().
static vpx_codec_err_t update_error_state(
vpx_codec_alg_priv_t *ctx, const struct vpx_internal_error_info *error) {
const vpx_codec_err_t res = error->error_code;
@@ -635,8 +639,12 @@ static vpx_codec_err_t set_encoder_config(
for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
for (tl = 0; tl < oxcf->ts_number_layers; ++tl) {
- oxcf->layer_target_bitrate[sl * oxcf->ts_number_layers + tl] =
- 1000 * cfg->layer_target_bitrate[sl * oxcf->ts_number_layers + tl];
+ const int layer = sl * oxcf->ts_number_layers + tl;
+ if (cfg->layer_target_bitrate[layer] > INT_MAX / 1000)
+ oxcf->layer_target_bitrate[layer] = INT_MAX;
+ else
+ oxcf->layer_target_bitrate[layer] =
+ 1000 * cfg->layer_target_bitrate[layer];
}
}
if (oxcf->ss_number_layers == 1 && oxcf->pass != 0) {
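
The layer bitrate clamp above guards the kbps-to-bps multiply against signed overflow; the same saturating conversion as a standalone sketch (the helper name is illustrative):

#include <limits.h>

// Saturate at INT_MAX rather than overflowing when converting kbps to bps.
static int kbps_to_bps_saturating(int kbps) {
  return (kbps > INT_MAX / 1000) ? INT_MAX : kbps * 1000;
}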
@@ -789,10 +797,22 @@ static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx,
if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) {
if (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS)
ERROR("Cannot change width or height after initialization");
- if (!valid_ref_frame_size(ctx->cfg.g_w, ctx->cfg.g_h, cfg->g_w, cfg->g_h) ||
+      // Note: encoder_set_config() may be called multiple times. However,
+      // when the original frame width or height is less than twice the new
+      // frame width or height, a forced key frame should be used. To ensure
+      // correct detection of a forced key frame, the frame width and height
+      // must only be updated when the actual encoding is performed.
+      // cpi->last_coded_width and cpi->last_coded_height track the actual
+      // coded frame size.
+ if ((ctx->cpi->last_coded_width && ctx->cpi->last_coded_height &&
+ !valid_ref_frame_size(ctx->cpi->last_coded_width,
+ ctx->cpi->last_coded_height, cfg->g_w,
+ cfg->g_h)) ||
(ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) ||
- (ctx->cpi->initial_height && (int)cfg->g_h > ctx->cpi->initial_height))
+ (ctx->cpi->initial_height &&
+ (int)cfg->g_h > ctx->cpi->initial_height)) {
force_key = 1;
+ }
}
// Prevent increasing lag_in_frames. This check is stricter than it needs
@@ -813,6 +833,7 @@ static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx,
assert(codec_err != VPX_CODEC_OK);
return codec_err;
}
+ ctx->cpi->common.error.setjmp = 1;
ctx->cfg = *cfg;
set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
@@ -1068,6 +1089,7 @@ static vpx_codec_err_t ctrl_set_rtc_external_ratectrl(vpx_codec_alg_priv_t *ctx,
cpi->compute_frame_low_motion_onepass = 0;
cpi->rc.constrain_gf_key_freq_onepass_vbr = 0;
cpi->cyclic_refresh->content_mode = 0;
+ cpi->disable_scene_detection_rtc_ratectrl = 1;
}
return VPX_CODEC_OK;
}
@@ -1300,6 +1322,9 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
if (cpi == NULL) return VPX_CODEC_INVALID_PARAM;
+ cpi->last_coded_width = ctx->oxcf.width;
+ cpi->last_coded_height = ctx->oxcf.height;
+
if (img != NULL) {
res = validate_img(ctx, img);
if (res == VPX_CODEC_OK) {
@@ -1631,13 +1656,9 @@ static vpx_codec_err_t ctrl_set_roi_map(vpx_codec_alg_priv_t *ctx,
if (data) {
vpx_roi_map_t *roi = (vpx_roi_map_t *)data;
-
- if (!vp9_set_roi_map(ctx->cpi, roi->roi_map, roi->rows, roi->cols,
- roi->delta_q, roi->delta_lf, roi->skip,
- roi->ref_frame)) {
- return VPX_CODEC_OK;
- }
- return VPX_CODEC_INVALID_PARAM;
+ return vp9_set_roi_map(ctx->cpi, roi->roi_map, roi->rows, roi->cols,
+ roi->delta_q, roi->delta_lf, roi->skip,
+ roi->ref_frame);
}
return VPX_CODEC_INVALID_PARAM;
}
@@ -1675,9 +1696,8 @@ static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx,
vpx_scaling_mode_t *const mode = va_arg(args, vpx_scaling_mode_t *);
if (mode) {
- const int res =
- vp9_set_internal_size(ctx->cpi, (VPX_SCALING)mode->h_scaling_mode,
- (VPX_SCALING)mode->v_scaling_mode);
+ const int res = vp9_set_internal_size(ctx->cpi, mode->h_scaling_mode,
+ mode->v_scaling_mode);
return (res == 0) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM;
}
return VPX_CODEC_INVALID_PARAM;
@@ -1933,16 +1953,28 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx,
const FRAME_INFO *frame_info = &cpi->frame_info;
vpx_rc_config_t ratectrl_config;
vpx_codec_err_t codec_status;
+ memset(&ratectrl_config, 0, sizeof(ratectrl_config));
ratectrl_config.frame_width = frame_info->frame_width;
ratectrl_config.frame_height = frame_info->frame_height;
ratectrl_config.show_frame_count = cpi->twopass.first_pass_info.num_frames;
-
+ ratectrl_config.max_gf_interval = oxcf->max_gf_interval;
+ ratectrl_config.min_gf_interval = oxcf->min_gf_interval;
// TODO(angiebird): Double check whether this is the proper way to set up
// target_bitrate and frame_rate.
ratectrl_config.target_bitrate_kbps = (int)(oxcf->target_bandwidth / 1000);
ratectrl_config.frame_rate_num = oxcf->g_timebase.den;
ratectrl_config.frame_rate_den = oxcf->g_timebase.num;
+ ratectrl_config.overshoot_percent = oxcf->over_shoot_pct;
+ ratectrl_config.undershoot_percent = oxcf->under_shoot_pct;
+
+ if (oxcf->rc_mode == VPX_VBR) {
+ ratectrl_config.rc_mode = VPX_RC_VBR;
+ } else if (oxcf->rc_mode == VPX_Q) {
+ ratectrl_config.rc_mode = VPX_RC_QMODE;
+ } else if (oxcf->rc_mode == VPX_CQ) {
+ ratectrl_config.rc_mode = VPX_RC_CQ;
+ }
codec_status = vp9_extrc_create(funcs, ratectrl_config, ext_ratectrl);
if (codec_status != VPX_CODEC_OK) {
@@ -2065,8 +2097,8 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
0, // rc_resize_allowed
0, // rc_scaled_width
0, // rc_scaled_height
- 60, // rc_resize_down_thresold
- 30, // rc_resize_up_thresold
+ 60, // rc_resize_down_thresh
+ 30, // rc_resize_up_thresh
VPX_VBR, // rc_end_usage
{ NULL, 0 }, // rc_twopass_stats_in
@@ -2099,7 +2131,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
{ 0 }, // ts_rate_decimator
0, // ts_periodicity
{ 0 }, // ts_layer_id
- { 0 }, // layer_taget_bitrate
+ { 0 }, // layer_target_bitrate
0, // temporal_layering_mode
0, // use_vizier_rc_params
{ 1, 1 }, // active_wq_factor
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index bdfe21793..a242c776c 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -256,6 +256,7 @@ static void set_ppflags(const vpx_codec_alg_priv_t *ctx, vp9_ppflags_t *flags) {
} while (0)
static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) {
+ vpx_codec_err_t res;
ctx->last_show_frame = -1;
ctx->need_resync = 1;
ctx->flushed = 0;
@@ -265,6 +266,8 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) {
ctx->pbi = vp9_decoder_create(ctx->buffer_pool);
if (ctx->pbi == NULL) {
+ vpx_free(ctx->buffer_pool);
+ ctx->buffer_pool = NULL;
set_error_detail(ctx, "Failed to allocate decoder");
return VPX_CODEC_MEM_ERROR;
}
@@ -282,7 +285,14 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) {
if (!ctx->postproc_cfg_set && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC))
set_default_ppflags(&ctx->postproc_cfg);
- return init_buffer_callbacks(ctx);
+ res = init_buffer_callbacks(ctx);
+ if (res != VPX_CODEC_OK) {
+ vpx_free(ctx->buffer_pool);
+ ctx->buffer_pool = NULL;
+ vp9_decoder_remove(ctx->pbi);
+ ctx->pbi = NULL;
+ }
+ return res;
}
static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx,
@@ -348,7 +358,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
// Initialize the decoder on the first frame.
if (ctx->pbi == NULL) {
- const vpx_codec_err_t res = init_decoder(ctx);
+ res = init_decoder(ctx);
if (res != VPX_CODEC_OK) return res;
}
@@ -367,7 +377,6 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
for (i = 0; i < frame_count; ++i) {
const uint8_t *data_start_copy = data_start;
const uint32_t frame_size = frame_sizes[i];
- vpx_codec_err_t res;
if (data_start < data || frame_size > (uint32_t)(data_end - data_start)) {
set_error_detail(ctx, "Invalid frame size in index");
return VPX_CODEC_CORRUPT_FRAME;
@@ -382,8 +391,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
const uint8_t *const data_end = data + data_sz;
while (data_start < data_end) {
const uint32_t frame_size = (uint32_t)(data_end - data_start);
- const vpx_codec_err_t res =
- decode_one(ctx, &data_start, frame_size, user_priv, deadline);
+ res = decode_one(ctx, &data_start, frame_size, user_priv, deadline);
if (res != VPX_CODEC_OK) return res;
// Account for suboptimal termination by the encoder.
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 9072628f2..44790ef6a 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -40,6 +40,7 @@ VP9_CX_SRCS-yes += encoder/vp9_encodemb.h
VP9_CX_SRCS-yes += encoder/vp9_encodemv.h
VP9_CX_SRCS-yes += encoder/vp9_extend.h
VP9_CX_SRCS-yes += encoder/vp9_firstpass.h
+VP9_CX_SRCS-yes += encoder/vp9_firstpass_stats.h
VP9_CX_SRCS-yes += encoder/vp9_frame_scale.c
VP9_CX_SRCS-yes += encoder/vp9_job_queue.h
VP9_CX_SRCS-yes += encoder/vp9_lookahead.c
@@ -104,20 +105,24 @@ VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c
endif
VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.c
VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h
+VP9_CX_SRCS-yes += encoder/vp9_tpl_model.c
+VP9_CX_SRCS-yes += encoder/vp9_tpl_model.h
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c
-VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_constants.h
+VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/vp9_temporal_filter_constants.h
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_temporal_filter_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/vp9_temporal_filter_constants.h
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_quantize_avx2.c
-VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_diamond_search_sad_neon.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_temporal_filter_sse4.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_highbd_temporal_filter_neon.c
endif
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm
@@ -134,11 +139,12 @@ endif
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_avx2.c
-ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c
-endif
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_frame_scale_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_highbd_error_neon.c
+endif
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c
@@ -156,8 +162,10 @@ VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_firstpass.c
VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_mbgraph.c
VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_temporal_filter.c
VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_sse4.c
-VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_constants.h
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_temporal_filter_constants.h
VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/highbd_temporal_filter_sse4.c
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/arm/neon/vp9_temporal_filter_neon.c
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/arm/neon/vp9_highbd_temporal_filter_neon.c
VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_alt_ref_aq.h
VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_alt_ref_aq.c
VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_variance.c