diff options
author | Vignesh Venkatasubramanian <vigneshv@google.com> | 2014-08-15 09:32:37 -0700 |
---|---|---|
committer | Vignesh Venkatasubramanian <vigneshv@google.com> | 2014-08-15 09:32:37 -0700 |
commit | ba6c59e9d7d7013b3906b6f4230b663422681848 (patch) | |
tree | fd3686d3a1a5598e6ea35b0006cfeb5d40a9eaa3 /libvpx/vp9 | |
parent | bbeabeb879e7fa51c6395ee7ad590617dfbd5299 (diff) | |
download | libvpx-ba6c59e9d7d7013b3906b6f4230b663422681848.tar.gz |
libvpx: Pull from upstream
Upstream hash: d4a47a6cc0d869bea3071c15bc61da6836026d0b
Pull latest libvpx from upstream. This fixes a few vp9 encoder bugs and
includes some optimizations. Also fixes a couple of configure flags in
x86 to be consistent with the rest.
Change-Id: Ic58e0b03cce832571a35ec73eec559cdf881d1f5
Diffstat (limited to 'libvpx/vp9')
156 files changed, 13411 insertions, 15166 deletions
diff --git a/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c b/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c index d8b24bfaf..f0881b5ae 100644 --- a/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c +++ b/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c @@ -25,12 +25,14 @@ void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, // Account for the vertical phase needing 3 lines prior and 4 lines post int intermediate_height = h + 7; - if (x_step_q4 != 16 || y_step_q4 != 16) - return vp9_convolve8_c(src, src_stride, - dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); + if (x_step_q4 != 16 || y_step_q4 != 16) { + vp9_convolve8_c(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + return; + } /* Filter starting 3 lines back. The neon implementation will ignore the * given height and filter a multiple of 4 lines. Since this goes in to @@ -57,12 +59,14 @@ void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72); int intermediate_height = h + 7; - if (x_step_q4 != 16 || y_step_q4 != 16) - return vp9_convolve8_avg_c(src, src_stride, - dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); + if (x_step_q4 != 16 || y_step_q4 != 16) { + vp9_convolve8_avg_c(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + return; + } /* This implementation has the same issues as above. In addition, we only want * to average the values after both passes. 
diff --git a/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm index 54764008b..ab5bb6920 100644 --- a/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm @@ -9,7 +9,7 @@ ; EXPORT |vp9_idct8x8_64_add_neon| - EXPORT |vp9_idct8x8_10_add_neon| + EXPORT |vp9_idct8x8_12_add_neon| ARM REQUIRE8 PRESERVE8 @@ -310,13 +310,13 @@ bx lr ENDP ; |vp9_idct8x8_64_add_neon| -;void vp9_idct8x8_10_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +;void vp9_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) ; ; r0 int16_t input ; r1 uint8_t *dest ; r2 int dest_stride) -|vp9_idct8x8_10_add_neon| PROC +|vp9_idct8x8_12_add_neon| PROC push {r4-r9} vpush {d8-d15} vld1.s16 {q8,q9}, [r0]! @@ -514,6 +514,6 @@ vpop {d8-d15} pop {r4-r9} bx lr - ENDP ; |vp9_idct8x8_10_add_neon| + ENDP ; |vp9_idct8x8_12_add_neon| END diff --git a/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c index 0820db247..bc6a17cd1 100644 --- a/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c +++ b/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c @@ -9,6 +9,7 @@ */ #include "./vp9_rtcd.h" +#include "vpx/vpx_integer.h" void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */, const uint8_t *blimit0, diff --git a/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c index acccaea6d..fc44ffa31 100644 --- a/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c +++ b/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c @@ -617,7 +617,7 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, } } -void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest, +void vp9_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, int dest_stride) { DECLARE_ALIGNED(32, int16_t, out[8 * 8]); int16_t *outptr = out; diff --git a/libvpx/vp9/common/vp9_alloccommon.c 
b/libvpx/vp9/common/vp9_alloccommon.c index f44ada1b9..b379656b3 100644 --- a/libvpx/vp9/common/vp9_alloccommon.c +++ b/libvpx/vp9/common/vp9_alloccommon.c @@ -28,7 +28,10 @@ static void clear_mi_border(const VP9_COMMON *cm, MODE_INFO *mi) { vpx_memset(&mi[i * cm->mi_stride], 0, sizeof(*mi)); } -static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) { +void vp9_set_mb_mi(VP9_COMMON *cm, int width, int height) { + const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2); + const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2); + cm->mi_cols = aligned_width >> MI_SIZE_LOG2; cm->mi_rows = aligned_height >> MI_SIZE_LOG2; cm->mi_stride = cm->mi_cols + MI_BLOCK_SIZE; @@ -53,32 +56,41 @@ static void setup_mi(VP9_COMMON *cm) { } static int alloc_mi(VP9_COMMON *cm, int mi_size) { - cm->mip = (MODE_INFO *)vpx_calloc(mi_size, sizeof(*cm->mip)); - if (cm->mip == NULL) - return 1; + int i; - cm->prev_mip = (MODE_INFO *)vpx_calloc(mi_size, sizeof(*cm->prev_mip)); - if (cm->prev_mip == NULL) - return 1; + for (i = 0; i < 2; ++i) { + cm->mip_array[i] = + (MODE_INFO *)vpx_calloc(mi_size, sizeof(*cm->mip)); + if (cm->mip_array[i] == NULL) + return 1; - cm->mi_grid_base = - (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->mi_grid_base)); - if (cm->mi_grid_base == NULL) - return 1; + cm->mi_grid_base_array[i] = + (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->mi_grid_base)); + if (cm->mi_grid_base_array[i] == NULL) + return 1; + } + + // Init the index. 
+ cm->mi_idx = 0; + cm->prev_mi_idx = 1; - cm->prev_mi_grid_base = - (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->prev_mi_grid_base)); - if (cm->prev_mi_grid_base == NULL) - return 1; + cm->mip = cm->mip_array[cm->mi_idx]; + cm->prev_mip = cm->mip_array[cm->prev_mi_idx]; + cm->mi_grid_base = cm->mi_grid_base_array[cm->mi_idx]; + cm->prev_mi_grid_base = cm->mi_grid_base_array[cm->prev_mi_idx]; return 0; } static void free_mi(VP9_COMMON *cm) { - vpx_free(cm->mip); - vpx_free(cm->prev_mip); - vpx_free(cm->mi_grid_base); - vpx_free(cm->prev_mi_grid_base); + int i; + + for (i = 0; i < 2; ++i) { + vpx_free(cm->mip_array[i]); + cm->mip_array[i] = NULL; + vpx_free(cm->mi_grid_base_array[i]); + cm->mi_grid_base_array[i] = NULL; + } cm->mip = NULL; cm->prev_mip = NULL; @@ -86,7 +98,7 @@ static void free_mi(VP9_COMMON *cm) { cm->prev_mi_grid_base = NULL; } -void vp9_free_frame_buffers(VP9_COMMON *cm) { +void vp9_free_ref_frame_buffers(VP9_COMMON *cm) { int i; for (i = 0; i < FRAME_BUFFERS; ++i) { @@ -100,7 +112,9 @@ void vp9_free_frame_buffers(VP9_COMMON *cm) { } vp9_free_frame_buffer(&cm->post_proc_buffer); +} +void vp9_free_context_buffers(VP9_COMMON *cm) { free_mi(cm); vpx_free(cm->last_frame_seg_map); @@ -113,137 +127,95 @@ void vp9_free_frame_buffers(VP9_COMMON *cm) { cm->above_seg_context = NULL; } -int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height) { - const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2); - const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2); - const int ss_x = cm->subsampling_x; - const int ss_y = cm->subsampling_y; - - if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y, - VP9_DEC_BORDER_IN_PIXELS, NULL, NULL, NULL) < 0) - goto fail; - - set_mb_mi(cm, aligned_width, aligned_height); - - free_mi(cm); - if (alloc_mi(cm, cm->mi_stride * (cm->mi_rows + MI_BLOCK_SIZE))) - goto fail; +int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) { + 
vp9_free_context_buffers(cm); - setup_mi(cm); + vp9_set_mb_mi(cm, width, height); + if (alloc_mi(cm, cm->mi_stride * (cm->mi_rows + MI_BLOCK_SIZE))) goto fail; - // Create the segmentation map structure and set to 0. - vpx_free(cm->last_frame_seg_map); cm->last_frame_seg_map = (uint8_t *)vpx_calloc(cm->mi_rows * cm->mi_cols, 1); - if (!cm->last_frame_seg_map) - goto fail; + if (!cm->last_frame_seg_map) goto fail; - vpx_free(cm->above_context); - cm->above_context = - (ENTROPY_CONTEXT *)vpx_calloc(2 * mi_cols_aligned_to_sb(cm->mi_cols) * - MAX_MB_PLANE, - sizeof(*cm->above_context)); - if (!cm->above_context) - goto fail; + cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc( + 2 * mi_cols_aligned_to_sb(cm->mi_cols) * MAX_MB_PLANE, + sizeof(*cm->above_context)); + if (!cm->above_context) goto fail; - vpx_free(cm->above_seg_context); - cm->above_seg_context = - (PARTITION_CONTEXT *)vpx_calloc(mi_cols_aligned_to_sb(cm->mi_cols), - sizeof(*cm->above_seg_context)); - if (!cm->above_seg_context) - goto fail; + cm->above_seg_context = (PARTITION_CONTEXT *)vpx_calloc( + mi_cols_aligned_to_sb(cm->mi_cols), sizeof(*cm->above_seg_context)); + if (!cm->above_seg_context) goto fail; return 0; fail: - vp9_free_frame_buffers(cm); + vp9_free_context_buffers(cm); return 1; } -int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) { - const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2); - const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2); - const int ss_x = cm->subsampling_x; - const int ss_y = cm->subsampling_y; +static void init_frame_bufs(VP9_COMMON *cm) { int i; - vp9_free_frame_buffers(cm); - - for (i = 0; i < FRAME_BUFFERS; i++) { - cm->frame_bufs[i].ref_count = 0; - if (vp9_alloc_frame_buffer(&cm->frame_bufs[i].buf, width, height, - ss_x, ss_y, VP9_ENC_BORDER_IN_PIXELS) < 0) - goto fail; - } - cm->new_fb_idx = FRAME_BUFFERS - 1; cm->frame_bufs[cm->new_fb_idx].ref_count = 1; - for (i = 0; i < REF_FRAMES; i++) { + for (i = 0; i < 
REF_FRAMES; ++i) { cm->ref_frame_map[i] = i; cm->frame_bufs[i].ref_count = 1; } +} - if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y, - VP9_ENC_BORDER_IN_PIXELS) < 0) - goto fail; - - set_mb_mi(cm, aligned_width, aligned_height); +int vp9_alloc_ref_frame_buffers(VP9_COMMON *cm, int width, int height) { + int i; + const int ss_x = cm->subsampling_x; + const int ss_y = cm->subsampling_y; - if (alloc_mi(cm, cm->mi_stride * (cm->mi_rows + MI_BLOCK_SIZE))) - goto fail; + vp9_free_ref_frame_buffers(cm); - setup_mi(cm); - - // Create the segmentation map structure and set to 0. - cm->last_frame_seg_map = (uint8_t *)vpx_calloc(cm->mi_rows * cm->mi_cols, 1); - if (!cm->last_frame_seg_map) - goto fail; + for (i = 0; i < FRAME_BUFFERS; ++i) { + cm->frame_bufs[i].ref_count = 0; + if (vp9_alloc_frame_buffer(&cm->frame_bufs[i].buf, width, height, + ss_x, ss_y, VP9_ENC_BORDER_IN_PIXELS) < 0) + goto fail; + } - cm->above_context = - (ENTROPY_CONTEXT *)vpx_calloc(2 * mi_cols_aligned_to_sb(cm->mi_cols) * - MAX_MB_PLANE, - sizeof(*cm->above_context)); - if (!cm->above_context) - goto fail; + init_frame_bufs(cm); - cm->above_seg_context = - (PARTITION_CONTEXT *)vpx_calloc(mi_cols_aligned_to_sb(cm->mi_cols), - sizeof(*cm->above_seg_context)); - if (!cm->above_seg_context) +#if CONFIG_INTERNAL_STATS || CONFIG_VP9_POSTPROC + if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y, + VP9_ENC_BORDER_IN_PIXELS) < 0) goto fail; +#endif return 0; fail: - vp9_free_frame_buffers(cm); + vp9_free_ref_frame_buffers(cm); return 1; } void vp9_remove_common(VP9_COMMON *cm) { - vp9_free_frame_buffers(cm); + vp9_free_ref_frame_buffers(cm); + vp9_free_context_buffers(cm); vp9_free_internal_frame_buffers(&cm->int_frame_buffers); } -void vp9_update_frame_size(VP9_COMMON *cm) { - const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, MI_SIZE_LOG2); - const int aligned_height = ALIGN_POWER_OF_TWO(cm->height, MI_SIZE_LOG2); - - set_mb_mi(cm, aligned_width, 
aligned_height); +void vp9_init_context_buffers(VP9_COMMON *cm) { setup_mi(cm); - - // Initialize the previous frame segment map to 0. if (cm->last_frame_seg_map) vpx_memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols); } void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) { + // Swap indices. + const int tmp = cm->mi_idx; + cm->mi_idx = cm->prev_mi_idx; + cm->prev_mi_idx = tmp; + // Current mip will be the prev_mip for the next frame. - MODE_INFO *temp = cm->prev_mip; - MODE_INFO **temp2 = cm->prev_mi_grid_base; - cm->prev_mip = cm->mip; - cm->mip = temp; - cm->prev_mi_grid_base = cm->mi_grid_base; - cm->mi_grid_base = temp2; + cm->mip = cm->mip_array[cm->mi_idx]; + cm->prev_mip = cm->mip_array[cm->prev_mi_idx]; + cm->mi_grid_base = cm->mi_grid_base_array[cm->mi_idx]; + cm->prev_mi_grid_base = cm->mi_grid_base_array[cm->prev_mi_idx]; // Update the upper left visible macroblock ptrs. cm->mi = cm->mip + cm->mi_stride + 1; diff --git a/libvpx/vp9/common/vp9_alloccommon.h b/libvpx/vp9/common/vp9_alloccommon.h index 06636a905..c5b893fac 100644 --- a/libvpx/vp9/common/vp9_alloccommon.h +++ b/libvpx/vp9/common/vp9_alloccommon.h @@ -20,14 +20,17 @@ struct VP9Common; void vp9_remove_common(struct VP9Common *cm); -int vp9_resize_frame_buffers(struct VP9Common *cm, int width, int height); +int vp9_alloc_context_buffers(struct VP9Common *cm, int width, int height); +void vp9_init_context_buffers(struct VP9Common *cm); +void vp9_free_context_buffers(struct VP9Common *cm); -int vp9_alloc_frame_buffers(struct VP9Common *cm, int width, int height); +int vp9_alloc_ref_frame_buffers(struct VP9Common *cm, int width, int height); +void vp9_free_ref_frame_buffers(struct VP9Common *cm); -void vp9_free_frame_buffers(struct VP9Common *cm); - -void vp9_update_frame_size(struct VP9Common *cm); +int vp9_alloc_state_buffers(struct VP9Common *cm, int width, int height); +void vp9_free_state_buffers(struct VP9Common *cm); +void vp9_set_mb_mi(struct VP9Common *cm, int width, int height); void 
vp9_swap_mi_and_prev_mi(struct VP9Common *cm); #ifdef __cplusplus diff --git a/libvpx/vp9/common/vp9_blockd.c b/libvpx/vp9/common/vp9_blockd.c index fedfb18d9..dab8f9617 100644 --- a/libvpx/vp9/common/vp9_blockd.c +++ b/libvpx/vp9/common/vp9_blockd.c @@ -10,8 +10,8 @@ #include "vp9/common/vp9_blockd.h" -MB_PREDICTION_MODE vp9_left_block_mode(const MODE_INFO *cur_mi, - const MODE_INFO *left_mi, int b) { +PREDICTION_MODE vp9_left_block_mode(const MODE_INFO *cur_mi, + const MODE_INFO *left_mi, int b) { if (b == 0 || b == 2) { if (!left_mi || is_inter_block(&left_mi->mbmi)) return DC_PRED; @@ -23,8 +23,8 @@ MB_PREDICTION_MODE vp9_left_block_mode(const MODE_INFO *cur_mi, } } -MB_PREDICTION_MODE vp9_above_block_mode(const MODE_INFO *cur_mi, - const MODE_INFO *above_mi, int b) { +PREDICTION_MODE vp9_above_block_mode(const MODE_INFO *cur_mi, + const MODE_INFO *above_mi, int b) { if (b == 0 || b == 1) { if (!above_mi || is_inter_block(&above_mi->mbmi)) return DC_PRED; @@ -44,7 +44,7 @@ void vp9_foreach_transformed_block_in_plane( // block and transform sizes, in number of 4x4 blocks log 2 ("*_b") // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 // transform size varies per plane, look it up in a common way. - const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) + const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size; const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; @@ -146,10 +146,4 @@ void vp9_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y) { xd->plane[i].subsampling_x = i ? ss_x : 0; xd->plane[i].subsampling_y = i ? 
ss_y : 0; } -#if CONFIG_ALPHA - // TODO(jkoleszar): Using the Y w/h for now - xd->plane[3].plane_type = PLANE_TYPE_Y; - xd->plane[3].subsampling_x = 0; - xd->plane[3].subsampling_y = 0; -#endif } diff --git a/libvpx/vp9/common/vp9_blockd.h b/libvpx/vp9/common/vp9_blockd.h index 55320a6a4..951e6e023 100644 --- a/libvpx/vp9/common/vp9_blockd.h +++ b/libvpx/vp9/common/vp9_blockd.h @@ -77,9 +77,9 @@ typedef enum { ZEROMV, NEWMV, MB_MODE_COUNT -} MB_PREDICTION_MODE; +} PREDICTION_MODE; -static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) { +static INLINE int is_inter_mode(PREDICTION_MODE mode) { return mode >= NEARESTMV && mode <= NEWMV; } @@ -94,7 +94,7 @@ static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) { is a single probability table. */ typedef struct { - MB_PREDICTION_MODE as_mode; + PREDICTION_MODE as_mode; int_mv as_mv[2]; // first, second inter predictor motion vectors } b_mode_info; @@ -122,14 +122,14 @@ static INLINE int mi_width_log2(BLOCK_SIZE sb_type) { typedef struct { // Common for both INTER and INTRA blocks BLOCK_SIZE sb_type; - MB_PREDICTION_MODE mode; + PREDICTION_MODE mode; TX_SIZE tx_size; - uint8_t skip; - uint8_t segment_id; - uint8_t seg_id_predicted; // valid only when temporal_update is enabled + int8_t skip; + int8_t segment_id; + int8_t seg_id_predicted; // valid only when temporal_update is enabled // Only for INTRA blocks - MB_PREDICTION_MODE uv_mode; + PREDICTION_MODE uv_mode; // Only for INTER blocks MV_REFERENCE_FRAME ref_frame[2]; @@ -144,7 +144,7 @@ typedef struct { b_mode_info bmi[4]; } MODE_INFO; -static INLINE MB_PREDICTION_MODE get_y_mode(const MODE_INFO *mi, int block) { +static INLINE PREDICTION_MODE get_y_mode(const MODE_INFO *mi, int block) { return mi->mbmi.sb_type < BLOCK_8X8 ? 
mi->bmi[block].as_mode : mi->mbmi.mode; } @@ -157,22 +157,18 @@ static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) { return mbmi->ref_frame[1] > INTRA_FRAME; } -MB_PREDICTION_MODE vp9_left_block_mode(const MODE_INFO *cur_mi, - const MODE_INFO *left_mi, int b); +PREDICTION_MODE vp9_left_block_mode(const MODE_INFO *cur_mi, + const MODE_INFO *left_mi, int b); -MB_PREDICTION_MODE vp9_above_block_mode(const MODE_INFO *cur_mi, - const MODE_INFO *above_mi, int b); +PREDICTION_MODE vp9_above_block_mode(const MODE_INFO *cur_mi, + const MODE_INFO *above_mi, int b); enum mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 }; -#if CONFIG_ALPHA -enum { MAX_MB_PLANE = 4 }; -#else enum { MAX_MB_PLANE = 3 }; -#endif struct buf_2d { uint8_t *buf; @@ -228,8 +224,6 @@ typedef struct macroblockd { DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]); int lossless; - /* Inverse transform function pointers. */ - void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob); int corrupted; @@ -244,9 +238,7 @@ typedef struct macroblockd { static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize, PARTITION_TYPE partition) { - const BLOCK_SIZE subsize = subsize_lookup[partition][bsize]; - assert(subsize < BLOCK_SIZES); - return subsize; + return subsize_lookup[partition][bsize]; } extern const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES]; @@ -272,25 +264,25 @@ static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type, void vp9_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y); -static INLINE TX_SIZE get_uv_tx_size_impl(TX_SIZE y_tx_size, BLOCK_SIZE bsize) { +static INLINE TX_SIZE get_uv_tx_size_impl(TX_SIZE y_tx_size, BLOCK_SIZE bsize, + int xss, int yss) { if (bsize < BLOCK_8X8) { return TX_4X4; } else { - // TODO(dkovalev): Assuming YUV420 (ss_x == 1, ss_y == 1) - const BLOCK_SIZE plane_bsize = ss_size_lookup[bsize][1][1]; + const BLOCK_SIZE plane_bsize = ss_size_lookup[bsize][xss][yss]; return MIN(y_tx_size, max_txsize_lookup[plane_bsize]); } } -static INLINE 
TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi) { - return get_uv_tx_size_impl(mbmi->tx_size, mbmi->sb_type); +static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi, + const struct macroblockd_plane *pd) { + return get_uv_tx_size_impl(mbmi->tx_size, mbmi->sb_type, pd->subsampling_x, + pd->subsampling_y); } static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize, const struct macroblockd_plane *pd) { - BLOCK_SIZE bs = ss_size_lookup[bsize][pd->subsampling_x][pd->subsampling_y]; - assert(bs < BLOCK_SIZES); - return bs; + return ss_size_lookup[bsize][pd->subsampling_x][pd->subsampling_y]; } typedef void (*foreach_transformed_block_visitor)(int plane, int block, diff --git a/libvpx/vp9/common/vp9_common.h b/libvpx/vp9/common/vp9_common.h index 2dccb7031..2788e66f4 100644 --- a/libvpx/vp9/common/vp9_common.h +++ b/libvpx/vp9/common/vp9_common.h @@ -45,11 +45,11 @@ extern "C" { vpx_memcpy(dest, src, n * sizeof(*src)); \ } -#define vp9_zero(dest) vpx_memset(&dest, 0, sizeof(dest)) +#define vp9_zero(dest) vpx_memset(&(dest), 0, sizeof(dest)) #define vp9_zero_array(dest, n) vpx_memset(dest, 0, n * sizeof(*dest)) static INLINE uint8_t clip_pixel(int val) { - return (val > 255) ? 255u : (val < 0) ? 0u : val; + return (val > 255) ? 255 : (val < 0) ? 
0 : val; } static INLINE int clamp(int value, int low, int high) { diff --git a/libvpx/vp9/common/vp9_common_data.c b/libvpx/vp9/common/vp9_common_data.c index a927823e0..d4c1b7124 100644 --- a/libvpx/vp9/common/vp9_common_data.c +++ b/libvpx/vp9/common/vp9_common_data.c @@ -107,6 +107,13 @@ const TX_SIZE max_txsize_lookup[BLOCK_SIZES] = { TX_32X32, TX_32X32, TX_32X32, TX_32X32 }; +const BLOCK_SIZE txsize_to_bsize[TX_SIZES] = { + BLOCK_4X4, // TX_4X4 + BLOCK_8X8, // TX_8X8 + BLOCK_16X16, // TX_16X16 + BLOCK_32X32, // TX_32X32 +}; + const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = { TX_4X4, // ONLY_4X4 TX_8X8, // ALLOW_8X8 diff --git a/libvpx/vp9/common/vp9_common_data.h b/libvpx/vp9/common/vp9_common_data.h index f41962747..a06c9bed8 100644 --- a/libvpx/vp9/common/vp9_common_data.h +++ b/libvpx/vp9/common/vp9_common_data.h @@ -29,6 +29,7 @@ extern const int num_pels_log2_lookup[BLOCK_SIZES]; extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZES]; extern const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES]; extern const TX_SIZE max_txsize_lookup[BLOCK_SIZES]; +extern const BLOCK_SIZE txsize_to_bsize[TX_SIZES]; extern const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES]; extern const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2]; diff --git a/libvpx/vp9/common/vp9_convolve.c b/libvpx/vp9/common/vp9_convolve.c index d30e0b488..d8aaf32c4 100644 --- a/libvpx/vp9/common/vp9_convolve.c +++ b/libvpx/vp9/common/vp9_convolve.c @@ -117,17 +117,25 @@ static void convolve(const uint8_t *src, ptrdiff_t src_stride, const InterpKernel *const y_filters, int y0_q4, int y_step_q4, int w, int h) { - // Fixed size intermediate buffer places limits on parameters. - // Maximum intermediate_height is 324, for y_step_q4 == 80, - // h == 64, taps == 8. - // y_step_q4 of 80 allows for 1/10 scale for 5 layer svc - uint8_t temp[64 * 324]; + // Note: Fixed size intermediate buffer, temp, places limits on parameters. 
+ // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + uint8_t temp[135 * 64]; int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + SUBPEL_TAPS; assert(w <= 64); assert(h <= 64); - assert(y_step_q4 <= 80); - assert(x_step_q4 <= 80); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); if (intermediate_height < h) intermediate_height = h; @@ -156,6 +164,9 @@ void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, const InterpKernel *const filters_x = get_filter_base(filter_x); const int x0_q4 = get_filter_offset(filter_x, filters_x); + (void)filter_y; + (void)y_step_q4; + convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, w, h); } @@ -168,6 +179,9 @@ void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, const InterpKernel *const filters_x = get_filter_base(filter_x); const int x0_q4 = get_filter_offset(filter_x, filters_x); + (void)filter_y; + (void)y_step_q4; + convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, w, h); } @@ -179,6 +193,10 @@ void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, int w, int h) { const InterpKernel *const filters_y = get_filter_base(filter_y); const int y0_q4 = get_filter_offset(filter_y, filters_y); + + (void)filter_x; + (void)x_step_q4; + convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4, 
w, h); } @@ -190,6 +208,10 @@ void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, int w, int h) { const InterpKernel *const filters_y = get_filter_base(filter_y); const int y0_q4 = get_filter_offset(filter_y, filters_y); + + (void)filter_x; + (void)x_step_q4; + convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4, w, h); } @@ -232,6 +254,9 @@ void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, int w, int h) { int r; + (void)filter_x; (void)filter_x_stride; + (void)filter_y; (void)filter_y_stride; + for (r = h; r > 0; --r) { vpx_memcpy(dst, src, w); src += src_stride; @@ -246,6 +271,9 @@ void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, int w, int h) { int x, y; + (void)filter_x; (void)filter_x_stride; + (void)filter_y; (void)filter_y_stride; + for (y = 0; y < h; ++y) { for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); diff --git a/libvpx/vp9/common/vp9_debugmodes.c b/libvpx/vp9/common/vp9_debugmodes.c index 8f150a406..d2522bbdf 100644 --- a/libvpx/vp9/common/vp9_debugmodes.c +++ b/libvpx/vp9/common/vp9_debugmodes.c @@ -24,10 +24,9 @@ static void log_frame_info(VP9_COMMON *cm, const char *str, FILE *f) { */ static void print_mi_data(VP9_COMMON *cm, FILE *file, const char *descriptor, size_t member_offset) { - int mi_row; - int mi_col; + int mi_row, mi_col; int mi_index = 0; - MODE_INFO **mi_8x8 = cm->mi_grid_visible; + MODE_INFO **mi = cm->mi_grid_visible; int rows = cm->mi_rows; int cols = cm->mi_cols; char prefix = descriptor[0]; @@ -38,7 +37,7 @@ static void print_mi_data(VP9_COMMON *cm, FILE *file, const char *descriptor, fprintf(file, "%c ", prefix); for (mi_col = 0; mi_col < cols; mi_col++) { fprintf(file, "%2d ", - *((int*) ((char *) (&mi_8x8[mi_index]->mbmi) + + *((int*) ((char *) (&mi[mi_index]->mbmi) + member_offset))); mi_index++; } @@ -52,7 +51,7 @@ void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, const char *file) { int mi_col; int mi_index = 0; 
FILE *mvs = fopen(file, "a"); - MODE_INFO **mi_8x8 = cm->mi_grid_visible; + MODE_INFO **mi = cm->mi_grid_visible; int rows = cm->mi_rows; int cols = cm->mi_cols; @@ -67,8 +66,8 @@ void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, const char *file) { for (mi_row = 0; mi_row < rows; mi_row++) { fprintf(mvs, "V "); for (mi_col = 0; mi_col < cols; mi_col++) { - fprintf(mvs, "%4d:%4d ", mi_8x8[mi_index]->mbmi.mv[0].as_mv.row, - mi_8x8[mi_index]->mbmi.mv[0].as_mv.col); + fprintf(mvs, "%4d:%4d ", mi[mi_index]->mbmi.mv[0].as_mv.row, + mi[mi_index]->mbmi.mv[0].as_mv.col); mi_index++; } fprintf(mvs, "\n"); diff --git a/libvpx/vp9/common/vp9_entropy.c b/libvpx/vp9/common/vp9_entropy.c index bc12f9aa2..3a54de225 100644 --- a/libvpx/vp9/common/vp9_entropy.c +++ b/libvpx/vp9/common/vp9_entropy.c @@ -15,6 +15,14 @@ #include "vpx_mem/vpx_mem.h" #include "vpx/vpx_integer.h" +const vp9_prob vp9_cat1_prob[] = { 159 }; +const vp9_prob vp9_cat2_prob[] = { 165, 145 }; +const vp9_prob vp9_cat3_prob[] = { 173, 148, 140 }; +const vp9_prob vp9_cat4_prob[] = { 176, 155, 140, 135 }; +const vp9_prob vp9_cat5_prob[] = { 180, 157, 141, 134, 130 }; +const vp9_prob vp9_cat6_prob[] = { + 254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129 +}; const uint8_t vp9_coefband_trans_8x8plus[1024] = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, diff --git a/libvpx/vp9/common/vp9_entropy.h b/libvpx/vp9/common/vp9_entropy.h index 6788eb698..8a10f2320 100644 --- a/libvpx/vp9/common/vp9_entropy.h +++ b/libvpx/vp9/common/vp9_entropy.h @@ -43,6 +43,21 @@ extern "C" { DECLARE_ALIGNED(16, extern const uint8_t, vp9_pt_energy_class[ENTROPY_TOKENS]); +#define CAT1_MIN_VAL 5 +#define CAT2_MIN_VAL 7 +#define CAT3_MIN_VAL 11 +#define CAT4_MIN_VAL 19 +#define CAT5_MIN_VAL 35 +#define CAT6_MIN_VAL 67 + +// Extra bit probabilities. 
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat1_prob[1]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat2_prob[2]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat3_prob[3]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat4_prob[4]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat5_prob[5]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat6_prob[14]); + #define EOB_MODEL_TOKEN 3 extern const vp9_tree_index vp9_coefmodel_tree[]; @@ -168,19 +183,20 @@ static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, break; default: assert(0 && "Invalid transform size."); + break; } return combine_entropy_contexts(above_ec, left_ec); } -static const INLINE scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, +static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, PLANE_TYPE type, int block_idx) { const MODE_INFO *const mi = xd->mi[0]; if (is_inter_block(&mi->mbmi) || type != PLANE_TYPE_Y || xd->lossless) { return &vp9_default_scan_orders[tx_size]; } else { - const MB_PREDICTION_MODE mode = get_y_mode(mi, block_idx); + const PREDICTION_MODE mode = get_y_mode(mi, block_idx); return &vp9_scan_orders[tx_size][intra_mode_to_tx_type_lookup[mode]]; } } diff --git a/libvpx/vp9/common/vp9_entropymode.h b/libvpx/vp9/common/vp9_entropymode.h index c7b191177..533757bef 100644 --- a/libvpx/vp9/common/vp9_entropymode.h +++ b/libvpx/vp9/common/vp9_entropymode.h @@ -101,8 +101,8 @@ static INLINE const vp9_prob *get_y_mode_probs(const MODE_INFO *mi, const MODE_INFO *above_mi, const MODE_INFO *left_mi, int block) { - const MB_PREDICTION_MODE above = vp9_above_block_mode(mi, above_mi, block); - const MB_PREDICTION_MODE left = vp9_left_block_mode(mi, left_mi, block); + const PREDICTION_MODE above = vp9_above_block_mode(mi, above_mi, block); + const PREDICTION_MODE left = vp9_left_block_mode(mi, left_mi, block); return vp9_kf_y_mode_prob[above][left]; } diff --git a/libvpx/vp9/common/vp9_enums.h 
b/libvpx/vp9/common/vp9_enums.h index 068284faa..d77631341 100644 --- a/libvpx/vp9/common/vp9_enums.h +++ b/libvpx/vp9/common/vp9_enums.h @@ -25,15 +25,18 @@ extern "C" { #define MI_MASK (MI_BLOCK_SIZE - 1) -// Bitstream profiles indicated by 2 bits in the uncompressed header. -// 00: Profile 0. 4:2:0 only. -// 10: Profile 1. adds 4:4:4, 4:2:2, alpha. -// 01: Profile 2. Supports 10-bit and 12-bit color only. -// 11: Undefined profile. +// Bitstream profiles indicated by 2-3 bits in the uncompressed header. +// 00: Profile 0. 8-bit 4:2:0 only. +// 10: Profile 1. 8-bit 4:4:4, 4:2:2, and 4:4:0. +// 01: Profile 2. 10-bit and 12-bit color only, with 4:2:0 sampling. +// 110: Profile 3. 10-bit and 12-bit color only, with 4:2:2/4:4:4/4:4:0 +// sampling. +// 111: Undefined profile. typedef enum BITSTREAM_PROFILE { PROFILE_0, PROFILE_1, PROFILE_2, + PROFILE_3, MAX_PROFILES } BITSTREAM_PROFILE; diff --git a/libvpx/vp9/common/vp9_filter.c b/libvpx/vp9/common/vp9_filter.c index 7474a88bc..afcdf22ec 100644 --- a/libvpx/vp9/common/vp9_filter.c +++ b/libvpx/vp9/common/vp9_filter.c @@ -32,7 +32,8 @@ const InterpKernel vp9_bilinear_filters[SUBPEL_SHIFTS] = { }; // Lagrangian interpolation filter -const InterpKernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS] = { +DECLARE_ALIGNED(256, const InterpKernel, + vp9_sub_pel_filters_8[SUBPEL_SHIFTS]) = { { 0, 0, 0, 128, 0, 0, 0, 0}, { 0, 1, -5, 126, 8, -3, 1, 0}, { -1, 3, -10, 122, 18, -6, 2, 0}, @@ -52,7 +53,8 @@ const InterpKernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS] = { }; // DCT based filter -const InterpKernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS] = { +DECLARE_ALIGNED(256, const InterpKernel, + vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]) = { {0, 0, 0, 128, 0, 0, 0, 0}, {-1, 3, -7, 127, 8, -3, 1, 0}, {-2, 5, -13, 125, 17, -6, 3, -1}, @@ -72,7 +74,8 @@ const InterpKernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS] = { }; // freqmultiplier = 0.5 -const InterpKernel vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS] = { +DECLARE_ALIGNED(256, const InterpKernel, + 
vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]) = { { 0, 0, 0, 128, 0, 0, 0, 0}, {-3, -1, 32, 64, 38, 1, -3, 0}, {-2, -2, 29, 63, 41, 2, -3, 0}, diff --git a/libvpx/vp9/common/vp9_filter.h b/libvpx/vp9/common/vp9_filter.h index 29d3867c9..8c359c717 100644 --- a/libvpx/vp9/common/vp9_filter.h +++ b/libvpx/vp9/common/vp9_filter.h @@ -41,12 +41,6 @@ const InterpKernel *vp9_get_interp_kernel(INTERP_FILTER filter); DECLARE_ALIGNED(256, extern const InterpKernel, vp9_bilinear_filters[SUBPEL_SHIFTS]); -DECLARE_ALIGNED(256, extern const InterpKernel, - vp9_sub_pel_filters_8[SUBPEL_SHIFTS]); -DECLARE_ALIGNED(256, extern const InterpKernel, - vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]); -DECLARE_ALIGNED(256, extern const InterpKernel, - vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]); // The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear // filter kernel as a 2 tap filter. diff --git a/libvpx/vp9/common/vp9_frame_buffers.c b/libvpx/vp9/common/vp9_frame_buffers.c index a0b1e039c..733b3a927 100644 --- a/libvpx/vp9/common/vp9_frame_buffers.c +++ b/libvpx/vp9/common/vp9_frame_buffers.c @@ -76,6 +76,7 @@ int vp9_get_frame_buffer(void *cb_priv, size_t min_size, int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb) { InternalFrameBuffer *const int_fb = (InternalFrameBuffer *)fb->priv; (void)cb_priv; - int_fb->in_use = 0; + if (int_fb) + int_fb->in_use = 0; return 0; } diff --git a/libvpx/vp9/common/vp9_idct.c b/libvpx/vp9/common/vp9_idct.c index 20b78bfed..856d41e70 100644 --- a/libvpx/vp9/common/vp9_idct.c +++ b/libvpx/vp9/common/vp9_idct.c @@ -421,7 +421,7 @@ void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride, } } -void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) { +void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int stride) { int16_t out[8 * 8] = { 0 }; int16_t *outptr = out; int i, j; @@ -1348,8 +1348,8 @@ void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) { if (eob == 
1) // DC only DCT coefficient vp9_idct8x8_1_add(input, dest, stride); - else if (eob <= 10) - vp9_idct8x8_10_add(input, dest, stride); + else if (eob <= 12) + vp9_idct8x8_12_add(input, dest, stride); else vp9_idct8x8_64_add(input, dest, stride); } diff --git a/libvpx/vp9/common/vp9_idct.h b/libvpx/vp9/common/vp9_idct.h index ceca7951b..7f595e1cc 100644 --- a/libvpx/vp9/common/vp9_idct.h +++ b/libvpx/vp9/common/vp9_idct.h @@ -33,8 +33,8 @@ extern "C" { #define pair_set_epi16(a, b) \ _mm_set_epi16(b, a, b, a, b, a, b, a) -#define pair_set_epi32(a, b) \ - _mm_set_epi32(b, a, b, a) +#define dual_set_epi16(a, b) \ + _mm_set_epi16(b, b, b, b, a, a, a, a) // Constants: // for (int i = 1; i< 32; ++i) @@ -81,6 +81,16 @@ static const int sinpi_4_9 = 15212; static INLINE int dct_const_round_shift(int input) { int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); +#if CONFIG_COEFFICIENT_RANGE_CHECKING + // For valid VP9 input streams, intermediate stage coefficients should always + // stay within the range of a signed 16 bit integer. Coefficients can go out + // of this range for invalid/corrupt VP9 streams. However, strictly checking + // this range for every intermediate coefficient can burdensome for a decoder, + // therefore the following assertion is only enabled when configured with + // --enable-coefficient-range-checking. + assert(INT16_MIN <= rv); + assert(rv <= INT16_MAX); +#endif return (int16_t)rv; } diff --git a/libvpx/vp9/common/vp9_loopfilter.c b/libvpx/vp9/common/vp9_loopfilter.c index 3ac5a0577..3b39d4274 100644 --- a/libvpx/vp9/common/vp9_loopfilter.c +++ b/libvpx/vp9/common/vp9_loopfilter.c @@ -16,7 +16,7 @@ #include "vp9/common/vp9_seg_common.h" -// 64 bit masks for left transform size. Each 1 represents a position where +// 64 bit masks for left transform size. Each 1 represents a position where // we should apply a loop filter across the left border of an 8x8 block // boundary. 
// @@ -34,13 +34,13 @@ // // A loopfilter should be applied to every other 8x8 horizontally. static const uint64_t left_64x64_txform_mask[TX_SIZES]= { - 0xffffffffffffffff, // TX_4X4 - 0xffffffffffffffff, // TX_8x8 - 0x5555555555555555, // TX_16x16 - 0x1111111111111111, // TX_32x32 + 0xffffffffffffffff, // TX_4X4 + 0xffffffffffffffff, // TX_8x8 + 0x5555555555555555, // TX_16x16 + 0x1111111111111111, // TX_32x32 }; -// 64 bit masks for above transform size. Each 1 represents a position where +// 64 bit masks for above transform size. Each 1 represents a position where // we should apply a loop filter across the top border of an 8x8 block // boundary. // @@ -58,15 +58,15 @@ static const uint64_t left_64x64_txform_mask[TX_SIZES]= { // // A loopfilter should be applied to every other 4 the row vertically. static const uint64_t above_64x64_txform_mask[TX_SIZES]= { - 0xffffffffffffffff, // TX_4X4 - 0xffffffffffffffff, // TX_8x8 - 0x00ff00ff00ff00ff, // TX_16x16 - 0x000000ff000000ff, // TX_32x32 + 0xffffffffffffffff, // TX_4X4 + 0xffffffffffffffff, // TX_8x8 + 0x00ff00ff00ff00ff, // TX_16x16 + 0x000000ff000000ff, // TX_32x32 }; -// 64 bit masks for prediction sizes (left). Each 1 represents a position -// where left border of an 8x8 block. These are aligned to the right most -// appropriate bit, and then shifted into place. +// 64 bit masks for prediction sizes (left). Each 1 represents a position +// where left border of an 8x8 block. These are aligned to the right most +// appropriate bit, and then shifted into place. 
// // In the case of TX_16x32 -> ( low order byte first ) we end up with // a mask that looks like this : @@ -80,54 +80,54 @@ static const uint64_t above_64x64_txform_mask[TX_SIZES]= { // 00000000 // 00000000 static const uint64_t left_prediction_mask[BLOCK_SIZES] = { - 0x0000000000000001, // BLOCK_4X4, - 0x0000000000000001, // BLOCK_4X8, - 0x0000000000000001, // BLOCK_8X4, - 0x0000000000000001, // BLOCK_8X8, - 0x0000000000000101, // BLOCK_8X16, - 0x0000000000000001, // BLOCK_16X8, - 0x0000000000000101, // BLOCK_16X16, - 0x0000000001010101, // BLOCK_16X32, - 0x0000000000000101, // BLOCK_32X16, - 0x0000000001010101, // BLOCK_32X32, - 0x0101010101010101, // BLOCK_32X64, - 0x0000000001010101, // BLOCK_64X32, - 0x0101010101010101, // BLOCK_64X64 + 0x0000000000000001, // BLOCK_4X4, + 0x0000000000000001, // BLOCK_4X8, + 0x0000000000000001, // BLOCK_8X4, + 0x0000000000000001, // BLOCK_8X8, + 0x0000000000000101, // BLOCK_8X16, + 0x0000000000000001, // BLOCK_16X8, + 0x0000000000000101, // BLOCK_16X16, + 0x0000000001010101, // BLOCK_16X32, + 0x0000000000000101, // BLOCK_32X16, + 0x0000000001010101, // BLOCK_32X32, + 0x0101010101010101, // BLOCK_32X64, + 0x0000000001010101, // BLOCK_64X32, + 0x0101010101010101, // BLOCK_64X64 }; // 64 bit mask to shift and set for each prediction size. 
static const uint64_t above_prediction_mask[BLOCK_SIZES] = { - 0x0000000000000001, // BLOCK_4X4 - 0x0000000000000001, // BLOCK_4X8 - 0x0000000000000001, // BLOCK_8X4 - 0x0000000000000001, // BLOCK_8X8 - 0x0000000000000001, // BLOCK_8X16, - 0x0000000000000003, // BLOCK_16X8 - 0x0000000000000003, // BLOCK_16X16 - 0x0000000000000003, // BLOCK_16X32, - 0x000000000000000f, // BLOCK_32X16, - 0x000000000000000f, // BLOCK_32X32, - 0x000000000000000f, // BLOCK_32X64, - 0x00000000000000ff, // BLOCK_64X32, - 0x00000000000000ff, // BLOCK_64X64 + 0x0000000000000001, // BLOCK_4X4 + 0x0000000000000001, // BLOCK_4X8 + 0x0000000000000001, // BLOCK_8X4 + 0x0000000000000001, // BLOCK_8X8 + 0x0000000000000001, // BLOCK_8X16, + 0x0000000000000003, // BLOCK_16X8 + 0x0000000000000003, // BLOCK_16X16 + 0x0000000000000003, // BLOCK_16X32, + 0x000000000000000f, // BLOCK_32X16, + 0x000000000000000f, // BLOCK_32X32, + 0x000000000000000f, // BLOCK_32X64, + 0x00000000000000ff, // BLOCK_64X32, + 0x00000000000000ff, // BLOCK_64X64 }; -// 64 bit mask to shift and set for each prediction size. A bit is set for +// 64 bit mask to shift and set for each prediction size. A bit is set for // each 8x8 block that would be in the left most block of the given block // size in the 64x64 block. 
static const uint64_t size_mask[BLOCK_SIZES] = { - 0x0000000000000001, // BLOCK_4X4 - 0x0000000000000001, // BLOCK_4X8 - 0x0000000000000001, // BLOCK_8X4 - 0x0000000000000001, // BLOCK_8X8 - 0x0000000000000101, // BLOCK_8X16, - 0x0000000000000003, // BLOCK_16X8 - 0x0000000000000303, // BLOCK_16X16 - 0x0000000003030303, // BLOCK_16X32, - 0x0000000000000f0f, // BLOCK_32X16, - 0x000000000f0f0f0f, // BLOCK_32X32, - 0x0f0f0f0f0f0f0f0f, // BLOCK_32X64, - 0x00000000ffffffff, // BLOCK_64X32, - 0xffffffffffffffff, // BLOCK_64X64 + 0x0000000000000001, // BLOCK_4X4 + 0x0000000000000001, // BLOCK_4X8 + 0x0000000000000001, // BLOCK_8X4 + 0x0000000000000001, // BLOCK_8X8 + 0x0000000000000101, // BLOCK_8X16, + 0x0000000000000003, // BLOCK_16X8 + 0x0000000000000303, // BLOCK_16X16 + 0x0000000003030303, // BLOCK_16X32, + 0x0000000000000f0f, // BLOCK_32X16, + 0x000000000f0f0f0f, // BLOCK_32X32, + 0x0f0f0f0f0f0f0f0f, // BLOCK_32X64, + 0x00000000ffffffff, // BLOCK_64X32, + 0xffffffffffffffff, // BLOCK_64X64 }; // These are used for masking the left and above borders. @@ -136,67 +136,67 @@ static const uint64_t above_border = 0x000000ff000000ff; // 16 bit masks for uv transform sizes. static const uint16_t left_64x64_txform_mask_uv[TX_SIZES]= { - 0xffff, // TX_4X4 - 0xffff, // TX_8x8 - 0x5555, // TX_16x16 - 0x1111, // TX_32x32 + 0xffff, // TX_4X4 + 0xffff, // TX_8x8 + 0x5555, // TX_16x16 + 0x1111, // TX_32x32 }; static const uint16_t above_64x64_txform_mask_uv[TX_SIZES]= { - 0xffff, // TX_4X4 - 0xffff, // TX_8x8 - 0x0f0f, // TX_16x16 - 0x000f, // TX_32x32 + 0xffff, // TX_4X4 + 0xffff, // TX_8x8 + 0x0f0f, // TX_16x16 + 0x000f, // TX_32x32 }; // 16 bit left mask to shift and set for each uv prediction size. 
static const uint16_t left_prediction_mask_uv[BLOCK_SIZES] = { - 0x0001, // BLOCK_4X4, - 0x0001, // BLOCK_4X8, - 0x0001, // BLOCK_8X4, - 0x0001, // BLOCK_8X8, - 0x0001, // BLOCK_8X16, - 0x0001, // BLOCK_16X8, - 0x0001, // BLOCK_16X16, - 0x0011, // BLOCK_16X32, - 0x0001, // BLOCK_32X16, - 0x0011, // BLOCK_32X32, - 0x1111, // BLOCK_32X64 - 0x0011, // BLOCK_64X32, - 0x1111, // BLOCK_64X64 + 0x0001, // BLOCK_4X4, + 0x0001, // BLOCK_4X8, + 0x0001, // BLOCK_8X4, + 0x0001, // BLOCK_8X8, + 0x0001, // BLOCK_8X16, + 0x0001, // BLOCK_16X8, + 0x0001, // BLOCK_16X16, + 0x0011, // BLOCK_16X32, + 0x0001, // BLOCK_32X16, + 0x0011, // BLOCK_32X32, + 0x1111, // BLOCK_32X64 + 0x0011, // BLOCK_64X32, + 0x1111, // BLOCK_64X64 }; // 16 bit above mask to shift and set for uv each prediction size. static const uint16_t above_prediction_mask_uv[BLOCK_SIZES] = { - 0x0001, // BLOCK_4X4 - 0x0001, // BLOCK_4X8 - 0x0001, // BLOCK_8X4 - 0x0001, // BLOCK_8X8 - 0x0001, // BLOCK_8X16, - 0x0001, // BLOCK_16X8 - 0x0001, // BLOCK_16X16 - 0x0001, // BLOCK_16X32, - 0x0003, // BLOCK_32X16, - 0x0003, // BLOCK_32X32, - 0x0003, // BLOCK_32X64, - 0x000f, // BLOCK_64X32, - 0x000f, // BLOCK_64X64 + 0x0001, // BLOCK_4X4 + 0x0001, // BLOCK_4X8 + 0x0001, // BLOCK_8X4 + 0x0001, // BLOCK_8X8 + 0x0001, // BLOCK_8X16, + 0x0001, // BLOCK_16X8 + 0x0001, // BLOCK_16X16 + 0x0001, // BLOCK_16X32, + 0x0003, // BLOCK_32X16, + 0x0003, // BLOCK_32X32, + 0x0003, // BLOCK_32X64, + 0x000f, // BLOCK_64X32, + 0x000f, // BLOCK_64X64 }; // 64 bit mask to shift and set for each uv prediction size static const uint16_t size_mask_uv[BLOCK_SIZES] = { - 0x0001, // BLOCK_4X4 - 0x0001, // BLOCK_4X8 - 0x0001, // BLOCK_8X4 - 0x0001, // BLOCK_8X8 - 0x0001, // BLOCK_8X16, - 0x0001, // BLOCK_16X8 - 0x0001, // BLOCK_16X16 - 0x0011, // BLOCK_16X32, - 0x0003, // BLOCK_32X16, - 0x0033, // BLOCK_32X32, - 0x3333, // BLOCK_32X64, - 0x00ff, // BLOCK_64X32, - 0xffff, // BLOCK_64X64 + 0x0001, // BLOCK_4X4 + 0x0001, // BLOCK_4X8 + 0x0001, // BLOCK_8X4 + 
0x0001, // BLOCK_8X8 + 0x0001, // BLOCK_8X16, + 0x0001, // BLOCK_16X8 + 0x0001, // BLOCK_16X16 + 0x0011, // BLOCK_16X32, + 0x0003, // BLOCK_32X16, + 0x0033, // BLOCK_32X32, + 0x3333, // BLOCK_32X64, + 0x00ff, // BLOCK_64X32, + 0xffff, // BLOCK_64X64 }; static const uint16_t left_border_uv = 0x1111; static const uint16_t above_border_uv = 0x000f; @@ -211,7 +211,7 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { // For each possible value for the loop filter fill out limits for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) { - // Set loop filter paramaeters that control sharpness. + // Set loop filter parameters that control sharpness. int block_inside_limit = lvl >> ((sharpness_lvl > 0) + (sharpness_lvl > 4)); if (sharpness_lvl > 0) { @@ -250,7 +250,7 @@ void vp9_loop_filter_init(VP9_COMMON *cm) { void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) { int seg_id; - // n_shift is the a multiplier for lf_deltas + // n_shift is the multiplier for lf_deltas // the multiplier is 1 for when filter_lvl is between 0 and 31; // 2 when filter_lvl is between 32 and 63 const int scale = 1 << (default_filt_lvl >> 5); @@ -316,8 +316,8 @@ static void filter_selectively_vert_row2(PLANE_TYPE plane_type, unsigned int mask; for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_4x4_int_0 | - mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1; - mask; mask >>= 1) { + mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1; + mask; mask >>= 1) { const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl; const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward); @@ -489,8 +489,8 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, } // This function ors into the current lfm structure, where to do loop -// filters for the specific mi we are looking at. It uses information -// including the block_size_type (32x16, 32x32, etc), the transform size, +// filters for the specific mi we are looking at. 
It uses information +// including the block_size_type (32x16, 32x32, etc.), the transform size, // whether there were any coefficients encoded, and the loop filter strength // block we are currently looking at. Shift is used to position the // 1's we produce. @@ -502,7 +502,7 @@ static void build_masks(const loop_filter_info_n *const lfi_n, const MB_MODE_INFO *mbmi = &mi->mbmi; const BLOCK_SIZE block_size = mbmi->sb_type; const TX_SIZE tx_size_y = mbmi->tx_size; - const TX_SIZE tx_size_uv = get_uv_tx_size(mbmi); + const TX_SIZE tx_size_uv = get_uv_tx_size_impl(tx_size_y, block_size, 1, 1); const int filter_level = get_filter_level(lfi_n, mbmi); uint64_t *const left_y = &lfm->left_y[tx_size_y]; uint64_t *const above_y = &lfm->above_y[tx_size_y]; @@ -526,7 +526,7 @@ static void build_masks(const loop_filter_info_n *const lfi_n, } // These set 1 in the current block size for the block size edges. - // For instance if the block size is 32x16, we'll set : + // For instance if the block size is 32x16, we'll set: // above = 1111 // 0000 // and @@ -535,7 +535,7 @@ static void build_masks(const loop_filter_info_n *const lfi_n, // NOTE : In this example the low bit is left most ( 1000 ) is stored as // 1, not 8... // - // U and v set things on a 16 bit scale. + // U and V set things on a 16 bit scale. // *above_y |= above_prediction_mask[block_size] << shift_y; *above_uv |= above_prediction_mask_uv[block_size] << shift_uv; @@ -547,7 +547,7 @@ static void build_masks(const loop_filter_info_n *const lfi_n, if (mbmi->skip && is_inter_block(mbmi)) return; - // Here we are adding a mask for the transform size. The transform + // Here we are adding a mask for the transform size. The transform // size mask is set to be correct for a 64x64 prediction block size. We // mask to match the size of the block we are working on and then shift it // into place.. 
@@ -573,7 +573,7 @@ static void build_masks(const loop_filter_info_n *const lfi_n, } // This function does the same thing as the one above with the exception that -// it only affects the y masks. It exists because for blocks < 16x16 in size, +// it only affects the y masks. It exists because for blocks < 16x16 in size, // we only update u and v masks on the first block. static void build_y_mask(const loop_filter_info_n *const lfi_n, const MODE_INFO *mi, const int shift_y, @@ -619,16 +619,16 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n, // by mi_row, mi_col. // TODO(JBB): This function only works for yv12. void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, - MODE_INFO **mi_8x8, const int mode_info_stride, + MODE_INFO **mi, const int mode_info_stride, LOOP_FILTER_MASK *lfm) { int idx_32, idx_16, idx_8; const loop_filter_info_n *const lfi_n = &cm->lf_info; - MODE_INFO **mip = mi_8x8; - MODE_INFO **mip2 = mi_8x8; + MODE_INFO **mip = mi; + MODE_INFO **mip2 = mi; // These are offsets to the next mi in the 64x64 block. It is what gets - // added to the mi ptr as we go through each loop. It helps us to avoids - // setting up special row and column counters for each index. The last step + // added to the mi ptr as we go through each loop. It helps us to avoid + // setting up special row and column counters for each index. The last step // brings us out back to the starting position. const int offset_32[] = {4, (mode_info_stride << 2) - 4, 4, -(mode_info_stride << 2) - 4}; @@ -637,7 +637,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, const int offset[] = {1, mode_info_stride - 1, 1, -mode_info_stride - 1}; // Following variables represent shifts to position the current block - // mask over the appropriate block. A shift of 36 to the left will move + // mask over the appropriate block. 
A shift of 36 to the left will move // the bits for the final 32 by 32 block in the 64x64 up 4 rows and left // 4 rows to the appropriate spot. const int shift_32_y[] = {0, 4, 32, 36}; @@ -652,6 +652,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, cm->mi_cols - mi_col : MI_BLOCK_SIZE); vp9_zero(*lfm); + assert(mip[0] != NULL); // TODO(jimbankoski): Try moving most of the following code into decode // loop and storing lfm in the mbmi structure so that we don't have to go @@ -767,7 +768,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, lfm->above_uv[TX_16X16] |= lfm->above_uv[TX_32X32]; // We do at least 8 tap filter on every 32x32 even if the transform size - // is 4x4. So if the 4x4 is set on a border pixel add it to the 8x8 and + // is 4x4. So if the 4x4 is set on a border pixel add it to the 8x8 and // remove it from the 4x4. lfm->left_y[TX_8X8] |= lfm->left_y[TX_4X4] & left_border; lfm->left_y[TX_4X4] &= ~left_border; @@ -796,7 +797,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, lfm->int_4x4_y &= mask_y; lfm->int_4x4_uv &= mask_uv; - // We don't apply a wide loop filter on the last uv block row. If set + // We don't apply a wide loop filter on the last uv block row. If set // apply the shorter one instead. if (rows == 1) { lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16]; @@ -830,7 +831,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, lfm->int_4x4_y &= mask_y; lfm->int_4x4_uv &= mask_uv_int; - // We don't apply a wide loop filter on the last uv column. If set + // We don't apply a wide loop filter on the last uv column. If set // apply the shorter one instead. 
if (columns == 1) { lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_16X16]; @@ -841,7 +842,8 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, lfm->left_uv[TX_16X16] &= ~(lfm->left_uv[TX_16X16] & 0xcccc); } } - // We don't a loop filter on the first column in the image. Mask that out. + // We don't apply a loop filter on the first column in the image, mask that + // out. if (mi_col == 0) { for (i = 0; i < TX_32X32; i++) { lfm->left_y[i] &= 0xfefefefefefefefe; @@ -939,7 +941,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm, !(r & (num_8x8_blocks_high_lookup[sb_type] - 1)) : 1; const int skip_this_r = skip_this && !block_edge_above; const TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV) - ? get_uv_tx_size(&mi[0].mbmi) + ? get_uv_tx_size(&mi[0].mbmi, plane) : mi[0].mbmi.tx_size; const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1; const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1; @@ -1192,39 +1194,41 @@ void vp9_filter_block_plane(VP9_COMMON *const cm, } void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, - VP9_COMMON *cm, MACROBLOCKD *xd, + VP9_COMMON *cm, + struct macroblockd_plane planes[MAX_MB_PLANE], int start, int stop, int y_only) { const int num_planes = y_only ? 1 : MAX_MB_PLANE; - int mi_row, mi_col; + const int use_420 = y_only || (planes[1].subsampling_y == 1 && + planes[1].subsampling_x == 1); LOOP_FILTER_MASK lfm; - int use_420 = y_only || (xd->plane[1].subsampling_y == 1 && - xd->plane[1].subsampling_x == 1); + int mi_row, mi_col; for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) { - MODE_INFO **mi_8x8 = cm->mi_grid_visible + mi_row * cm->mi_stride; + MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride; for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) { int plane; - vp9_setup_dst_planes(xd, frame_buffer, mi_row, mi_col); + vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col); // TODO(JBB): Make setup_mask work for non 420. 
if (use_420) - vp9_setup_mask(cm, mi_row, mi_col, mi_8x8 + mi_col, cm->mi_stride, + vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm); for (plane = 0; plane < num_planes; ++plane) { if (use_420) - vp9_filter_block_plane(cm, &xd->plane[plane], mi_row, &lfm); + vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm); else - filter_block_plane_non420(cm, &xd->plane[plane], mi_8x8 + mi_col, + filter_block_plane_non420(cm, &planes[plane], mi + mi_col, mi_row, mi_col); } } } } -void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd, +void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame, + VP9_COMMON *cm, MACROBLOCKD *xd, int frame_filter_level, int y_only, int partial_frame) { int start_mi_row, end_mi_row, mi_rows_to_filter; @@ -1238,7 +1242,7 @@ void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd, } end_mi_row = start_mi_row + mi_rows_to_filter; vp9_loop_filter_frame_init(cm, frame_filter_level); - vp9_loop_filter_rows(cm->frame_to_show, cm, xd, + vp9_loop_filter_rows(frame, cm, xd->plane, start_mi_row, end_mi_row, y_only); } @@ -1246,7 +1250,7 @@ void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd, int vp9_loop_filter_worker(void *arg1, void *arg2) { LFWorkerData *const lf_data = (LFWorkerData*)arg1; (void)arg2; - vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, &lf_data->xd, + vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->start, lf_data->stop, lf_data->y_only); return 1; } diff --git a/libvpx/vp9/common/vp9_loopfilter.h b/libvpx/vp9/common/vp9_loopfilter.h index 97ae9d22d..6fa2773e5 100644 --- a/libvpx/vp9/common/vp9_loopfilter.h +++ b/libvpx/vp9/common/vp9_loopfilter.h @@ -104,22 +104,23 @@ void vp9_loop_filter_init(struct VP9Common *cm); // calls this function directly. 
void vp9_loop_filter_frame_init(struct VP9Common *cm, int default_filt_lvl); -void vp9_loop_filter_frame(struct VP9Common *cm, +void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame, + struct VP9Common *cm, struct macroblockd *mbd, int filter_level, int y_only, int partial_frame); // Apply the loop filter to [start, stop) macro block rows in frame_buffer. void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, - struct VP9Common *cm, struct macroblockd *xd, + struct VP9Common *cm, + struct macroblockd_plane planes[MAX_MB_PLANE], int start, int stop, int y_only); typedef struct LoopFilterWorkerData { const YV12_BUFFER_CONFIG *frame_buffer; struct VP9Common *cm; - struct macroblockd xd; // TODO(jzern): most of this is unnecessary to the - // loopfilter. the planes are necessary as their state - // is changed during decode. + struct macroblockd_plane planes[MAX_MB_PLANE]; + int start; int stop; int y_only; diff --git a/libvpx/vp9/common/vp9_mvref_common.c b/libvpx/vp9/common/vp9_mvref_common.c index 1aab36205..ab64d3036 100644 --- a/libvpx/vp9/common/vp9_mvref_common.c +++ b/libvpx/vp9/common/vp9_mvref_common.c @@ -11,181 +11,6 @@ #include "vp9/common/vp9_mvref_common.h" -#define MVREF_NEIGHBOURS 8 - -typedef struct position { - int row; - int col; -} POSITION; - -typedef enum { - BOTH_ZERO = 0, - ZERO_PLUS_PREDICTED = 1, - BOTH_PREDICTED = 2, - NEW_PLUS_NON_INTRA = 3, - BOTH_NEW = 4, - INTRA_PLUS_NON_INTRA = 5, - BOTH_INTRA = 6, - INVALID_CASE = 9 -} motion_vector_context; - -// This is used to figure out a context for the ref blocks. The code flattens -// an array that would have 3 possible counts (0, 1 & 2) for 3 choices by -// adding 9 for each intra block, 3 for each zero mv and 1 for each new -// motion vector. This single number is then converted into a context -// with a single lookup ( counter_to_context ). 
-static const int mode_2_counter[MB_MODE_COUNT] = { - 9, // DC_PRED - 9, // V_PRED - 9, // H_PRED - 9, // D45_PRED - 9, // D135_PRED - 9, // D117_PRED - 9, // D153_PRED - 9, // D207_PRED - 9, // D63_PRED - 9, // TM_PRED - 0, // NEARESTMV - 0, // NEARMV - 3, // ZEROMV - 1, // NEWMV -}; - -// There are 3^3 different combinations of 3 counts that can be either 0,1 or -// 2. However the actual count can never be greater than 2 so the highest -// counter we need is 18. 9 is an invalid counter that's never used. -static const int counter_to_context[19] = { - BOTH_PREDICTED, // 0 - NEW_PLUS_NON_INTRA, // 1 - BOTH_NEW, // 2 - ZERO_PLUS_PREDICTED, // 3 - NEW_PLUS_NON_INTRA, // 4 - INVALID_CASE, // 5 - BOTH_ZERO, // 6 - INVALID_CASE, // 7 - INVALID_CASE, // 8 - INTRA_PLUS_NON_INTRA, // 9 - INTRA_PLUS_NON_INTRA, // 10 - INVALID_CASE, // 11 - INTRA_PLUS_NON_INTRA, // 12 - INVALID_CASE, // 13 - INVALID_CASE, // 14 - INVALID_CASE, // 15 - INVALID_CASE, // 16 - INVALID_CASE, // 17 - BOTH_INTRA // 18 -}; - -static const POSITION mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = { - // 4X4 - {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}}, - // 4X8 - {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}}, - // 8X4 - {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}}, - // 8X8 - {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}}, - // 8X16 - {{0, -1}, {-1, 0}, {1, -1}, {-1, -1}, {0, -2}, {-2, 0}, {-2, -1}, {-1, -2}}, - // 16X8 - {{-1, 0}, {0, -1}, {-1, 1}, {-1, -1}, {-2, 0}, {0, -2}, {-1, -2}, {-2, -1}}, - // 16X16 - {{-1, 0}, {0, -1}, {-1, 1}, {1, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}}, - // 16X32 - {{0, -1}, {-1, 0}, {2, -1}, {-1, -1}, {-1, 1}, {0, -3}, {-3, 0}, {-3, -3}}, - // 32X16 - {{-1, 0}, {0, -1}, {-1, 2}, {-1, -1}, {1, -1}, {-3, 0}, {0, -3}, {-3, -3}}, - // 32X32 - {{-1, 1}, {1, -1}, {-1, 2}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}}, - // 32X64 - {{0, -1}, {-1, 0}, 
{4, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {2, -1}}, - // 64X32 - {{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}}, - // 64X64 - {{-1, 3}, {3, -1}, {-1, 4}, {4, -1}, {-1, -1}, {-1, 0}, {0, -1}, {-1, 6}} -}; - -static const int idx_n_column_to_subblock[4][2] = { - {1, 2}, - {1, 3}, - {3, 2}, - {3, 3} -}; - -// clamp_mv_ref -#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units - -static void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) { - clamp_mv(mv, xd->mb_to_left_edge - MV_BORDER, - xd->mb_to_right_edge + MV_BORDER, - xd->mb_to_top_edge - MV_BORDER, - xd->mb_to_bottom_edge + MV_BORDER); -} - -// This function returns either the appropriate sub block or block's mv -// on whether the block_size < 8x8 and we have check_sub_blocks set. -static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, int which_mv, - int search_col, int block_idx) { - return block_idx >= 0 && candidate->mbmi.sb_type < BLOCK_8X8 - ? candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]] - .as_mv[which_mv] - : candidate->mbmi.mv[which_mv]; -} - - -// Performs mv sign inversion if indicated by the reference frame combination. -static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref, - const MV_REFERENCE_FRAME this_ref_frame, - const int *ref_sign_bias) { - int_mv mv = mbmi->mv[ref]; - if (ref_sign_bias[mbmi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) { - mv.as_mv.row *= -1; - mv.as_mv.col *= -1; - } - return mv; -} - -// This macro is used to add a motion vector mv_ref list if it isn't -// already in the list. If it's the second motion vector it will also -// skip all additional processing and jump to done! 
-#define ADD_MV_REF_LIST(mv) \ - do { \ - if (refmv_count) { \ - if ((mv).as_int != mv_ref_list[0].as_int) { \ - mv_ref_list[refmv_count] = (mv); \ - goto Done; \ - } \ - } else { \ - mv_ref_list[refmv_count++] = (mv); \ - } \ - } while (0) - -// If either reference frame is different, not INTRA, and they -// are different from each other scale and add the mv to our list. -#define IF_DIFF_REF_FRAME_ADD_MV(mbmi) \ - do { \ - if (is_inter_block(mbmi)) { \ - if ((mbmi)->ref_frame[0] != ref_frame) \ - ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias)); \ - if (has_second_ref(mbmi) && \ - (mbmi)->ref_frame[1] != ref_frame && \ - (mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \ - ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias)); \ - } \ - } while (0) - - -// Checks that the given mi_row, mi_col and search point -// are inside the borders of the tile. -static INLINE int is_inside(const TileInfo *const tile, - int mi_col, int mi_row, int mi_rows, - const POSITION *mi_pos) { - return !(mi_row + mi_pos->row < 0 || - mi_col + mi_pos->col < tile->mi_col_start || - mi_row + mi_pos->row >= mi_rows || - mi_col + mi_pos->col >= tile->mi_col_end); -} - // This function searches the neighbourhood of a given MB/SB // to try and find candidate reference vectors. static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, @@ -195,7 +20,7 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, int block, int mi_row, int mi_col) { const int *ref_sign_bias = cm->ref_frame_sign_bias; int i, refmv_count = 0; - const MODE_INFO *prev_mi = cm->prev_mi + const MODE_INFO *prev_mi = !cm->error_resilient_mode && cm->prev_mi ? cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col] : NULL; const MB_MODE_INFO *const prev_mbmi = prev_mi ? 
&prev_mi->mbmi : NULL; diff --git a/libvpx/vp9/common/vp9_mvref_common.h b/libvpx/vp9/common/vp9_mvref_common.h index 903ac02bb..a937b7823 100644 --- a/libvpx/vp9/common/vp9_mvref_common.h +++ b/libvpx/vp9/common/vp9_mvref_common.h @@ -21,6 +21,181 @@ extern "C" { #define RIGHT_BOTTOM_MARGIN ((VP9_ENC_BORDER_IN_PIXELS -\ VP9_INTERP_EXTEND) << 3) +#define MVREF_NEIGHBOURS 8 + +typedef struct position { + int row; + int col; +} POSITION; + +typedef enum { + BOTH_ZERO = 0, + ZERO_PLUS_PREDICTED = 1, + BOTH_PREDICTED = 2, + NEW_PLUS_NON_INTRA = 3, + BOTH_NEW = 4, + INTRA_PLUS_NON_INTRA = 5, + BOTH_INTRA = 6, + INVALID_CASE = 9 +} motion_vector_context; + +// This is used to figure out a context for the ref blocks. The code flattens +// an array that would have 3 possible counts (0, 1 & 2) for 3 choices by +// adding 9 for each intra block, 3 for each zero mv and 1 for each new +// motion vector. This single number is then converted into a context +// with a single lookup ( counter_to_context ). +static const int mode_2_counter[MB_MODE_COUNT] = { + 9, // DC_PRED + 9, // V_PRED + 9, // H_PRED + 9, // D45_PRED + 9, // D135_PRED + 9, // D117_PRED + 9, // D153_PRED + 9, // D207_PRED + 9, // D63_PRED + 9, // TM_PRED + 0, // NEARESTMV + 0, // NEARMV + 3, // ZEROMV + 1, // NEWMV +}; + +// There are 3^3 different combinations of 3 counts that can be either 0,1 or +// 2. However the actual count can never be greater than 2 so the highest +// counter we need is 18. 9 is an invalid counter that's never used. 
+static const int counter_to_context[19] = { + BOTH_PREDICTED, // 0 + NEW_PLUS_NON_INTRA, // 1 + BOTH_NEW, // 2 + ZERO_PLUS_PREDICTED, // 3 + NEW_PLUS_NON_INTRA, // 4 + INVALID_CASE, // 5 + BOTH_ZERO, // 6 + INVALID_CASE, // 7 + INVALID_CASE, // 8 + INTRA_PLUS_NON_INTRA, // 9 + INTRA_PLUS_NON_INTRA, // 10 + INVALID_CASE, // 11 + INTRA_PLUS_NON_INTRA, // 12 + INVALID_CASE, // 13 + INVALID_CASE, // 14 + INVALID_CASE, // 15 + INVALID_CASE, // 16 + INVALID_CASE, // 17 + BOTH_INTRA // 18 +}; + +static const POSITION mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = { + // 4X4 + {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}}, + // 4X8 + {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}}, + // 8X4 + {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}}, + // 8X8 + {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}}, + // 8X16 + {{0, -1}, {-1, 0}, {1, -1}, {-1, -1}, {0, -2}, {-2, 0}, {-2, -1}, {-1, -2}}, + // 16X8 + {{-1, 0}, {0, -1}, {-1, 1}, {-1, -1}, {-2, 0}, {0, -2}, {-1, -2}, {-2, -1}}, + // 16X16 + {{-1, 0}, {0, -1}, {-1, 1}, {1, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}}, + // 16X32 + {{0, -1}, {-1, 0}, {2, -1}, {-1, -1}, {-1, 1}, {0, -3}, {-3, 0}, {-3, -3}}, + // 32X16 + {{-1, 0}, {0, -1}, {-1, 2}, {-1, -1}, {1, -1}, {-3, 0}, {0, -3}, {-3, -3}}, + // 32X32 + {{-1, 1}, {1, -1}, {-1, 2}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}}, + // 32X64 + {{0, -1}, {-1, 0}, {4, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {2, -1}}, + // 64X32 + {{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}}, + // 64X64 + {{-1, 3}, {3, -1}, {-1, 4}, {4, -1}, {-1, -1}, {-1, 0}, {0, -1}, {-1, 6}} +}; + +static const int idx_n_column_to_subblock[4][2] = { + {1, 2}, + {1, 3}, + {3, 2}, + {3, 3} +}; + +// clamp_mv_ref +#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units + +static INLINE void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) { + clamp_mv(mv, 
xd->mb_to_left_edge - MV_BORDER, + xd->mb_to_right_edge + MV_BORDER, + xd->mb_to_top_edge - MV_BORDER, + xd->mb_to_bottom_edge + MV_BORDER); +} + +// This function returns either the appropriate sub block or block's mv +// on whether the block_size < 8x8 and we have check_sub_blocks set. +static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, int which_mv, + int search_col, int block_idx) { + return block_idx >= 0 && candidate->mbmi.sb_type < BLOCK_8X8 + ? candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]] + .as_mv[which_mv] + : candidate->mbmi.mv[which_mv]; +} + + +// Performs mv sign inversion if indicated by the reference frame combination. +static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref, + const MV_REFERENCE_FRAME this_ref_frame, + const int *ref_sign_bias) { + int_mv mv = mbmi->mv[ref]; + if (ref_sign_bias[mbmi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) { + mv.as_mv.row *= -1; + mv.as_mv.col *= -1; + } + return mv; +} + +// This macro is used to add a motion vector mv_ref list if it isn't +// already in the list. If it's the second motion vector it will also +// skip all additional processing and jump to done! +#define ADD_MV_REF_LIST(mv) \ + do { \ + if (refmv_count) { \ + if ((mv).as_int != mv_ref_list[0].as_int) { \ + mv_ref_list[refmv_count] = (mv); \ + goto Done; \ + } \ + } else { \ + mv_ref_list[refmv_count++] = (mv); \ + } \ + } while (0) + +// If either reference frame is different, not INTRA, and they +// are different from each other scale and add the mv to our list. 
+#define IF_DIFF_REF_FRAME_ADD_MV(mbmi) \ + do { \ + if (is_inter_block(mbmi)) { \ + if ((mbmi)->ref_frame[0] != ref_frame) \ + ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias)); \ + if (has_second_ref(mbmi) && \ + (mbmi)->ref_frame[1] != ref_frame && \ + (mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \ + ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias)); \ + } \ + } while (0) + + +// Checks that the given mi_row, mi_col and search point +// are inside the borders of the tile. +static INLINE int is_inside(const TileInfo *const tile, + int mi_col, int mi_row, int mi_rows, + const POSITION *mi_pos) { + return !(mi_row + mi_pos->row < 0 || + mi_col + mi_pos->col < tile->mi_col_start || + mi_row + mi_pos->row >= mi_rows || + mi_col + mi_pos->col >= tile->mi_col_end); +} + // TODO(jingning): this mv clamping function should be block size dependent. static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN, diff --git a/libvpx/vp9/common/vp9_onyxc_int.h b/libvpx/vp9/common/vp9_onyxc_int.h index fe9cc9e6a..dff077c11 100644 --- a/libvpx/vp9/common/vp9_onyxc_int.h +++ b/libvpx/vp9/common/vp9_onyxc_int.h @@ -68,9 +68,6 @@ typedef struct VP9Common { DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]); -#if CONFIG_ALPHA - DECLARE_ALIGNED(16, int16_t, a_dequant[QINDEX_RANGE][8]); -#endif COLOR_SPACE color_space; @@ -120,7 +117,6 @@ typedef struct VP9Common { // frame header, 3 reset all contexts. int reset_frame_context; - int frame_flags; // MBs, mb_rows/cols is in 16-pixel units; mi_rows/cols is in // MODE_INFO (8-pixel) units. 
int MBs; @@ -135,14 +131,15 @@ typedef struct VP9Common { int y_dc_delta_q; int uv_dc_delta_q; int uv_ac_delta_q; -#if CONFIG_ALPHA - int a_dc_delta_q; - int a_ac_delta_q; -#endif /* We allocate a MODE_INFO struct for each macroblock, together with an extra row on top and column on the left to simplify prediction. */ + int mi_idx; + int prev_mi_idx; + MODE_INFO *mip_array[2]; + MODE_INFO **mi_grid_base_array[2]; + MODE_INFO *mip; /* Base of allocated array */ MODE_INFO *mi; /* Corresponds to upper left visible macroblock */ MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */ @@ -191,11 +188,6 @@ typedef struct VP9Common { int error_resilient_mode; int frame_parallel_decoding_mode; - // Flag indicates if prev_mi can be used in coding: - // 0: encoder assumes decoder does not have prev_mi - // 1: encoder assumes decoder has and uses prev_mi - unsigned int coding_use_prev_mi; - int log2_tile_cols, log2_tile_rows; // Private data associated with the frame buffer callbacks. @@ -210,6 +202,15 @@ typedef struct VP9Common { ENTROPY_CONTEXT *above_context; } VP9_COMMON; +static INLINE YV12_BUFFER_CONFIG *get_ref_frame(VP9_COMMON *cm, int index) { + if (index < 0 || index >= REF_FRAMES) + return NULL; + if (cm->ref_frame_map[index] < 0) + return NULL; + assert(cm->ref_frame_map[index] < REF_FRAMES); + return &cm->frame_bufs[cm->ref_frame_map[index]].buf; +} + static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) { return &cm->frame_bufs[cm->new_fb_idx].buf; } @@ -253,10 +254,14 @@ static INLINE void init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd) { xd->mi_stride = cm->mi_stride; } +static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) { + return cm->frame_type == KEY_FRAME || cm->intra_only; +} + static INLINE const vp9_prob* get_partition_probs(const VP9_COMMON *cm, int ctx) { - return cm->frame_type == KEY_FRAME ? vp9_kf_partition_probs[ctx] - : cm->fc.partition_prob[ctx]; + return frame_is_intra_only(cm) ? 
vp9_kf_partition_probs[ctx] + : cm->fc.partition_prob[ctx]; } static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col) { @@ -284,19 +289,15 @@ static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, xd->left_available = (mi_col > tile->mi_col_start); } -static INLINE MODE_INFO *get_prev_mi(VP9_COMMON *cm) { - const int use_prev_mi = cm->coding_use_prev_mi && - cm->width == cm->last_width && - cm->height == cm->last_height && - !cm->intra_only && - cm->last_show_frame; +static INLINE void set_prev_mi(VP9_COMMON *cm) { + const int use_prev_in_find_mv_refs = cm->width == cm->last_width && + cm->height == cm->last_height && + !cm->intra_only && + cm->last_show_frame; // Special case: set prev_mi to NULL when the previous mode info // context cannot be used. - return use_prev_mi ? &cm->prev_mip[cm->mi_stride + 1] : NULL; -} - -static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) { - return cm->frame_type == KEY_FRAME || cm->intra_only; + cm->prev_mi = use_prev_in_find_mv_refs ? 
+ cm->prev_mip + cm->mi_stride + 1 : NULL; } static INLINE void update_partition_context(MACROBLOCKD *xd, diff --git a/libvpx/vp9/common/vp9_postproc.c b/libvpx/vp9/common/vp9_postproc.c index 7baa9ee33..abda4e682 100644 --- a/libvpx/vp9/common/vp9_postproc.c +++ b/libvpx/vp9/common/vp9_postproc.c @@ -24,61 +24,7 @@ #include "vp9/common/vp9_systemdependent.h" #include "vp9/common/vp9_textblit.h" -#define RGB_TO_YUV(t) \ - ( (0.257*(float)(t >> 16)) + (0.504*(float)(t >> 8 & 0xff)) + \ - (0.098*(float)(t & 0xff)) + 16), \ - (-(0.148*(float)(t >> 16)) - (0.291*(float)(t >> 8 & 0xff)) + \ - (0.439*(float)(t & 0xff)) + 128), \ - ( (0.439*(float)(t >> 16)) - (0.368*(float)(t >> 8 & 0xff)) - \ - (0.071*(float)(t & 0xff)) + 128) - -/* global constants */ -#if 0 && CONFIG_POSTPROC_VISUALIZER -static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] = { - { RGB_TO_YUV(0x98FB98) }, /* PaleGreen */ - { RGB_TO_YUV(0x00FF00) }, /* Green */ - { RGB_TO_YUV(0xADFF2F) }, /* GreenYellow */ - { RGB_TO_YUV(0x8F0000) }, /* Dark Red */ - { RGB_TO_YUV(0x008F8F) }, /* Dark Cyan */ - { RGB_TO_YUV(0x008F8F) }, /* Dark Cyan */ - { RGB_TO_YUV(0x008F8F) }, /* Dark Cyan */ - { RGB_TO_YUV(0x8F0000) }, /* Dark Red */ - { RGB_TO_YUV(0x8F0000) }, /* Dark Red */ - { RGB_TO_YUV(0x228B22) }, /* ForestGreen */ - { RGB_TO_YUV(0x006400) }, /* DarkGreen */ - { RGB_TO_YUV(0x98F5FF) }, /* Cadet Blue */ - { RGB_TO_YUV(0x6CA6CD) }, /* Sky Blue */ - { RGB_TO_YUV(0x00008B) }, /* Dark blue */ - { RGB_TO_YUV(0x551A8B) }, /* Purple */ - { RGB_TO_YUV(0xFF0000) } /* Red */ - { RGB_TO_YUV(0xCC33FF) }, /* Magenta */ -}; - -static const unsigned char B_PREDICTION_MODE_colors[INTRA_MODES][3] = { - { RGB_TO_YUV(0x6633ff) }, /* Purple */ - { RGB_TO_YUV(0xcc33ff) }, /* Magenta */ - { RGB_TO_YUV(0xff33cc) }, /* Pink */ - { RGB_TO_YUV(0xff3366) }, /* Coral */ - { RGB_TO_YUV(0x3366ff) }, /* Blue */ - { RGB_TO_YUV(0xed00f5) }, /* Dark Blue */ - { RGB_TO_YUV(0x2e00b8) }, /* Dark Purple */ - { RGB_TO_YUV(0xff6633) 
}, /* Orange */ - { RGB_TO_YUV(0x33ccff) }, /* Light Blue */ - { RGB_TO_YUV(0x8ab800) }, /* Green */ - { RGB_TO_YUV(0xffcc33) }, /* Light Orange */ - { RGB_TO_YUV(0x33ffcc) }, /* Aqua */ - { RGB_TO_YUV(0x66ff33) }, /* Light Green */ - { RGB_TO_YUV(0xccff33) }, /* Yellow */ -}; - -static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] = { - { RGB_TO_YUV(0x00ff00) }, /* Blue */ - { RGB_TO_YUV(0x0000ff) }, /* Green */ - { RGB_TO_YUV(0xffff00) }, /* Yellow */ - { RGB_TO_YUV(0xff0000) }, /* Red */ -}; -#endif - +#if CONFIG_VP9_POSTPROC static const short kernel5[] = { 1, 1, 4, 1, 1 }; @@ -317,19 +263,13 @@ void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, + 0.0065 + 0.5); int i; - const uint8_t *const srcs[4] = {src->y_buffer, src->u_buffer, src->v_buffer, - src->alpha_buffer}; - const int src_strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, - src->alpha_stride}; - const int src_widths[4] = {src->y_width, src->uv_width, src->uv_width, - src->alpha_width}; - const int src_heights[4] = {src->y_height, src->uv_height, src->uv_height, - src->alpha_height}; + const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer}; + const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride}; + const int src_widths[3] = {src->y_width, src->uv_width, src->uv_width}; + const int src_heights[3] = {src->y_height, src->uv_height, src->uv_height}; - uint8_t *const dsts[4] = {dst->y_buffer, dst->u_buffer, dst->v_buffer, - dst->alpha_buffer}; - const int dst_strides[4] = {dst->y_stride, dst->uv_stride, dst->uv_stride, - dst->alpha_stride}; + uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer}; + const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride}; for (i = 0; i < MAX_MB_PLANE; ++i) vp9_post_proc_down_and_across(srcs[i], dsts[i], @@ -343,19 +283,13 @@ void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, + 0.0065 + 0.5); int i; - const uint8_t *const 
srcs[4] = {src->y_buffer, src->u_buffer, src->v_buffer, - src->alpha_buffer}; - const int src_strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, - src->alpha_stride}; - const int src_widths[4] = {src->y_width, src->uv_width, src->uv_width, - src->alpha_width}; - const int src_heights[4] = {src->y_height, src->uv_height, src->uv_height, - src->alpha_height}; + const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer}; + const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride}; + const int src_widths[3] = {src->y_width, src->uv_width, src->uv_width}; + const int src_heights[3] = {src->y_height, src->uv_height, src->uv_height}; - uint8_t *const dsts[4] = {dst->y_buffer, dst->u_buffer, dst->v_buffer, - dst->alpha_buffer}; - const int dst_strides[4] = {dst->y_stride, dst->uv_stride, dst->uv_stride, - dst->alpha_stride}; + uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer}; + const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride}; for (i = 0; i < MAX_MB_PLANE; ++i) { const int src_stride = src_strides[i]; @@ -448,163 +382,6 @@ void vp9_plane_add_noise_c(uint8_t *start, char *noise, } } -/* Blend the macro block with a solid colored square. Leave the - * edges unblended to give distinction to macro blocks in areas - * filled with the same color block. 
- */ -void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, - int y1, int u1, int v1, int alpha, int stride) { - int i, j; - int y1_const = y1 * ((1 << 16) - alpha); - int u1_const = u1 * ((1 << 16) - alpha); - int v1_const = v1 * ((1 << 16) - alpha); - - y += 2 * stride + 2; - for (i = 0; i < 12; i++) { - for (j = 0; j < 12; j++) { - y[j] = (y[j] * alpha + y1_const) >> 16; - } - y += stride; - } - - stride >>= 1; - - u += stride + 1; - v += stride + 1; - - for (i = 0; i < 6; i++) { - for (j = 0; j < 6; j++) { - u[j] = (u[j] * alpha + u1_const) >> 16; - v[j] = (v[j] * alpha + v1_const) >> 16; - } - u += stride; - v += stride; - } -} - -/* Blend only the edge of the macro block. Leave center - * unblended to allow for other visualizations to be layered. - */ -void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, - int y1, int u1, int v1, int alpha, int stride) { - int i, j; - int y1_const = y1 * ((1 << 16) - alpha); - int u1_const = u1 * ((1 << 16) - alpha); - int v1_const = v1 * ((1 << 16) - alpha); - - for (i = 0; i < 2; i++) { - for (j = 0; j < 16; j++) { - y[j] = (y[j] * alpha + y1_const) >> 16; - } - y += stride; - } - - for (i = 0; i < 12; i++) { - y[0] = (y[0] * alpha + y1_const) >> 16; - y[1] = (y[1] * alpha + y1_const) >> 16; - y[14] = (y[14] * alpha + y1_const) >> 16; - y[15] = (y[15] * alpha + y1_const) >> 16; - y += stride; - } - - for (i = 0; i < 2; i++) { - for (j = 0; j < 16; j++) { - y[j] = (y[j] * alpha + y1_const) >> 16; - } - y += stride; - } - - stride >>= 1; - - for (j = 0; j < 8; j++) { - u[j] = (u[j] * alpha + u1_const) >> 16; - v[j] = (v[j] * alpha + v1_const) >> 16; - } - u += stride; - v += stride; - - for (i = 0; i < 6; i++) { - u[0] = (u[0] * alpha + u1_const) >> 16; - v[0] = (v[0] * alpha + v1_const) >> 16; - - u[7] = (u[7] * alpha + u1_const) >> 16; - v[7] = (v[7] * alpha + v1_const) >> 16; - - u += stride; - v += stride; - } - - for (j = 0; j < 8; j++) { - u[j] = (u[j] * alpha + u1_const) >> 16; - v[j] = (v[j] * alpha + 
v1_const) >> 16; - } -} - -void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, - int y1, int u1, int v1, int alpha, int stride) { - int i, j; - int y1_const = y1 * ((1 << 16) - alpha); - int u1_const = u1 * ((1 << 16) - alpha); - int v1_const = v1 * ((1 << 16) - alpha); - - for (i = 0; i < 4; i++) { - for (j = 0; j < 4; j++) { - y[j] = (y[j] * alpha + y1_const) >> 16; - } - y += stride; - } - - stride >>= 1; - - for (i = 0; i < 2; i++) { - for (j = 0; j < 2; j++) { - u[j] = (u[j] * alpha + u1_const) >> 16; - v[j] = (v[j] * alpha + v1_const) >> 16; - } - u += stride; - v += stride; - } -} - -static void constrain_line(int x0, int *x1, int y0, int *y1, - int width, int height) { - int dx; - int dy; - - if (*x1 > width) { - dx = *x1 - x0; - dy = *y1 - y0; - - *x1 = width; - if (dx) - *y1 = ((width - x0) * dy) / dx + y0; - } - if (*x1 < 0) { - dx = *x1 - x0; - dy = *y1 - y0; - - *x1 = 0; - if (dx) - *y1 = ((0 - x0) * dy) / dx + y0; - } - if (*y1 > height) { - dx = *x1 - x0; - dy = *y1 - y0; - - *y1 = height; - if (dy) - *x1 = ((height - y0) * dx) / dy + x0; - } - if (*y1 < 0) { - dx = *x1 - x0; - dy = *y1 - y0; - - *y1 = 0; - if (dy) - *x1 = ((0 - y0) * dx) / dy + x0; - } -} - int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *ppflags) { const int q = MIN(63, cm->lf.filter_level * 10 / 6); @@ -622,6 +399,14 @@ int vp9_post_proc_frame(struct VP9Common *cm, vp9_clear_system_state(); +#if CONFIG_VP9_POSTPROC || CONFIG_INTERNAL_STATS + if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, + VP9_DEC_BORDER_IN_PIXELS, NULL, NULL, NULL) < 0) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate post-processing buffer"); +#endif + if (flags & VP9D_DEMACROBLOCK) { deblock_and_de_macro_block(cm->frame_to_show, ppbuf, q + (ppflags->deblocking_level - 5) * 10, 1, 0); @@ -643,328 +428,6 @@ int vp9_post_proc_frame(struct VP9Common *cm, ppbuf->y_width, 
ppbuf->y_height, ppbuf->y_stride); } -#if 0 && CONFIG_POSTPROC_VISUALIZER - if (flags & VP9D_DEBUG_TXT_FRAME_INFO) { - char message[512]; - snprintf(message, sizeof(message) -1, - "F%1dG%1dQ%3dF%3dP%d_s%dx%d", - (cm->frame_type == KEY_FRAME), - cm->refresh_golden_frame, - cm->base_qindex, - cm->filter_level, - flags, - cm->mb_cols, cm->mb_rows); - vp9_blit_text(message, ppbuf->y_buffer, ppbuf->y_stride); - } - - if (flags & VP9D_DEBUG_TXT_MBLK_MODES) { - int i, j; - uint8_t *y_ptr; - int mb_rows = ppbuf->y_height >> 4; - int mb_cols = ppbuf->y_width >> 4; - int mb_index = 0; - MODE_INFO *mi = cm->mi; - - y_ptr = post->y_buffer + 4 * post->y_stride + 4; - - /* vp9_filter each macro block */ - for (i = 0; i < mb_rows; i++) { - for (j = 0; j < mb_cols; j++) { - char zz[4]; - - snprintf(zz, sizeof(zz) - 1, "%c", mi[mb_index].mbmi.mode + 'a'); - - vp9_blit_text(zz, y_ptr, post->y_stride); - mb_index++; - y_ptr += 16; - } - - mb_index++; /* border */ - y_ptr += post->y_stride * 16 - post->y_width; - } - } - - if (flags & VP9D_DEBUG_TXT_DC_DIFF) { - int i, j; - uint8_t *y_ptr; - int mb_rows = ppbuf->y_height >> 4; - int mb_cols = ppbuf->y_width >> 4; - int mb_index = 0; - MODE_INFO *mi = cm->mi; - - y_ptr = post->y_buffer + 4 * post->y_stride + 4; - - /* vp9_filter each macro block */ - for (i = 0; i < mb_rows; i++) { - for (j = 0; j < mb_cols; j++) { - char zz[4]; - int dc_diff = !(mi[mb_index].mbmi.mode != I4X4_PRED && - mi[mb_index].mbmi.mode != SPLITMV && - mi[mb_index].mbmi.skip); - - if (cm->frame_type == KEY_FRAME) - snprintf(zz, sizeof(zz) - 1, "a"); - else - snprintf(zz, sizeof(zz) - 1, "%c", dc_diff + '0'); - - vp9_blit_text(zz, y_ptr, post->y_stride); - mb_index++; - y_ptr += 16; - } - - mb_index++; /* border */ - y_ptr += post->y_stride * 16 - post->y_width; - } - } - - if (flags & VP9D_DEBUG_TXT_RATE_INFO) { - char message[512]; - snprintf(message, sizeof(message), - "Bitrate: %10.2f framerate: %10.2f ", - cm->bitrate, cm->framerate); - vp9_blit_text(message, 
ppbuf->y_buffer, ppbuf->y_stride); - } - - /* Draw motion vectors */ - if ((flags & VP9D_DEBUG_DRAW_MV) && ppflags->display_mv_flag) { - int width = ppbuf->y_width; - int height = ppbuf->y_height; - uint8_t *y_buffer = ppbuf->y_buffer; - int y_stride = ppbuf->y_stride; - MODE_INFO *mi = cm->mi; - int x0, y0; - - for (y0 = 0; y0 < height; y0 += 16) { - for (x0 = 0; x0 < width; x0 += 16) { - int x1, y1; - - if (!(ppflags->display_mv_flag & (1 << mi->mbmi.mode))) { - mi++; - continue; - } - - if (mi->mbmi.mode == SPLITMV) { - switch (mi->mbmi.partitioning) { - case PARTITIONING_16X8 : { /* mv_top_bottom */ - union b_mode_info *bmi = &mi->bmi[0]; - MV *mv = &bmi->mv.as_mv; - - x1 = x0 + 8 + (mv->col >> 3); - y1 = y0 + 4 + (mv->row >> 3); - - constrain_line(x0 + 8, &x1, y0 + 4, &y1, width, height); - vp9_blit_line(x0 + 8, x1, y0 + 4, y1, y_buffer, y_stride); - - bmi = &mi->bmi[8]; - - x1 = x0 + 8 + (mv->col >> 3); - y1 = y0 + 12 + (mv->row >> 3); - - constrain_line(x0 + 8, &x1, y0 + 12, &y1, width, height); - vp9_blit_line(x0 + 8, x1, y0 + 12, y1, y_buffer, y_stride); - - break; - } - case PARTITIONING_8X16 : { /* mv_left_right */ - union b_mode_info *bmi = &mi->bmi[0]; - MV *mv = &bmi->mv.as_mv; - - x1 = x0 + 4 + (mv->col >> 3); - y1 = y0 + 8 + (mv->row >> 3); - - constrain_line(x0 + 4, &x1, y0 + 8, &y1, width, height); - vp9_blit_line(x0 + 4, x1, y0 + 8, y1, y_buffer, y_stride); - - bmi = &mi->bmi[2]; - - x1 = x0 + 12 + (mv->col >> 3); - y1 = y0 + 8 + (mv->row >> 3); - - constrain_line(x0 + 12, &x1, y0 + 8, &y1, width, height); - vp9_blit_line(x0 + 12, x1, y0 + 8, y1, y_buffer, y_stride); - - break; - } - case PARTITIONING_8X8 : { /* mv_quarters */ - union b_mode_info *bmi = &mi->bmi[0]; - MV *mv = &bmi->mv.as_mv; - - x1 = x0 + 4 + (mv->col >> 3); - y1 = y0 + 4 + (mv->row >> 3); - - constrain_line(x0 + 4, &x1, y0 + 4, &y1, width, height); - vp9_blit_line(x0 + 4, x1, y0 + 4, y1, y_buffer, y_stride); - - bmi = &mi->bmi[2]; - - x1 = x0 + 12 + (mv->col >> 3); - y1 = y0 + 
4 + (mv->row >> 3); - - constrain_line(x0 + 12, &x1, y0 + 4, &y1, width, height); - vp9_blit_line(x0 + 12, x1, y0 + 4, y1, y_buffer, y_stride); - - bmi = &mi->bmi[8]; - - x1 = x0 + 4 + (mv->col >> 3); - y1 = y0 + 12 + (mv->row >> 3); - - constrain_line(x0 + 4, &x1, y0 + 12, &y1, width, height); - vp9_blit_line(x0 + 4, x1, y0 + 12, y1, y_buffer, y_stride); - - bmi = &mi->bmi[10]; - - x1 = x0 + 12 + (mv->col >> 3); - y1 = y0 + 12 + (mv->row >> 3); - - constrain_line(x0 + 12, &x1, y0 + 12, &y1, width, height); - vp9_blit_line(x0 + 12, x1, y0 + 12, y1, y_buffer, y_stride); - break; - } - case PARTITIONING_4X4: - default : { - union b_mode_info *bmi = mi->bmi; - int bx0, by0; - - for (by0 = y0; by0 < (y0 + 16); by0 += 4) { - for (bx0 = x0; bx0 < (x0 + 16); bx0 += 4) { - MV *mv = &bmi->mv.as_mv; - - x1 = bx0 + 2 + (mv->col >> 3); - y1 = by0 + 2 + (mv->row >> 3); - - constrain_line(bx0 + 2, &x1, by0 + 2, &y1, width, height); - vp9_blit_line(bx0 + 2, x1, by0 + 2, y1, y_buffer, y_stride); - - bmi++; - } - } - } - } - } else if (is_inter_mode(mi->mbmi.mode)) { - MV *mv = &mi->mbmi.mv.as_mv; - const int lx0 = x0 + 8; - const int ly0 = y0 + 8; - - x1 = lx0 + (mv->col >> 3); - y1 = ly0 + (mv->row >> 3); - - if (x1 != lx0 && y1 != ly0) { - constrain_line(lx0, &x1, ly0 - 1, &y1, width, height); - vp9_blit_line(lx0, x1, ly0 - 1, y1, y_buffer, y_stride); - - constrain_line(lx0, &x1, ly0 + 1, &y1, width, height); - vp9_blit_line(lx0, x1, ly0 + 1, y1, y_buffer, y_stride); - } else { - vp9_blit_line(lx0, x1, ly0, y1, y_buffer, y_stride); - } - } - - mi++; - } - mi++; - } - } - - /* Color in block modes */ - if ((flags & VP9D_DEBUG_CLR_BLK_MODES) - && (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag)) { - int y, x; - int width = ppbuf->y_width; - int height = ppbuf->y_height; - uint8_t *y_ptr = ppbuf->y_buffer; - uint8_t *u_ptr = ppbuf->u_buffer; - uint8_t *v_ptr = ppbuf->v_buffer; - int y_stride = ppbuf->y_stride; - MODE_INFO *mi = cm->mi; - - for (y = 0; y < height; y 
+= 16) { - for (x = 0; x < width; x += 16) { - int Y = 0, U = 0, V = 0; - - if (mi->mbmi.mode == I4X4_PRED && - ((ppflags->display_mb_modes_flag & I4X4_PRED) || - ppflags->display_b_modes_flag)) { - int by, bx; - uint8_t *yl, *ul, *vl; - union b_mode_info *bmi = mi->bmi; - - yl = y_ptr + x; - ul = u_ptr + (x >> 1); - vl = v_ptr + (x >> 1); - - for (by = 0; by < 16; by += 4) { - for (bx = 0; bx < 16; bx += 4) { - if ((ppflags->display_b_modes_flag & (1 << mi->mbmi.mode)) - || (ppflags->display_mb_modes_flag & I4X4_PRED)) { - Y = B_PREDICTION_MODE_colors[bmi->as_mode][0]; - U = B_PREDICTION_MODE_colors[bmi->as_mode][1]; - V = B_PREDICTION_MODE_colors[bmi->as_mode][2]; - - vp9_blend_b(yl + bx, ul + (bx >> 1), vl + (bx >> 1), Y, U, V, - 0xc000, y_stride); - } - bmi++; - } - - yl += y_stride * 4; - ul += y_stride * 1; - vl += y_stride * 1; - } - } else if (ppflags->display_mb_modes_flag & (1 << mi->mbmi.mode)) { - Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0]; - U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1]; - V = MB_PREDICTION_MODE_colors[mi->mbmi.mode][2]; - - vp9_blend_mb_inner(y_ptr + x, u_ptr + (x >> 1), v_ptr + (x >> 1), - Y, U, V, 0xc000, y_stride); - } - - mi++; - } - y_ptr += y_stride * 16; - u_ptr += y_stride * 4; - v_ptr += y_stride * 4; - - mi++; - } - } - - /* Color in frame reference blocks */ - if ((flags & VP9D_DEBUG_CLR_FRM_REF_BLKS) && - ppflags->display_ref_frame_flag) { - int y, x; - int width = ppbuf->y_width; - int height = ppbuf->y_height; - uint8_t *y_ptr = ppbuf->y_buffer; - uint8_t *u_ptr = ppbuf->u_buffer; - uint8_t *v_ptr = ppbuf->v_buffer; - int y_stride = ppbuf->y_stride; - MODE_INFO *mi = cm->mi; - - for (y = 0; y < height; y += 16) { - for (x = 0; x < width; x += 16) { - int Y = 0, U = 0, V = 0; - - if (ppflags->display_ref_frame_flag & (1 << mi->mbmi.ref_frame)) { - Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0]; - U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1]; - V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2]; - - 
vp9_blend_mb_outer(y_ptr + x, u_ptr + (x >> 1), v_ptr + (x >> 1), - Y, U, V, 0xc000, y_stride); - } - - mi++; - } - y_ptr += y_stride * 16; - u_ptr += y_stride * 4; - v_ptr += y_stride * 4; - - mi++; - } - } -#endif - *dest = *ppbuf; /* handle problem with extending borders */ @@ -975,3 +438,4 @@ int vp9_post_proc_frame(struct VP9Common *cm, return 0; } +#endif diff --git a/libvpx/vp9/common/vp9_ppflags.h b/libvpx/vp9/common/vp9_ppflags.h index e8b04d2e6..1644a1bbb 100644 --- a/libvpx/vp9/common/vp9_ppflags.h +++ b/libvpx/vp9/common/vp9_ppflags.h @@ -33,12 +33,6 @@ typedef struct { int post_proc_flag; int deblocking_level; int noise_level; -#if CONFIG_POSTPROC_VISUALIZER - int display_ref_frame_flag; - int display_mb_modes_flag; - int display_b_modes_flag; - int display_mv_flag; -#endif // CONFIG_POSTPROC_VISUALIZER } vp9_ppflags_t; #ifdef __cplusplus diff --git a/libvpx/vp9/common/vp9_pragmas.h b/libvpx/vp9/common/vp9_pragmas.h deleted file mode 100644 index 0efc713ca..000000000 --- a/libvpx/vp9/common/vp9_pragmas.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#ifndef VP9_COMMON_VP9_PRAGMAS_H_ -#define VP9_COMMON_VP9_PRAGMAS_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef __INTEL_COMPILER -#pragma warning(disable:997 1011 170) -#endif - -#ifdef _MSC_VER -#pragma warning(disable:4799) -#endif - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VP9_COMMON_VP9_PRAGMAS_H_ diff --git a/libvpx/vp9/common/vp9_pred_common.c b/libvpx/vp9/common/vp9_pred_common.c index bc9d6ef5e..014638466 100644 --- a/libvpx/vp9/common/vp9_pred_common.c +++ b/libvpx/vp9/common/vp9_pred_common.c @@ -353,9 +353,9 @@ int vp9_get_tx_size_context(const MACROBLOCKD *xd) { const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); const int has_above = above_mbmi != NULL; const int has_left = left_mbmi != NULL; - int above_ctx = (has_above && !above_mbmi->skip) ? above_mbmi->tx_size + int above_ctx = (has_above && !above_mbmi->skip) ? (int)above_mbmi->tx_size : max_tx_size; - int left_ctx = (has_left && !left_mbmi->skip) ? left_mbmi->tx_size + int left_ctx = (has_left && !left_mbmi->skip) ? (int)left_mbmi->tx_size : max_tx_size; if (!has_left) left_ctx = above_ctx; @@ -366,7 +366,7 @@ int vp9_get_tx_size_context(const MACROBLOCKD *xd) { return (above_ctx + left_ctx) > max_tx_size; } -int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids, +int vp9_get_segment_id(const VP9_COMMON *cm, const uint8_t *segment_ids, BLOCK_SIZE bsize, int mi_row, int mi_col) { const int mi_offset = mi_row * cm->mi_cols + mi_col; const int bw = num_8x8_blocks_wide_lookup[bsize]; diff --git a/libvpx/vp9/common/vp9_pred_common.h b/libvpx/vp9/common/vp9_pred_common.h index 1a7ba86e4..2c965068a 100644 --- a/libvpx/vp9/common/vp9_pred_common.h +++ b/libvpx/vp9/common/vp9_pred_common.h @@ -26,7 +26,7 @@ static INLINE const MODE_INFO *get_left_mi(const MACROBLOCKD *const xd) { return xd->left_available ? 
xd->mi[-1] : NULL; } -int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids, +int vp9_get_segment_id(const VP9_COMMON *cm, const uint8_t *segment_ids, BLOCK_SIZE bsize, int mi_row, int mi_col); static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) { diff --git a/libvpx/vp9/common/vp9_prob.h b/libvpx/vp9/common/vp9_prob.h index f36148035..fa0e36da4 100644 --- a/libvpx/vp9/common/vp9_prob.h +++ b/libvpx/vp9/common/vp9_prob.h @@ -44,21 +44,12 @@ typedef int8_t vp9_tree_index; typedef const vp9_tree_index vp9_tree[]; static INLINE vp9_prob clip_prob(int p) { - return (p > 255) ? 255u : (p < 1) ? 1u : p; + return (p > 255) ? 255 : (p < 1) ? 1 : p; } -// int64 is not needed for normal frame level calculations. -// However when outputting entropy stats accumulated over many frames -// or even clips we can overflow int math. -#ifdef ENTROPY_STATS static INLINE vp9_prob get_prob(int num, int den) { return (den == 0) ? 128u : clip_prob(((int64_t)num * 256 + (den >> 1)) / den); } -#else -static INLINE vp9_prob get_prob(int num, int den) { - return (den == 0) ? 
128u : clip_prob((num * 256 + (den >> 1)) / den); -} -#endif static INLINE vp9_prob get_binary_prob(int n0, int n1) { return get_prob(n0, n0 + n1); diff --git a/libvpx/vp9/common/vp9_quant_common.c b/libvpx/vp9/common/vp9_quant_common.c index def12554d..3332e58e6 100644 --- a/libvpx/vp9/common/vp9_quant_common.c +++ b/libvpx/vp9/common/vp9_quant_common.c @@ -12,7 +12,6 @@ #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_seg_common.h" -#if 1 static const int16_t dc_qlookup[QINDEX_RANGE] = { 4, 8, 8, 9, 10, 11, 12, 12, 13, 14, 15, 16, 17, 18, 19, 19, @@ -83,44 +82,6 @@ static const int16_t ac_qlookup[QINDEX_RANGE] = { 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828, }; -void vp9_init_quant_tables(void) { } -#else -static int16_t dc_qlookup[QINDEX_RANGE]; -static int16_t ac_qlookup[QINDEX_RANGE]; - -#define ACDC_MIN 8 - -// TODO(dkovalev) move to common and reuse -static double poly3(double a, double b, double c, double d, double x) { - return a*x*x*x + b*x*x + c*x + d; -} - -void vp9_init_quant_tables() { - int i, val = 4; - - // A "real" q of 1.0 forces lossless mode. - // In practice non lossless Q's between 1.0 and 2.0 (represented here by - // integer values from 5-7 give poor rd results (lower psnr and often - // larger size than the lossless encode. To block out those "not very useful" - // values we increment the ac and dc q lookup values by 4 after position 0. 
- ac_qlookup[0] = val; - dc_qlookup[0] = val; - val += 4; - - for (i = 1; i < QINDEX_RANGE; i++) { - const int ac_val = val; - - val = (int)(val * 1.01975); - if (val == ac_val) - ++val; - - ac_qlookup[i] = (int16_t)ac_val; - dc_qlookup[i] = (int16_t)MAX(ACDC_MIN, poly3(0.000000305, -0.00065, 0.9, - 0.5, ac_val)); - } -} -#endif - int16_t vp9_dc_quant(int qindex, int delta) { return dc_qlookup[clamp(qindex + delta, 0, MAXQ)]; } diff --git a/libvpx/vp9/common/vp9_quant_common.h b/libvpx/vp9/common/vp9_quant_common.h index 581104006..d1545d93c 100644 --- a/libvpx/vp9/common/vp9_quant_common.h +++ b/libvpx/vp9/common/vp9_quant_common.h @@ -22,8 +22,6 @@ extern "C" { #define QINDEX_RANGE (MAXQ - MINQ + 1) #define QINDEX_BITS 8 -void vp9_init_quant_tables(); - int16_t vp9_dc_quant(int qindex, int delta); int16_t vp9_ac_quant(int qindex, int delta); diff --git a/libvpx/vp9/common/vp9_reconinter.c b/libvpx/vp9/common/vp9_reconinter.c index e722d6a3e..86ae64839 100644 --- a/libvpx/vp9/common/vp9_reconinter.c +++ b/libvpx/vp9/common/vp9_reconinter.c @@ -113,6 +113,18 @@ static MV mi_mv_pred_q4(const MODE_INFO *mi, int idx) { return res; } +static INLINE int round_mv_comp_q2(int value) { + return (value < 0 ? 
value - 1 : value + 1) / 2; +} + +static MV mi_mv_pred_q2(const MODE_INFO *mi, int idx, int block0, int block1) { + MV res = { round_mv_comp_q2(mi->bmi[block0].as_mv[idx].as_mv.row + + mi->bmi[block1].as_mv[idx].as_mv.row), + round_mv_comp_q2(mi->bmi[block0].as_mv[idx].as_mv.col + + mi->bmi[block1].as_mv[idx].as_mv.col) }; + return res; +} + // TODO(jkoleszar): yet another mv clamping function :-( MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv, int bw, int bh, int ss_x, int ss_y) { @@ -139,6 +151,29 @@ MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv, return clamped_mv; } +static MV average_split_mvs(const struct macroblockd_plane *pd, + const MODE_INFO *mi, int ref, int block) { + const int ss_idx = ((pd->subsampling_x > 0) << 1) | (pd->subsampling_y > 0); + MV res = {0, 0}; + switch (ss_idx) { + case 0: + res = mi->bmi[block].as_mv[ref].as_mv; + break; + case 1: + res = mi_mv_pred_q2(mi, ref, block, block + 2); + break; + case 2: + res = mi_mv_pred_q2(mi, ref, block, block + 1); + break; + case 3: + res = mi_mv_pred_q4(mi, ref); + break; + default: + assert(ss_idx <= 3 || ss_idx >= 0); + } + return res; +} + static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, int bw, int bh, int x, int y, int w, int h, @@ -154,14 +189,8 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, struct buf_2d *const pre_buf = &pd->pre[ref]; struct buf_2d *const dst_buf = &pd->dst; uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; - - // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the - // same MV (the average of the 4 luma MVs) but we could do something - // smarter for non-4:2:0. Just punt for now, pending the changes to get - // rid of SPLITMV mode entirely. const MV mv = mi->mbmi.sb_type < BLOCK_8X8 - ? (plane == 0 ? mi->bmi[block].as_mv[ref].as_mv - : mi_mv_pred_q4(mi, ref)) + ? 
average_split_mvs(pd, mi, ref, block) : mi->mbmi.mv[ref].as_mv; // TODO(jkoleszar): This clamping is done in the incorrect place for the @@ -258,16 +287,11 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block, struct buf_2d *const pre_buf = &pd->pre[ref]; struct buf_2d *const dst_buf = &pd->dst; uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; - - // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the - // same MV (the average of the 4 luma MVs) but we could do something - // smarter for non-4:2:0. Just punt for now, pending the changes to get - // rid of SPLITMV mode entirely. const MV mv = mi->mbmi.sb_type < BLOCK_8X8 - ? (plane == 0 ? mi->bmi[block].as_mv[ref].as_mv - : mi_mv_pred_q4(mi, ref)) + ? average_split_mvs(pd, mi, ref, block) : mi->mbmi.mv[ref].as_mv; + // TODO(jkoleszar): This clamping is done in the incorrect place for the // scaling case. It needs to be done on the scaled MV, not the pre-scaling // MV. Note however that it performs the subsampling aware scaling so @@ -365,7 +389,7 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block, } // Skip border extension if block is inside the frame. - if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width || + if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 || y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) { uint8_t *buf_ptr1 = ref_frame + y0 * pre_buf->stride + x0; // Extend the border. 
@@ -409,7 +433,7 @@ void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, } } -void vp9_setup_dst_planes(MACROBLOCKD *xd, +void vp9_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE], const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col) { uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, @@ -419,7 +443,7 @@ void vp9_setup_dst_planes(MACROBLOCKD *xd, int i; for (i = 0; i < MAX_MB_PLANE; ++i) { - struct macroblockd_plane *const pd = &xd->plane[i]; + struct macroblockd_plane *const pd = &planes[i]; setup_pred_plane(&pd->dst, buffers[i], strides[i], mi_row, mi_col, NULL, pd->subsampling_x, pd->subsampling_y); } diff --git a/libvpx/vp9/common/vp9_reconinter.h b/libvpx/vp9/common/vp9_reconinter.h index 86f315880..58c596ee8 100644 --- a/libvpx/vp9/common/vp9_reconinter.h +++ b/libvpx/vp9/common/vp9_reconinter.h @@ -57,7 +57,8 @@ static INLINE void setup_pred_plane(struct buf_2d *dst, dst->stride = stride; } -void vp9_setup_dst_planes(MACROBLOCKD *xd, const YV12_BUFFER_CONFIG *src, +void vp9_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE], + const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col); void vp9_setup_pre_planes(MACROBLOCKD *xd, int idx, diff --git a/libvpx/vp9/common/vp9_reconintra.c b/libvpx/vp9/common/vp9_reconintra.c index 44951b54d..403e10590 100644 --- a/libvpx/vp9/common/vp9_reconintra.c +++ b/libvpx/vp9/common/vp9_reconintra.c @@ -31,6 +31,9 @@ const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES] = { ADST_ADST, // TM }; +// This serves as a wrapper function, so that all the prediction functions +// can be unified and accessed as a pointer array. Note that the boundary +// above and left are not necessarily used all the time. 
#define intra_pred_sized(type, size) \ void vp9_##type##_predictor_##size##x##size##_c(uint8_t *dst, \ ptrdiff_t stride, \ @@ -48,7 +51,7 @@ const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES] = { static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r, c; - + (void) above; // first column for (r = 0; r < bs - 1; ++r) dst[r * stride] = ROUND_POWER_OF_TWO(left[r] + left[r + 1], 1); @@ -77,6 +80,7 @@ intra_pred_allsizes(d207) static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r, c; + (void) left; for (r = 0; r < bs; ++r) { for (c = 0; c < bs; ++c) dst[c] = r & 1 ? ROUND_POWER_OF_TWO(above[r/2 + c] + @@ -92,6 +96,7 @@ intra_pred_allsizes(d63) static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r, c; + (void) left; for (r = 0; r < bs; ++r) { for (c = 0; c < bs; ++c) dst[c] = r + c + 2 < bs * 2 ? 
ROUND_POWER_OF_TWO(above[r + c] + @@ -184,6 +189,7 @@ intra_pred_allsizes(d153) static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r; + (void) left; for (r = 0; r < bs; r++) { vpx_memcpy(dst, above, bs); @@ -195,6 +201,7 @@ intra_pred_allsizes(v) static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r; + (void) above; for (r = 0; r < bs; r++) { vpx_memset(dst, left[r], bs); @@ -219,6 +226,8 @@ intra_pred_allsizes(tm) static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r; + (void) above; + (void) left; for (r = 0; r < bs; r++) { vpx_memset(dst, 128, bs); @@ -231,6 +240,7 @@ static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int i, r, expected_dc, sum = 0; + (void) above; for (i = 0; i < bs; i++) sum += left[i]; @@ -246,6 +256,7 @@ intra_pred_allsizes(dc_left) static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int i, r, expected_dc, sum = 0; + (void) left; for (i = 0; i < bs; i++) sum += above[i]; @@ -311,7 +322,7 @@ static void init_intra_pred_fn_ptrs(void) { static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride, - MB_PREDICTION_MODE mode, TX_SIZE tx_size, + PREDICTION_MODE mode, TX_SIZE tx_size, int up_available, int left_available, int right_available, int x, int y, int plane) { @@ -434,7 +445,7 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, } void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in, - TX_SIZE tx_size, MB_PREDICTION_MODE mode, + TX_SIZE tx_size, PREDICTION_MODE mode, const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride, int aoff, int loff, int plane) { diff --git 
a/libvpx/vp9/common/vp9_reconintra.h b/libvpx/vp9/common/vp9_reconintra.h index abc176787..d09d2a129 100644 --- a/libvpx/vp9/common/vp9_reconintra.h +++ b/libvpx/vp9/common/vp9_reconintra.h @@ -19,7 +19,7 @@ extern "C" { #endif void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in, - TX_SIZE tx_size, MB_PREDICTION_MODE mode, + TX_SIZE tx_size, PREDICTION_MODE mode, const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride, int aoff, int loff, int plane); diff --git a/libvpx/vp9/common/vp9_rtcd_defs.pl b/libvpx/vp9/common/vp9_rtcd_defs.pl index b45559245..708f41b87 100644 --- a/libvpx/vp9/common/vp9_rtcd_defs.pl +++ b/libvpx/vp9/common/vp9_rtcd_defs.pl @@ -12,8 +12,7 @@ struct macroblockd; /* Encoder forward decls */ struct macroblock; struct vp9_variance_vtable; - -#define DEC_MVCOSTS int *mvjcost, int *mvcost[2] +struct search_site_config; struct mv; union int_mv; struct yv12_buffer_config; @@ -59,7 +58,8 @@ add_proto qw/void vp9_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, con specialize qw/vp9_d63_predictor_4x4/, "$ssse3_x86inc"; add_proto qw/void vp9_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_h_predictor_4x4 neon dspr2/, "$ssse3_x86inc"; +specialize qw/vp9_h_predictor_4x4 neon_asm dspr2/, "$ssse3_x86inc"; +$vp9_h_predictor_4x4_neon_asm=vp9_h_predictor_4x4_neon; add_proto qw/void vp9_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d117_predictor_4x4/; @@ -71,10 +71,12 @@ add_proto qw/void vp9_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, co specialize qw/vp9_d153_predictor_4x4/, "$ssse3_x86inc"; add_proto qw/void vp9_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_v_predictor_4x4 neon/, "$sse_x86inc"; +specialize qw/vp9_v_predictor_4x4 neon_asm/, "$sse_x86inc"; 
+$vp9_v_predictor_4x4_neon_asm=vp9_v_predictor_4x4_neon; add_proto qw/void vp9_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_tm_predictor_4x4 neon dspr2/, "$sse_x86inc"; +specialize qw/vp9_tm_predictor_4x4 neon_asm dspr2/, "$sse_x86inc"; +$vp9_tm_predictor_4x4_neon_asm=vp9_tm_predictor_4x4_neon; add_proto qw/void vp9_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_dc_predictor_4x4 dspr2/, "$sse_x86inc"; @@ -98,7 +100,8 @@ add_proto qw/void vp9_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, con specialize qw/vp9_d63_predictor_8x8/, "$ssse3_x86inc"; add_proto qw/void vp9_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_h_predictor_8x8 neon dspr2/, "$ssse3_x86inc"; +specialize qw/vp9_h_predictor_8x8 neon_asm dspr2/, "$ssse3_x86inc"; +$vp9_h_predictor_8x8_neon_asm=vp9_h_predictor_8x8_neon; add_proto qw/void vp9_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d117_predictor_8x8/; @@ -110,10 +113,12 @@ add_proto qw/void vp9_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, co specialize qw/vp9_d153_predictor_8x8/, "$ssse3_x86inc"; add_proto qw/void vp9_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_v_predictor_8x8 neon/, "$sse_x86inc"; +specialize qw/vp9_v_predictor_8x8 neon_asm/, "$sse_x86inc"; +$vp9_v_predictor_8x8_neon_asm=vp9_v_predictor_8x8_neon; add_proto qw/void vp9_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_tm_predictor_8x8 neon dspr2/, "$sse2_x86inc"; +specialize qw/vp9_tm_predictor_8x8 neon_asm dspr2/, "$sse2_x86inc"; +$vp9_tm_predictor_8x8_neon_asm=vp9_tm_predictor_8x8_neon; add_proto qw/void vp9_dc_predictor_8x8/, "uint8_t *dst, 
ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_dc_predictor_8x8 dspr2/, "$sse_x86inc"; @@ -137,7 +142,8 @@ add_proto qw/void vp9_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, c specialize qw/vp9_d63_predictor_16x16/, "$ssse3_x86inc"; add_proto qw/void vp9_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_h_predictor_16x16 neon dspr2/, "$ssse3_x86inc"; +specialize qw/vp9_h_predictor_16x16 neon_asm dspr2/, "$ssse3_x86inc"; +$vp9_h_predictor_16x16_neon_asm=vp9_h_predictor_16x16_neon; add_proto qw/void vp9_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d117_predictor_16x16/; @@ -149,10 +155,12 @@ add_proto qw/void vp9_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, specialize qw/vp9_d153_predictor_16x16/, "$ssse3_x86inc"; add_proto qw/void vp9_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_v_predictor_16x16 neon/, "$sse2_x86inc"; +specialize qw/vp9_v_predictor_16x16 neon_asm/, "$sse2_x86inc"; +$vp9_v_predictor_16x16_neon_asm=vp9_v_predictor_16x16_neon; add_proto qw/void vp9_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_tm_predictor_16x16 neon/, "$sse2_x86inc"; +specialize qw/vp9_tm_predictor_16x16 neon_asm/, "$sse2_x86inc"; +$vp9_tm_predictor_16x16_neon_asm=vp9_tm_predictor_16x16_neon; add_proto qw/void vp9_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_dc_predictor_16x16 dspr2/, "$sse2_x86inc"; @@ -176,7 +184,8 @@ add_proto qw/void vp9_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, c specialize qw/vp9_d63_predictor_32x32/, "$ssse3_x86inc"; add_proto qw/void vp9_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t 
*left"; -specialize qw/vp9_h_predictor_32x32 neon/, "$ssse3_x86inc"; +specialize qw/vp9_h_predictor_32x32 neon_asm/, "$ssse3_x86inc"; +$vp9_h_predictor_32x32_neon_asm=vp9_h_predictor_32x32_neon; add_proto qw/void vp9_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d117_predictor_32x32/; @@ -188,10 +197,12 @@ add_proto qw/void vp9_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, specialize qw/vp9_d153_predictor_32x32/; add_proto qw/void vp9_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_v_predictor_32x32 neon/, "$sse2_x86inc"; +specialize qw/vp9_v_predictor_32x32 neon_asm/, "$sse2_x86inc"; +$vp9_v_predictor_32x32_neon_asm=vp9_v_predictor_32x32_neon; add_proto qw/void vp9_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_tm_predictor_32x32 neon/, "$sse2_x86_64"; +specialize qw/vp9_tm_predictor_32x32 neon_asm/, "$sse2_x86_64"; +$vp9_tm_predictor_32x32_neon_asm=vp9_tm_predictor_32x32_neon; add_proto qw/void vp9_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_dc_predictor_32x32/, "$sse2_x86inc"; @@ -209,37 +220,48 @@ specialize qw/vp9_dc_128_predictor_32x32/; # Loopfilter # add_proto qw/void vp9_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/vp9_lpf_vertical_16 sse2 neon dspr2/; +specialize qw/vp9_lpf_vertical_16 sse2 neon_asm dspr2/; +$vp9_lpf_vertical_16_neon_asm=vp9_lpf_vertical_16_neon; add_proto qw/void vp9_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/vp9_lpf_vertical_16_dual sse2 neon dspr2/; +specialize qw/vp9_lpf_vertical_16_dual sse2 neon_asm dspr2/; 
+$vp9_lpf_vertical_16_dual_neon_asm=vp9_lpf_vertical_16_dual_neon; add_proto qw/void vp9_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; -specialize qw/vp9_lpf_vertical_8 sse2 neon dspr2/; +specialize qw/vp9_lpf_vertical_8 sse2 neon_asm dspr2/; +$vp9_lpf_vertical_8_neon_asm=vp9_lpf_vertical_8_neon; add_proto qw/void vp9_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/vp9_lpf_vertical_8_dual sse2 neon dspr2/; +specialize qw/vp9_lpf_vertical_8_dual sse2 neon_asm dspr2/; +$vp9_lpf_vertical_8_dual_neon_asm=vp9_lpf_vertical_8_dual_neon; add_proto qw/void vp9_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; -specialize qw/vp9_lpf_vertical_4 mmx neon dspr2/; +specialize qw/vp9_lpf_vertical_4 mmx neon_asm dspr2/; +$vp9_lpf_vertical_4_neon_asm=vp9_lpf_vertical_4_neon; add_proto qw/void vp9_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/vp9_lpf_vertical_4_dual sse2 neon dspr2/; +specialize qw/vp9_lpf_vertical_4_dual sse2 neon_asm dspr2/; +$vp9_lpf_vertical_4_dual_neon_asm=vp9_lpf_vertical_4_dual_neon; add_proto qw/void vp9_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; -specialize qw/vp9_lpf_horizontal_16 sse2 avx2 neon dspr2/; +specialize qw/vp9_lpf_horizontal_16 sse2 avx2 neon_asm dspr2/; +$vp9_lpf_horizontal_16_neon_asm=vp9_lpf_horizontal_16_neon; add_proto qw/void vp9_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; -specialize qw/vp9_lpf_horizontal_8 sse2 neon dspr2/; 
+specialize qw/vp9_lpf_horizontal_8 sse2 neon_asm dspr2/; +$vp9_lpf_horizontal_8_neon_asm=vp9_lpf_horizontal_8_neon; add_proto qw/void vp9_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/vp9_lpf_horizontal_8_dual sse2 neon dspr2/; +specialize qw/vp9_lpf_horizontal_8_dual sse2 neon_asm dspr2/; +$vp9_lpf_horizontal_8_dual_neon_asm=vp9_lpf_horizontal_8_dual_neon; add_proto qw/void vp9_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; -specialize qw/vp9_lpf_horizontal_4 mmx neon dspr2/; +specialize qw/vp9_lpf_horizontal_4 mmx neon_asm dspr2/; +$vp9_lpf_horizontal_4_neon_asm=vp9_lpf_horizontal_4_neon; add_proto qw/void vp9_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/vp9_lpf_horizontal_4_dual sse2 neon dspr2/; +specialize qw/vp9_lpf_horizontal_4_dual sse2 neon_asm dspr2/; +$vp9_lpf_horizontal_4_dual_neon_asm=vp9_lpf_horizontal_4_dual_neon; # # post proc @@ -275,71 +297,91 @@ specialize qw/vp9_blend_b/; # Sub Pixel Filters # add_proto qw/void vp9_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve_copy neon dspr2/, "$sse2_x86inc"; +specialize qw/vp9_convolve_copy neon_asm dspr2/, "$sse2_x86inc"; +$vp9_convolve_copy_neon_asm=vp9_convolve_copy_neon; add_proto qw/void vp9_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve_avg neon dspr2/, "$sse2_x86inc"; 
+specialize qw/vp9_convolve_avg neon_asm dspr2/, "$sse2_x86inc"; +$vp9_convolve_avg_neon_asm=vp9_convolve_avg_neon; add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve8 sse2 ssse3 avx2 neon dspr2/; +specialize qw/vp9_convolve8 sse2 ssse3 neon_asm dspr2/; +$vp9_convolve8_neon_asm=vp9_convolve8_neon; add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve8_horiz sse2 ssse3 avx2 neon dspr2/; +specialize qw/vp9_convolve8_horiz sse2 ssse3 neon_asm dspr2/; +$vp9_convolve8_horiz_neon_asm=vp9_convolve8_horiz_neon; add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve8_vert sse2 ssse3 avx2 neon dspr2/; +specialize qw/vp9_convolve8_vert sse2 ssse3 neon_asm dspr2/; +$vp9_convolve8_vert_neon_asm=vp9_convolve8_vert_neon; add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve8_avg sse2 ssse3 neon dspr2/; +specialize qw/vp9_convolve8_avg sse2 ssse3 neon_asm dspr2/; +$vp9_convolve8_avg_neon_asm=vp9_convolve8_avg_neon; add_proto qw/void vp9_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve8_avg_horiz sse2 ssse3 neon dspr2/; +specialize qw/vp9_convolve8_avg_horiz sse2 ssse3 neon_asm 
dspr2/; +$vp9_convolve8_avg_horiz_neon_asm=vp9_convolve8_avg_horiz_neon; add_proto qw/void vp9_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve8_avg_vert sse2 ssse3 neon dspr2/; +specialize qw/vp9_convolve8_avg_vert sse2 ssse3 neon_asm dspr2/; +$vp9_convolve8_avg_vert_neon_asm=vp9_convolve8_avg_vert_neon; # # dct # add_proto qw/void vp9_idct4x4_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; -specialize qw/vp9_idct4x4_1_add sse2 neon dspr2/; +specialize qw/vp9_idct4x4_1_add sse2 neon_asm dspr2/; +$vp9_idct4x4_1_add_neon_asm=vp9_idct4x4_1_add_neon; add_proto qw/void vp9_idct4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; -specialize qw/vp9_idct4x4_16_add sse2 neon dspr2/; +specialize qw/vp9_idct4x4_16_add sse2 neon_asm dspr2/; +$vp9_idct4x4_16_add_neon_asm=vp9_idct4x4_16_add_neon; add_proto qw/void vp9_idct8x8_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; -specialize qw/vp9_idct8x8_1_add sse2 neon dspr2/; +specialize qw/vp9_idct8x8_1_add sse2 neon_asm dspr2/; +$vp9_idct8x8_1_add_neon_asm=vp9_idct8x8_1_add_neon; add_proto qw/void vp9_idct8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; -specialize qw/vp9_idct8x8_64_add sse2 neon dspr2/; +specialize qw/vp9_idct8x8_64_add sse2 neon_asm dspr2/, "$ssse3_x86_64"; +$vp9_idct8x8_64_add_neon_asm=vp9_idct8x8_64_add_neon; -add_proto qw/void vp9_idct8x8_10_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; -specialize qw/vp9_idct8x8_10_add sse2 neon dspr2/; +add_proto qw/void vp9_idct8x8_12_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; +specialize qw/vp9_idct8x8_12_add sse2 neon_asm dspr2/, "$ssse3_x86_64"; +$vp9_idct8x8_12_add_neon_asm=vp9_idct8x8_12_add_neon; add_proto qw/void vp9_idct16x16_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; 
-specialize qw/vp9_idct16x16_1_add sse2 neon dspr2/; +specialize qw/vp9_idct16x16_1_add sse2 neon_asm dspr2/; +$vp9_idct16x16_1_add_neon_asm=vp9_idct16x16_1_add_neon; add_proto qw/void vp9_idct16x16_256_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; -specialize qw/vp9_idct16x16_256_add sse2 neon dspr2/; +specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon_asm dspr2/; +$vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon; add_proto qw/void vp9_idct16x16_10_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; -specialize qw/vp9_idct16x16_10_add sse2 neon dspr2/; +specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon_asm dspr2/; +$vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon; add_proto qw/void vp9_idct32x32_1024_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; -specialize qw/vp9_idct32x32_1024_add sse2 neon dspr2/; +specialize qw/vp9_idct32x32_1024_add sse2 neon_asm dspr2/; +$vp9_idct32x32_1024_add_neon_asm=vp9_idct32x32_1024_add_neon; add_proto qw/void vp9_idct32x32_34_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; -specialize qw/vp9_idct32x32_34_add sse2 neon dspr2/; -$vp9_idct32x32_34_add_neon=vp9_idct32x32_1024_add_neon; +specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2/; +$vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon; add_proto qw/void vp9_idct32x32_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; -specialize qw/vp9_idct32x32_1_add sse2 neon dspr2/; +specialize qw/vp9_idct32x32_1_add sse2 neon_asm dspr2/; +$vp9_idct32x32_1_add_neon_asm=vp9_idct32x32_1_add_neon; add_proto qw/void vp9_iht4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"; -specialize qw/vp9_iht4x4_16_add sse2 neon dspr2/; +specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/; +$vp9_iht4x4_16_add_neon_asm=vp9_iht4x4_16_add_neon; add_proto qw/void vp9_iht8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"; -specialize qw/vp9_iht8x8_64_add sse2 
neon dspr2/; +specialize qw/vp9_iht8x8_64_add sse2 neon_asm dspr2/; +$vp9_iht8x8_64_add_neon_asm=vp9_iht8x8_64_add_neon; add_proto qw/void vp9_iht16x16_256_add/, "const int16_t *input, uint8_t *output, int pitch, int tx_type"; specialize qw/vp9_iht16x16_256_add sse2 dspr2/; @@ -360,29 +402,25 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { # variance add_proto qw/unsigned int vp9_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance32x16/, "$sse2_x86inc", "$avx2_x86inc"; +specialize qw/vp9_variance32x16 avx2/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_variance16x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance64x32/, "$sse2_x86inc", "$avx2_x86inc"; +specialize qw/vp9_variance64x32 avx2/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_variance32x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance32x32/, "$sse2_x86inc", "$avx2_x86inc"; +specialize qw/vp9_variance32x32 avx2 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance64x64/, "$sse2_x86inc", "$avx2_x86inc"; +specialize qw/vp9_variance64x64 avx2/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int 
*sse"; -specialize qw/vp9_variance16x16 mmx/, "$sse2_x86inc", "$avx2_x86inc"; - -add_proto qw/void vp9_get_sse_sum_16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; -specialize qw/vp9_get_sse_sum_16x16 sse2/; -$vp9_get_sse_sum_16x16_sse2=vp9_get16x16var_sse2; +specialize qw/vp9_variance16x16 mmx avx2 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_variance16x8 mmx/, "$sse2_x86inc"; @@ -391,11 +429,13 @@ add_proto qw/unsigned int vp9_variance8x16/, "const uint8_t *src_ptr, int source specialize qw/vp9_variance8x16 mmx/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance8x8 mmx/, "$sse2_x86inc"; +specialize qw/vp9_variance8x8 mmx neon/, "$sse2_x86inc"; + +add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; +specialize qw/vp9_get8x8var mmx neon/, "$sse2_x86inc"; -add_proto qw/void vp9_get_sse_sum_8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; -specialize qw/vp9_get_sse_sum_8x8 sse2/; -$vp9_get_sse_sum_8x8_sse2=vp9_get8x8var_sse2; +add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; +specialize qw/vp9_get16x16var avx2 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_variance8x4/, "$sse2_x86inc"; @@ -437,13 +477,13 @@ add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x32/, "const uint8_t *src_ specialize 
qw/vp9_sub_pixel_avg_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_variance32x32 avx2 neon/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; specialize qw/vp9_sub_pixel_avg_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_variance16x16 neon/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; specialize qw/vp9_sub_pixel_avg_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc"; @@ -461,7 +501,7 @@ add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x8/, "const uint8_t *src_p specialize qw/vp9_sub_pixel_avg_variance16x8/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_variance8x8 neon/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int 
source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; specialize qw/vp9_sub_pixel_avg_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc"; @@ -486,111 +526,84 @@ specialize qw/vp9_sub_pixel_variance4x4/, "$sse_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc"; -add_proto qw/unsigned int vp9_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; -specialize qw/vp9_sad64x64/, "$sse2_x86inc"; +add_proto qw/unsigned int vp9_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vp9_sad64x64 neon/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vp9_sad32x64/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vp9_sad64x32/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vp9_sad32x16/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad16x32/, "const uint8_t 
*src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vp9_sad16x32/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; -specialize qw/vp9_sad32x32/, "$sse2_x86inc"; +add_proto qw/unsigned int vp9_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vp9_sad32x32 neon/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; -specialize qw/vp9_sad16x16 mmx/, "$sse2_x86inc"; +add_proto qw/unsigned int vp9_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vp9_sad16x16 mmx neon/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vp9_sad16x8 mmx/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vp9_sad8x16 mmx/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; -specialize qw/vp9_sad8x8 mmx/, "$sse2_x86inc"; +add_proto qw/unsigned int vp9_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize 
qw/vp9_sad8x8 mmx neon/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vp9_sad8x4/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vp9_sad4x8/, "$sse_x86inc"; -add_proto qw/unsigned int vp9_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vp9_sad4x4 mmx/, "$sse_x86inc"; -add_proto qw/unsigned int vp9_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vp9_sad64x64_avg/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vp9_sad32x64_avg/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad64x32_avg/, "const 
uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vp9_sad64x32_avg/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vp9_sad32x16_avg/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vp9_sad16x32_avg/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vp9_sad32x32_avg/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vp9_sad16x16_avg/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t 
*ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vp9_sad16x8_avg/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vp9_sad8x16_avg/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vp9_sad8x8_avg/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vp9_sad8x4_avg/, "$sse2_x86inc"; -add_proto qw/unsigned int vp9_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vp9_sad4x8_avg/, "$sse_x86inc"; -add_proto qw/unsigned int vp9_sad4x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +add_proto qw/unsigned int vp9_sad4x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize 
qw/vp9_sad4x4_avg/, "$sse_x86inc"; -add_proto qw/unsigned int vp9_variance_halfpixvar16x16_h/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance_halfpixvar16x16_h/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance_halfpixvar16x16_v/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance_halfpixvar16x16_v/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance_halfpixvar16x16_hv/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance_halfpixvar16x16_hv/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance_halfpixvar64x64_h/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance_halfpixvar64x64_h/; - -add_proto qw/unsigned int vp9_variance_halfpixvar64x64_v/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance_halfpixvar64x64_v/; - -add_proto qw/unsigned int vp9_variance_halfpixvar64x64_hv/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance_halfpixvar64x64_hv/; - -add_proto qw/unsigned int vp9_variance_halfpixvar32x32_h/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance_halfpixvar32x32_h/; - -add_proto qw/unsigned int vp9_variance_halfpixvar32x32_v/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance_halfpixvar32x32_v/; - -add_proto qw/unsigned int vp9_variance_halfpixvar32x32_hv/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize 
qw/vp9_variance_halfpixvar32x32_hv/; - add_proto qw/void vp9_sad64x64x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; specialize qw/vp9_sad64x64x3/; @@ -679,11 +692,8 @@ specialize qw/vp9_sad4x8x4d sse/; add_proto qw/void vp9_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; specialize qw/vp9_sad4x4x4d sse/; -#add_proto qw/unsigned int vp9_sub_pixel_mse16x16/, "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"; -#specialize qw/vp9_sub_pixel_mse16x16 sse2 mmx/; - add_proto qw/unsigned int vp9_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; -specialize qw/vp9_mse16x16 mmx/, "$sse2_x86inc", "$avx2_x86inc"; +specialize qw/vp9_mse16x16 mmx avx2/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; specialize qw/vp9_mse8x16/; @@ -694,21 +704,21 @@ specialize qw/vp9_mse16x8/; add_proto qw/unsigned int vp9_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; specialize qw/vp9_mse8x8/; -add_proto qw/unsigned int vp9_sub_pixel_mse64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_mse64x64/; - -add_proto qw/unsigned int vp9_sub_pixel_mse32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_mse32x32/; - add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *"; specialize qw/vp9_get_mb_ss mmx sse2/; # ENCODEMB INVOKE add_proto qw/int64_t vp9_block_error/, "const int16_t *coeff, 
const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz"; -specialize qw/vp9_block_error/, "$sse2_x86inc"; +specialize qw/vp9_block_error avx2/, "$sse2_x86inc"; add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; -specialize qw/vp9_subtract_block/, "$sse2_x86inc"; +specialize qw/vp9_subtract_block neon/, "$sse2_x86inc"; + +add_proto qw/void vp9_quantize_fp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; +specialize qw/vp9_quantize_fp neon/, "$ssse3_x86_64"; + +add_proto qw/void vp9_quantize_fp_32x32/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; +specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64"; add_proto qw/void vp9_quantize_b/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_quantize_b/, "$ssse3_x86_64"; @@ -729,25 +739,37 @@ if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") { # fdct functions add_proto qw/void vp9_fht4x4/, "const int16_t *input, int16_t *output, int stride, int tx_type"; -specialize qw/vp9_fht4x4 sse2 avx2/; +specialize qw/vp9_fht4x4 sse2/; add_proto qw/void 
vp9_fht8x8/, "const int16_t *input, int16_t *output, int stride, int tx_type"; -specialize qw/vp9_fht8x8 sse2 avx2/; +specialize qw/vp9_fht8x8 sse2/; add_proto qw/void vp9_fht16x16/, "const int16_t *input, int16_t *output, int stride, int tx_type"; -specialize qw/vp9_fht16x16 sse2 avx2/; +specialize qw/vp9_fht16x16 sse2/; add_proto qw/void vp9_fwht4x4/, "const int16_t *input, int16_t *output, int stride"; -specialize qw/vp9_fwht4x4/; +specialize qw/vp9_fwht4x4/, "$mmx_x86inc"; + +add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, int16_t *output, int stride"; +specialize qw/vp9_fdct4x4_1 sse2/; add_proto qw/void vp9_fdct4x4/, "const int16_t *input, int16_t *output, int stride"; -specialize qw/vp9_fdct4x4 sse2 avx2/; +specialize qw/vp9_fdct4x4 sse2/; + +add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, int16_t *output, int stride"; +specialize qw/vp9_fdct8x8_1 sse2 neon/; add_proto qw/void vp9_fdct8x8/, "const int16_t *input, int16_t *output, int stride"; -specialize qw/vp9_fdct8x8 sse2 avx2/; +specialize qw/vp9_fdct8x8 sse2 neon/, "$ssse3_x86_64"; + +add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, int16_t *output, int stride"; +specialize qw/vp9_fdct16x16_1 sse2/; add_proto qw/void vp9_fdct16x16/, "const int16_t *input, int16_t *output, int stride"; -specialize qw/vp9_fdct16x16 sse2 avx2/; +specialize qw/vp9_fdct16x16 sse2/; + +add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, int16_t *output, int stride"; +specialize qw/vp9_fdct32x32_1 sse2/; add_proto qw/void vp9_fdct32x32/, "const int16_t *input, int16_t *output, int stride"; specialize qw/vp9_fdct32x32 sse2 avx2/; @@ -758,23 +780,21 @@ specialize qw/vp9_fdct32x32_rd sse2 avx2/; # # Motion search # -add_proto qw/int vp9_full_search_sad/, "const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv, struct mv *best_mv"; +add_proto qw/int vp9_full_search_sad/, "const struct 
macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv"; specialize qw/vp9_full_search_sad sse3 sse4_1/; $vp9_full_search_sad_sse3=vp9_full_search_sadx3; $vp9_full_search_sad_sse4_1=vp9_full_search_sadx8; -add_proto qw/int vp9_refining_search_sad/, "const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv"; -specialize qw/vp9_refining_search_sad sse3/; -$vp9_refining_search_sad_sse3=vp9_refining_search_sadx4; +add_proto qw/int vp9_refining_search_sad/, "const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv"; +specialize qw/vp9_refining_search_sad/; -add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv"; -specialize qw/vp9_diamond_search_sad sse3/; -$vp9_diamond_search_sad_sse3=vp9_diamond_search_sadx4; +add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv"; +specialize qw/vp9_diamond_search_sad/; -add_proto qw/int vp9_full_range_search/, "const struct macroblock *x, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv"; +add_proto qw/int vp9_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv 
*center_mv"; specialize qw/vp9_full_range_search/; -add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"; +add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"; specialize qw/vp9_temporal_filter_apply sse2/; } diff --git a/libvpx/vp9/common/vp9_scale.c b/libvpx/vp9/common/vp9_scale.c index d3405fcdb..2f58323aa 100644 --- a/libvpx/vp9/common/vp9_scale.c +++ b/libvpx/vp9/common/vp9_scale.c @@ -33,14 +33,6 @@ static int get_fixed_point_scale_factor(int other_size, int this_size) { return (other_size << REF_SCALE_SHIFT) / this_size; } -static int check_scale_factors(int other_w, int other_h, - int this_w, int this_h) { - return 2 * this_w >= other_w && - 2 * this_h >= other_h && - this_w <= 16 * other_w && - this_h <= 16 * other_h; -} - MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf) { const int x_off_q4 = scaled_x(x << SUBPEL_BITS, sf) & SUBPEL_MASK; const int y_off_q4 = scaled_y(y << SUBPEL_BITS, sf) & SUBPEL_MASK; @@ -54,7 +46,7 @@ MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf) { void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, int other_h, int this_w, int this_h) { - if (!check_scale_factors(other_w, other_h, this_w, this_h)) { + if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) { sf->x_scale_fp = REF_INVALID_SCALE; sf->y_scale_fp = REF_INVALID_SCALE; return; diff --git a/libvpx/vp9/common/vp9_scale.h b/libvpx/vp9/common/vp9_scale.h index a9dda1889..ad6f5d702 100644 --- a/libvpx/vp9/common/vp9_scale.h +++ b/libvpx/vp9/common/vp9_scale.h @@ -46,8 +46,16 @@ static INLINE int vp9_is_valid_scale(const struct scale_factors *sf) { } static INLINE 
int vp9_is_scaled(const struct scale_factors *sf) { - return sf->x_scale_fp != REF_NO_SCALE || - sf->y_scale_fp != REF_NO_SCALE; + return vp9_is_valid_scale(sf) && + (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE); +} + +static INLINE int valid_ref_frame_size(int ref_width, int ref_height, + int this_width, int this_height) { + return 2 * this_width >= ref_width && + 2 * this_height >= ref_height && + this_width <= 16 * ref_width && + this_height <= 16 * ref_height; } #ifdef __cplusplus diff --git a/libvpx/vp9/common/vp9_tapify.py b/libvpx/vp9/common/vp9_tapify.py deleted file mode 100644 index 99529cff0..000000000 --- a/libvpx/vp9/common/vp9_tapify.py +++ /dev/null @@ -1,106 +0,0 @@ -""" - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
-""" -#!/usr/bin/env python -import sys,string,os,re,math,numpy -scale = 2**16 -def dist(p1,p2): - x1,y1 = p1 - x2,y2 = p2 - if x1==x2 and y1==y2 : - return 1.0 - return 1/ math.sqrt((x1-x2)*(x1-x2)+(y1-y2)*(y1-y2)) - -def gettaps(p): - def l(b): - return int(math.floor(b)) - def h(b): - return int(math.ceil(b)) - def t(b,p,s): - return int((scale*dist(b,p)+s/2)/s) - r,c = p - ul=[l(r),l(c)] - ur=[l(r),h(c)] - ll=[h(r),l(c)] - lr=[h(r),h(c)] - sum = dist(ul,p)+dist(ur,p)+dist(ll,p)+dist(lr,p) - t4 = scale - t(ul,p,sum) - t(ur,p,sum) - t(ll,p,sum); - return [[ul,t(ul,p,sum)],[ur,t(ur,p,sum)], - [ll,t(ll,p,sum)],[lr,t4]] - -def print_mb_taps(angle,blocksize): - theta = angle / 57.2957795; - affine = [[math.cos(theta),-math.sin(theta)], - [math.sin(theta),math.cos(theta)]] - radius = (float(blocksize)-1)/2 - print " // angle of",angle,"degrees" - for y in range(blocksize) : - for x in range(blocksize) : - r,c = numpy.dot(affine,[y-radius, x-radius]) - tps = gettaps([r+radius,c+radius]) - for t in tps : - p,t = t - tr,tc = p - print " %2d, %2d, %5d, " % (tr,tc,t,), - print " // %2d,%2d " % (y,x) - -i=float(sys.argv[1]) -while i <= float(sys.argv[2]) : - print_mb_taps(i,float(sys.argv[4])) - i=i+float(sys.argv[3]) -""" - -taps = [] -pt=dict() -ptr=dict() -for y in range(16) : - for x in range(16) : - r,c = numpy.dot(affine,[y-7.5, x-7.5]) - tps = gettaps([r+7.5,c+7.5]) - j=0 - for tp in tps : - p,i = tp - r,c = p - pt[y,x,j]= [p,i] - try: - ptr[r,j,c].append([y,x]) - except: - ptr[r,j,c]=[[y,x]] - j = j+1 - -for key in sorted(pt.keys()) : - print key,pt[key] - -lr = -99 -lj = -99 -lc = 0 - -shuf="" -mask="" -for r,j,c in sorted(ptr.keys()) : - for y,x in ptr[r,j,c] : - if lr != r or lj != j : - print "shuf_"+str(lr)+"_"+str(lj)+"_"+shuf.ljust(16,"0"), lc - shuf="" - lc = 0 - for i in range(lc,c-1) : - shuf = shuf +"0" - shuf = shuf + hex(x)[2] - lc =c - break - lr = r - lj = j -# print r,j,c,ptr[r,j,c] -# print - -for r,j,c in sorted(ptr.keys()) : - for y,x in 
ptr[r,j,c] : - print r,j,c,y,x - break -""" diff --git a/libvpx/vp9/common/vp9_thread.c b/libvpx/vp9/common/vp9_thread.c new file mode 100644 index 000000000..1c6aec032 --- /dev/null +++ b/libvpx/vp9/common/vp9_thread.c @@ -0,0 +1,184 @@ +// Copyright 2013 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// Multi-threaded worker +// +// Original source: +// http://git.chromium.org/webm/libwebp.git +// 100644 blob 264210ba2807e4da47eb5d18c04cf869d89b9784 src/utils/thread.c + +#include <assert.h> +#include <string.h> // for memset() +#include "./vp9_thread.h" +#include "vpx_mem/vpx_mem.h" + +#if CONFIG_MULTITHREAD + +struct VP9WorkerImpl { + pthread_mutex_t mutex_; + pthread_cond_t condition_; + pthread_t thread_; +}; + +//------------------------------------------------------------------------------ + +static void execute(VP9Worker *const worker); // Forward declaration. 
+ +static THREADFN thread_loop(void *ptr) { + VP9Worker *const worker = (VP9Worker*)ptr; + int done = 0; + while (!done) { + pthread_mutex_lock(&worker->impl_->mutex_); + while (worker->status_ == OK) { // wait in idling mode + pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_); + } + if (worker->status_ == WORK) { + execute(worker); + worker->status_ = OK; + } else if (worker->status_ == NOT_OK) { // finish the worker + done = 1; + } + // signal to the main thread that we're done (for sync()) + pthread_cond_signal(&worker->impl_->condition_); + pthread_mutex_unlock(&worker->impl_->mutex_); + } + return THREAD_RETURN(NULL); // Thread is finished +} + +// main thread state control +static void change_state(VP9Worker *const worker, + VP9WorkerStatus new_status) { + // No-op when attempting to change state on a thread that didn't come up. + // Checking status_ without acquiring the lock first would result in a data + // race. + if (worker->impl_ == NULL) return; + + pthread_mutex_lock(&worker->impl_->mutex_); + if (worker->status_ >= OK) { + // wait for the worker to finish + while (worker->status_ != OK) { + pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_); + } + // assign new status and release the working thread if needed + if (new_status != OK) { + worker->status_ = new_status; + pthread_cond_signal(&worker->impl_->condition_); + } + } + pthread_mutex_unlock(&worker->impl_->mutex_); +} + +#endif // CONFIG_MULTITHREAD + +//------------------------------------------------------------------------------ + +static void init(VP9Worker *const worker) { + memset(worker, 0, sizeof(*worker)); + worker->status_ = NOT_OK; +} + +static int sync(VP9Worker *const worker) { +#if CONFIG_MULTITHREAD + change_state(worker, OK); +#endif + assert(worker->status_ <= OK); + return !worker->had_error; +} + +static int reset(VP9Worker *const worker) { + int ok = 1; + worker->had_error = 0; + if (worker->status_ < OK) { +#if CONFIG_MULTITHREAD + 
worker->impl_ = (VP9WorkerImpl*)vpx_calloc(1, sizeof(*worker->impl_)); + if (worker->impl_ == NULL) { + return 0; + } + if (pthread_mutex_init(&worker->impl_->mutex_, NULL)) { + goto Error; + } + if (pthread_cond_init(&worker->impl_->condition_, NULL)) { + pthread_mutex_destroy(&worker->impl_->mutex_); + goto Error; + } + pthread_mutex_lock(&worker->impl_->mutex_); + ok = !pthread_create(&worker->impl_->thread_, NULL, thread_loop, worker); + if (ok) worker->status_ = OK; + pthread_mutex_unlock(&worker->impl_->mutex_); + if (!ok) { + pthread_mutex_destroy(&worker->impl_->mutex_); + pthread_cond_destroy(&worker->impl_->condition_); + Error: + vpx_free(worker->impl_); + worker->impl_ = NULL; + return 0; + } +#else + worker->status_ = OK; +#endif + } else if (worker->status_ > OK) { + ok = sync(worker); + } + assert(!ok || (worker->status_ == OK)); + return ok; +} + +static void execute(VP9Worker *const worker) { + if (worker->hook != NULL) { + worker->had_error |= !worker->hook(worker->data1, worker->data2); + } +} + +static void launch(VP9Worker *const worker) { +#if CONFIG_MULTITHREAD + change_state(worker, WORK); +#else + execute(worker); +#endif +} + +static void end(VP9Worker *const worker) { +#if CONFIG_MULTITHREAD + if (worker->impl_ != NULL) { + change_state(worker, NOT_OK); + pthread_join(worker->impl_->thread_, NULL); + pthread_mutex_destroy(&worker->impl_->mutex_); + pthread_cond_destroy(&worker->impl_->condition_); + vpx_free(worker->impl_); + worker->impl_ = NULL; + } +#else + worker->status_ = NOT_OK; + assert(worker->impl_ == NULL); +#endif + assert(worker->status_ == NOT_OK); +} + +//------------------------------------------------------------------------------ + +static VP9WorkerInterface g_worker_interface = { + init, reset, sync, launch, execute, end +}; + +int vp9_set_worker_interface(const VP9WorkerInterface* const winterface) { + if (winterface == NULL || + winterface->init == NULL || winterface->reset == NULL || + winterface->sync == NULL || 
winterface->launch == NULL || + winterface->execute == NULL || winterface->end == NULL) { + return 0; + } + g_worker_interface = *winterface; + return 1; +} + +const VP9WorkerInterface *vp9_get_worker_interface(void) { + return &g_worker_interface; +} + +//------------------------------------------------------------------------------ diff --git a/libvpx/vp9/decoder/vp9_thread.h b/libvpx/vp9/common/vp9_thread.h index 2f8728dcf..864579c03 100644 --- a/libvpx/vp9/decoder/vp9_thread.h +++ b/libvpx/vp9/common/vp9_thread.h @@ -11,8 +11,7 @@ // // Original source: // http://git.chromium.org/webm/libwebp.git -// 100644 blob 13a61a4c84194c3374080cbf03d881d3cd6af40d src/utils/thread.h - +// 100644 blob 7bd451b124ae3b81596abfbcc823e3cb129d3a38 src/utils/thread.h #ifndef VP9_DECODER_VP9_THREAD_H_ #define VP9_DECODER_VP9_THREAD_H_ @@ -163,40 +162,53 @@ typedef enum { // arguments (data1 and data2), and should return false in case of error. typedef int (*VP9WorkerHook)(void*, void*); -// Synchronize object used to launch job in the worker thread +// Platform-dependent implementation details for the worker. +typedef struct VP9WorkerImpl VP9WorkerImpl; + +// Synchronization object used to launch job in the worker thread typedef struct { -#if CONFIG_MULTITHREAD - pthread_mutex_t mutex_; - pthread_cond_t condition_; - pthread_t thread_; -#endif + VP9WorkerImpl *impl_; VP9WorkerStatus status_; VP9WorkerHook hook; // hook to call - void* data1; // first argument passed to 'hook' - void* data2; // second argument passed to 'hook' + void *data1; // first argument passed to 'hook' + void *data2; // second argument passed to 'hook' int had_error; // return value of the last call to 'hook' } VP9Worker; -// Must be called first, before any other method. -void vp9_worker_init(VP9Worker* const worker); -// Must be called to initialize the object and spawn the thread. Re-entrant. -// Will potentially launch the thread. Returns false in case of error. 
-int vp9_worker_reset(VP9Worker* const worker); -// Makes sure the previous work is finished. Returns true if worker->had_error -// was not set and no error condition was triggered by the working thread. -int vp9_worker_sync(VP9Worker* const worker); -// Triggers the thread to call hook() with data1 and data2 argument. These -// hook/data1/data2 can be changed at any time before calling this function, -// but not be changed afterward until the next call to vp9_worker_sync(). -void vp9_worker_launch(VP9Worker* const worker); -// This function is similar to vp9_worker_launch() except that it calls the -// hook directly instead of using a thread. Convenient to bypass the thread -// mechanism while still using the VP9Worker structs. vp9_worker_sync() must -// still be called afterward (for error reporting). -void vp9_worker_execute(VP9Worker* const worker); -// Kill the thread and terminate the object. To use the object again, one -// must call vp9_worker_reset() again. -void vp9_worker_end(VP9Worker* const worker); +// The interface for all thread-worker related functions. All these functions +// must be implemented. +typedef struct { + // Must be called first, before any other method. + void (*init)(VP9Worker *const worker); + // Must be called to initialize the object and spawn the thread. Re-entrant. + // Will potentially launch the thread. Returns false in case of error. + int (*reset)(VP9Worker *const worker); + // Makes sure the previous work is finished. Returns true if worker->had_error + // was not set and no error condition was triggered by the working thread. + int (*sync)(VP9Worker *const worker); + // Triggers the thread to call hook() with data1 and data2 arguments. These + // hook/data1/data2 values can be changed at any time before calling this + // function, but not be changed afterward until the next call to Sync(). 
+ void (*launch)(VP9Worker *const worker); + // This function is similar to launch() except that it calls the + // hook directly instead of using a thread. Convenient to bypass the thread + // mechanism while still using the VP9Worker structs. sync() must + // still be called afterward (for error reporting). + void (*execute)(VP9Worker *const worker); + // Kill the thread and terminate the object. To use the object again, one + // must call reset() again. + void (*end)(VP9Worker *const worker); +} VP9WorkerInterface; + +// Install a new set of threading functions, overriding the defaults. This +// should be done before any workers are started, i.e., before any encoding or +// decoding takes place. The contents of the interface struct are copied, it +// is safe to free the corresponding memory after this call. This function is +// not thread-safe. Return false in case of invalid pointer or methods. +int vp9_set_worker_interface(const VP9WorkerInterface *const winterface); + +// Retrieve the currently set thread worker interface. 
+const VP9WorkerInterface *vp9_get_worker_interface(void); //------------------------------------------------------------------------------ diff --git a/libvpx/vp9/common/vp9_tile_common.c b/libvpx/vp9/common/vp9_tile_common.c index 78909dd9b..8c4a30353 100644 --- a/libvpx/vp9/common/vp9_tile_common.c +++ b/libvpx/vp9/common/vp9_tile_common.c @@ -21,13 +21,21 @@ static int get_tile_offset(int idx, int mis, int log2) { return MIN(offset, mis); } -void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm, int row, int col) { +void vp9_tile_set_row(TileInfo *tile, const VP9_COMMON *cm, int row) { tile->mi_row_start = get_tile_offset(row, cm->mi_rows, cm->log2_tile_rows); tile->mi_row_end = get_tile_offset(row + 1, cm->mi_rows, cm->log2_tile_rows); +} + +void vp9_tile_set_col(TileInfo *tile, const VP9_COMMON *cm, int col) { tile->mi_col_start = get_tile_offset(col, cm->mi_cols, cm->log2_tile_cols); tile->mi_col_end = get_tile_offset(col + 1, cm->mi_cols, cm->log2_tile_cols); } +void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm, int row, int col) { + vp9_tile_set_row(tile, cm, row); + vp9_tile_set_col(tile, cm, col); +} + void vp9_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols, int *max_log2_tile_cols) { const int sb_cols = mi_cols_aligned_to_sb(mi_cols) >> MI_BLOCK_SIZE_LOG2; diff --git a/libvpx/vp9/common/vp9_tile_common.h b/libvpx/vp9/common/vp9_tile_common.h index a97719e29..ae58805de 100644 --- a/libvpx/vp9/common/vp9_tile_common.h +++ b/libvpx/vp9/common/vp9_tile_common.h @@ -27,6 +27,9 @@ typedef struct TileInfo { void vp9_tile_init(TileInfo *tile, const struct VP9Common *cm, int row, int col); +void vp9_tile_set_row(TileInfo *tile, const struct VP9Common *cm, int row); +void vp9_tile_set_col(TileInfo *tile, const struct VP9Common *cm, int col); + void vp9_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols, int *max_log2_tile_cols); diff --git a/libvpx/vp9/common/x86/vp9_copy_sse2.asm b/libvpx/vp9/common/x86/vp9_copy_sse2.asm index 
dd522c698..b26383708 100644 --- a/libvpx/vp9/common/x86/vp9_copy_sse2.asm +++ b/libvpx/vp9/common/x86/vp9_copy_sse2.asm @@ -133,10 +133,14 @@ INIT_MMX sse movh m3, [srcq+r5q] lea srcq, [srcq+src_strideq*4] %ifidn %1, avg - pavgb m0, [dstq] - pavgb m1, [dstq+dst_strideq] - pavgb m2, [dstq+dst_strideq*2] - pavgb m3, [dstq+r6q] + movh m4, [dstq] + movh m5, [dstq+dst_strideq] + movh m6, [dstq+dst_strideq*2] + movh m7, [dstq+r6q] + pavgb m0, m4 + pavgb m1, m5 + pavgb m2, m6 + pavgb m3, m7 %endif movh [dstq ], m0 movh [dstq+dst_strideq ], m1 diff --git a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c index 13a5b5a82..b60f8a06d 100644 --- a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -8,12 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include <assert.h> -#include <emmintrin.h> // SSE2 -#include "./vpx_config.h" -#include "vpx/vpx_integer.h" -#include "vp9/common/vp9_common.h" -#include "vp9/common/vp9_idct.h" +#include "vp9/common/x86/vp9_idct_intrin_sse2.h" #define RECON_AND_STORE4X4(dest, in_x) \ { \ @@ -380,15 +375,6 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ } -#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - \ - in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ - in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ - } - #define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \ { \ const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ @@ -527,16 +513,6 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, out7 = _mm_subs_epi16(stp1_0, stp2_7); \ } -#define RECON_AND_STORE(dest, in_x) \ - { \ - __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = 
_mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - _mm_storel_epi64((__m128i *)(dest), d0); \ - dest += stride; \ - } - void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { const __m128i zero = _mm_setzero_si128(); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); @@ -627,53 +603,6 @@ void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { RECON_AND_STORE(dest, dc_value); } -// perform 8x8 transpose -static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); - - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); -} - -static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = 
_mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - - out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4); - out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4); - out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6); - out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6); -} - static void idct8_sse2(__m128i *in) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); @@ -995,7 +924,7 @@ void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride, RECON_AND_STORE(dest, in[7]); } -void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { +void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) { const __m128i zero = _mm_setzero_si128(); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1<<4); @@ -1573,23 +1502,6 @@ void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { } } -static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { - __m128i tbuf[8]; - array_transpose_8x8(res0, res0); - array_transpose_8x8(res1, tbuf); - array_transpose_8x8(res0 + 8, res1); - array_transpose_8x8(res1 + 8, res1 + 8); - - res0[8] = tbuf[0]; - res0[9] = tbuf[1]; - res0[10] = tbuf[2]; - res0[11] = tbuf[3]; - res0[12] = tbuf[4]; - res0[13] = tbuf[5]; - res0[14] = tbuf[6]; - res0[15] = tbuf[7]; -} - static void iadst16_8col(__m128i *in) { // perform 16x16 1-D ADST for 8 columns __m128i s[16], x[16], u[32], v[32]; @@ -2416,82 +2328,6 @@ static void iadst16_sse2(__m128i *in0, __m128i *in1) { iadst16_8col(in1); } -static INLINE void load_buffer_8x16(const int16_t *input, __m128i 
*in) { - in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16)); - in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16)); - in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16)); - in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16)); - in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16)); - in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16)); - in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16)); - in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16)); - - in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16)); - in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16)); - in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16)); - in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16)); - in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16)); - in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16)); - in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16)); - in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16)); -} - -static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { - const __m128i final_rounding = _mm_set1_epi16(1<<5); - const __m128i zero = _mm_setzero_si128(); - // Final rounding and shift - in[0] = _mm_adds_epi16(in[0], final_rounding); - in[1] = _mm_adds_epi16(in[1], final_rounding); - in[2] = _mm_adds_epi16(in[2], final_rounding); - in[3] = _mm_adds_epi16(in[3], final_rounding); - in[4] = _mm_adds_epi16(in[4], final_rounding); - in[5] = _mm_adds_epi16(in[5], final_rounding); - in[6] = _mm_adds_epi16(in[6], final_rounding); - in[7] = _mm_adds_epi16(in[7], final_rounding); - in[8] = _mm_adds_epi16(in[8], final_rounding); - in[9] = _mm_adds_epi16(in[9], final_rounding); - in[10] = _mm_adds_epi16(in[10], final_rounding); - in[11] = _mm_adds_epi16(in[11], final_rounding); - in[12] = _mm_adds_epi16(in[12], final_rounding); - in[13] = _mm_adds_epi16(in[13], final_rounding); - in[14] = _mm_adds_epi16(in[14], final_rounding); - in[15] = _mm_adds_epi16(in[15], 
final_rounding); - - in[0] = _mm_srai_epi16(in[0], 6); - in[1] = _mm_srai_epi16(in[1], 6); - in[2] = _mm_srai_epi16(in[2], 6); - in[3] = _mm_srai_epi16(in[3], 6); - in[4] = _mm_srai_epi16(in[4], 6); - in[5] = _mm_srai_epi16(in[5], 6); - in[6] = _mm_srai_epi16(in[6], 6); - in[7] = _mm_srai_epi16(in[7], 6); - in[8] = _mm_srai_epi16(in[8], 6); - in[9] = _mm_srai_epi16(in[9], 6); - in[10] = _mm_srai_epi16(in[10], 6); - in[11] = _mm_srai_epi16(in[11], 6); - in[12] = _mm_srai_epi16(in[12], 6); - in[13] = _mm_srai_epi16(in[13], 6); - in[14] = _mm_srai_epi16(in[14], 6); - in[15] = _mm_srai_epi16(in[15], 6); - - RECON_AND_STORE(dest, in[0]); - RECON_AND_STORE(dest, in[1]); - RECON_AND_STORE(dest, in[2]); - RECON_AND_STORE(dest, in[3]); - RECON_AND_STORE(dest, in[4]); - RECON_AND_STORE(dest, in[5]); - RECON_AND_STORE(dest, in[6]); - RECON_AND_STORE(dest, in[7]); - RECON_AND_STORE(dest, in[8]); - RECON_AND_STORE(dest, in[9]); - RECON_AND_STORE(dest, in[10]); - RECON_AND_STORE(dest, in[11]); - RECON_AND_STORE(dest, in[12]); - RECON_AND_STORE(dest, in[13]); - RECON_AND_STORE(dest, in[14]); - RECON_AND_STORE(dest, in[15]); -} - void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride, int tx_type) { __m128i in0[16], in1[16]; diff --git a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.h b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.h new file mode 100644 index 000000000..0f179b49a --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.h @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include <emmintrin.h> // SSE2 +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_idct.h" + +// perform 8x8 transpose +static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { + const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); + const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); + const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); + const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); + + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + + res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); + res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); + res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); + res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); + res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); + res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); + res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); + res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); +} + +#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + \ + in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ + in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ + } + +static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) { + const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], 
in[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); + + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + + out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4); + out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4); + out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6); + out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6); +} + +static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { + __m128i tbuf[8]; + array_transpose_8x8(res0, res0); + array_transpose_8x8(res1, tbuf); + array_transpose_8x8(res0 + 8, res1); + array_transpose_8x8(res1 + 8, res1 + 8); + + res0[8] = tbuf[0]; + res0[9] = tbuf[1]; + res0[10] = tbuf[2]; + res0[11] = tbuf[3]; + res0[12] = tbuf[4]; + res0[13] = tbuf[5]; + res0[14] = tbuf[6]; + res0[15] = tbuf[7]; +} + +static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16)); + in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16)); + in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16)); + in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16)); + in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16)); + in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16)); + in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16)); + in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16)); + + in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16)); + in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16)); + in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16)); + in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16)); + in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16)); + in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16)); + in[14] = _mm_load_si128((const 
__m128i *)(input + 14 * 16)); + in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16)); +} + +#define RECON_AND_STORE(dest, in_x) \ + { \ + __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ + d0 = _mm_unpacklo_epi8(d0, zero); \ + d0 = _mm_add_epi16(in_x, d0); \ + d0 = _mm_packus_epi16(d0, d0); \ + _mm_storel_epi64((__m128i *)(dest), d0); \ + dest += stride; \ + } + +static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { + const __m128i final_rounding = _mm_set1_epi16(1<<5); + const __m128i zero = _mm_setzero_si128(); + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); + + RECON_AND_STORE(dest, 
in[0]); + RECON_AND_STORE(dest, in[1]); + RECON_AND_STORE(dest, in[2]); + RECON_AND_STORE(dest, in[3]); + RECON_AND_STORE(dest, in[4]); + RECON_AND_STORE(dest, in[5]); + RECON_AND_STORE(dest, in[6]); + RECON_AND_STORE(dest, in[7]); + RECON_AND_STORE(dest, in[8]); + RECON_AND_STORE(dest, in[9]); + RECON_AND_STORE(dest, in[10]); + RECON_AND_STORE(dest, in[11]); + RECON_AND_STORE(dest, in[12]); + RECON_AND_STORE(dest, in[13]); + RECON_AND_STORE(dest, in[14]); + RECON_AND_STORE(dest, in[15]); +} diff --git a/libvpx/vp9/common/x86/vp9_idct_intrin_ssse3.c b/libvpx/vp9/common/x86/vp9_idct_intrin_ssse3.c new file mode 100644 index 000000000..73bf5d1d7 --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_idct_intrin_ssse3.c @@ -0,0 +1,762 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#if defined(_MSC_VER) && _MSC_VER <= 1500 +// Need to include math.h before calling tmmintrin.h/intrin.h +// in certain versions of MSVS. 
+#include <math.h> +#endif +#include <tmmintrin.h> // SSSE3 +#include "vp9/common/x86/vp9_idct_intrin_sse2.h" + +static void idct16_8col(__m128i *in, int round) { + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i k__cospi_p16_p16_x2 = pair_set_epi16(23170, 23170); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i v[16], u[16], s[16], t[16]; + + // stage 1 + s[0] = in[0]; + s[1] = in[8]; + s[2] = in[4]; + s[3] = in[12]; + s[4] = in[2]; + s[5] = in[10]; + s[6] = in[6]; + s[7] = in[14]; + s[8] = in[1]; + s[9] = in[9]; + s[10] = in[5]; + s[11] 
= in[13]; + s[12] = in[3]; + s[13] = in[11]; + s[14] = in[7]; + s[15] = in[15]; + + // stage 2 + u[0] = _mm_unpacklo_epi16(s[8], s[15]); + u[1] = _mm_unpackhi_epi16(s[8], s[15]); + u[2] = _mm_unpacklo_epi16(s[9], s[14]); + u[3] = _mm_unpackhi_epi16(s[9], s[14]); + u[4] = _mm_unpacklo_epi16(s[10], s[13]); + u[5] = _mm_unpackhi_epi16(s[10], s[13]); + u[6] = _mm_unpacklo_epi16(s[11], s[12]); + u[7] = _mm_unpackhi_epi16(s[11], s[12]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02); + v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02); + v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30); + v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30); + v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18); + v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18); + v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14); + v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14); + v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10); + v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10); + v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22); + v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22); + v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26); + v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26); + v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06); + v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = 
_mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[8] = _mm_packs_epi32(u[0], u[1]); + s[15] = _mm_packs_epi32(u[2], u[3]); + s[9] = _mm_packs_epi32(u[4], u[5]); + s[14] = _mm_packs_epi32(u[6], u[7]); + s[10] = _mm_packs_epi32(u[8], u[9]); + s[13] = _mm_packs_epi32(u[10], u[11]); + s[11] = _mm_packs_epi32(u[12], u[13]); + s[12] = _mm_packs_epi32(u[14], u[15]); + + // stage 3 + t[0] = s[0]; + t[1] = s[1]; + t[2] = s[2]; + t[3] = s[3]; + u[0] = _mm_unpacklo_epi16(s[4], s[7]); + u[1] = _mm_unpackhi_epi16(s[4], s[7]); + u[2] = _mm_unpacklo_epi16(s[5], s[6]); + u[3] = _mm_unpackhi_epi16(s[5], s[6]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04); + v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04); + v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28); + v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28); + v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20); + v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20); + v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12); + v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = 
_mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + t[4] = _mm_packs_epi32(u[0], u[1]); + t[7] = _mm_packs_epi32(u[2], u[3]); + t[5] = _mm_packs_epi32(u[4], u[5]); + t[6] = _mm_packs_epi32(u[6], u[7]); + t[8] = _mm_add_epi16(s[8], s[9]); + t[9] = _mm_sub_epi16(s[8], s[9]); + t[10] = _mm_sub_epi16(s[11], s[10]); + t[11] = _mm_add_epi16(s[10], s[11]); + t[12] = _mm_add_epi16(s[12], s[13]); + t[13] = _mm_sub_epi16(s[12], s[13]); + t[14] = _mm_sub_epi16(s[15], s[14]); + t[15] = _mm_add_epi16(s[14], s[15]); + + // stage 4 + u[0] = _mm_add_epi16(t[0], t[1]); + u[1] = _mm_sub_epi16(t[0], t[1]); + u[2] = _mm_unpacklo_epi16(t[2], t[3]); + u[3] = _mm_unpackhi_epi16(t[2], t[3]); + u[4] = _mm_unpacklo_epi16(t[9], t[14]); + u[5] = _mm_unpackhi_epi16(t[9], t[14]); + u[6] = _mm_unpacklo_epi16(t[10], t[13]); + u[7] = _mm_unpackhi_epi16(t[10], t[13]); + + s[0] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2); + s[1] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2); + v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08); + v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08); + v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); + v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); + v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24); + v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24); + v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08); + v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08); + v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08); + v[13] = 
_mm_madd_epi16(u[7], k__cospi_m24_m08); + v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24); + v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24); + + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[2] = _mm_packs_epi32(u[4], u[5]); + s[3] = _mm_packs_epi32(u[6], u[7]); + s[4] = _mm_add_epi16(t[4], t[5]); + s[5] = _mm_sub_epi16(t[4], t[5]); + s[6] = _mm_sub_epi16(t[7], t[6]); + s[7] = _mm_add_epi16(t[6], t[7]); + s[8] = t[8]; + s[15] = t[15]; + s[9] = _mm_packs_epi32(u[8], u[9]); + s[14] = _mm_packs_epi32(u[10], u[11]); + s[10] = _mm_packs_epi32(u[12], u[13]); + s[13] = _mm_packs_epi32(u[14], u[15]); + s[11] = t[11]; + s[12] = t[12]; + + // stage 5 + t[0] = _mm_add_epi16(s[0], s[3]); + t[1] = _mm_add_epi16(s[1], s[2]); + t[2] = _mm_sub_epi16(s[1], s[2]); + t[3] = _mm_sub_epi16(s[0], s[3]); + t[4] = s[4]; + t[7] = 
s[7]; + + u[0] = _mm_sub_epi16(s[6], s[5]); + u[1] = _mm_add_epi16(s[6], s[5]); + t[5] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2); + t[6] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2); + + t[8] = _mm_add_epi16(s[8], s[11]); + t[9] = _mm_add_epi16(s[9], s[10]); + t[10] = _mm_sub_epi16(s[9], s[10]); + t[11] = _mm_sub_epi16(s[8], s[11]); + t[12] = _mm_sub_epi16(s[15], s[12]); + t[13] = _mm_sub_epi16(s[14], s[13]); + t[14] = _mm_add_epi16(s[13], s[14]); + t[15] = _mm_add_epi16(s[12], s[15]); + + // stage 6 + if (round == 1) { + s[0] = _mm_add_epi16(t[0], t[7]); + s[1] = _mm_add_epi16(t[1], t[6]); + s[2] = _mm_add_epi16(t[2], t[5]); + s[3] = _mm_add_epi16(t[3], t[4]); + s[4] = _mm_sub_epi16(t[3], t[4]); + s[5] = _mm_sub_epi16(t[2], t[5]); + s[6] = _mm_sub_epi16(t[1], t[6]); + s[7] = _mm_sub_epi16(t[0], t[7]); + s[8] = t[8]; + s[9] = t[9]; + + u[0] = _mm_unpacklo_epi16(t[10], t[13]); + u[1] = _mm_unpackhi_epi16(t[10], t[13]); + u[2] = _mm_unpacklo_epi16(t[11], t[12]); + u[3] = _mm_unpackhi_epi16(t[11], t[12]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16); + v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16); + v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16); + v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = 
_mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + s[10] = _mm_packs_epi32(u[0], u[1]); + s[13] = _mm_packs_epi32(u[2], u[3]); + s[11] = _mm_packs_epi32(u[4], u[5]); + s[12] = _mm_packs_epi32(u[6], u[7]); + s[14] = t[14]; + s[15] = t[15]; + } else { + s[0] = _mm_add_epi16(t[0], t[7]); + s[1] = _mm_add_epi16(t[1], t[6]); + s[2] = _mm_add_epi16(t[2], t[5]); + s[3] = _mm_add_epi16(t[3], t[4]); + s[4] = _mm_sub_epi16(t[3], t[4]); + s[5] = _mm_sub_epi16(t[2], t[5]); + s[6] = _mm_sub_epi16(t[1], t[6]); + s[7] = _mm_sub_epi16(t[0], t[7]); + s[8] = t[8]; + s[9] = t[9]; + + u[0] = _mm_sub_epi16(t[13], t[10]); + u[1] = _mm_add_epi16(t[13], t[10]); + u[2] = _mm_sub_epi16(t[12], t[11]); + u[3] = _mm_add_epi16(t[12], t[11]); + + s[10] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2); + s[13] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2); + s[11] = _mm_mulhrs_epi16(u[2], k__cospi_p16_p16_x2); + s[12] = _mm_mulhrs_epi16(u[3], k__cospi_p16_p16_x2); + s[14] = t[14]; + s[15] = t[15]; + } + + // stage 7 + in[0] = _mm_add_epi16(s[0], s[15]); + in[1] = _mm_add_epi16(s[1], s[14]); + in[2] = _mm_add_epi16(s[2], s[13]); + in[3] = _mm_add_epi16(s[3], s[12]); + in[4] = _mm_add_epi16(s[4], s[11]); + in[5] = _mm_add_epi16(s[5], s[10]); + in[6] = _mm_add_epi16(s[6], s[9]); + in[7] = _mm_add_epi16(s[7], s[8]); + in[8] = _mm_sub_epi16(s[7], s[8]); + in[9] = _mm_sub_epi16(s[6], s[9]); + in[10] = _mm_sub_epi16(s[5], s[10]); + in[11] = _mm_sub_epi16(s[4], s[11]); + in[12] = _mm_sub_epi16(s[3], s[12]); + in[13] = _mm_sub_epi16(s[2], s[13]); + in[14] = _mm_sub_epi16(s[1], s[14]); + in[15] = _mm_sub_epi16(s[0], s[15]); +} + +static void idct16_sse2(__m128i *in0, __m128i *in1, int round) { + array_transpose_16x16(in0, in1); + idct16_8col(in0, round); + idct16_8col(in1, round); +} + +void vp9_idct16x16_256_add_ssse3(const int16_t 
*input, uint8_t *dest, + int stride) { + __m128i in0[16], in1[16]; + + load_buffer_8x16(input, in0); + input += 8; + load_buffer_8x16(input, in1); + + idct16_sse2(in0, in1, 0); + idct16_sse2(in0, in1, 1); + + write_buffer_8x16(dest, in0, stride); + dest += 8; + write_buffer_8x16(dest, in1, stride); +} + +static void idct16_10_r1(__m128i *in, __m128i *l) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i zero = _mm_setzero_si128(); + + const __m128i stg2_01 = dual_set_epi16(3212, 32610); + const __m128i stg2_67 = dual_set_epi16(-9512, 31358); + const __m128i stg3_01 = dual_set_epi16(6392, 32138); + const __m128i stg4_01 = dual_set_epi16(23170, 23170); + + + + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); + + __m128i stp1_0, stp1_1, stp1_4, stp1_6, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4; + + // Stage2 + { + const __m128i lo_1_15 = _mm_unpackhi_epi64(in[0], in[0]); + const __m128i lo_13_3 = _mm_unpackhi_epi64(in[1], in[1]); + + stp2_8 = _mm_mulhrs_epi16(lo_1_15, stg2_01); + stp2_11 = _mm_mulhrs_epi16(lo_13_3, stg2_67); + } + + // Stage3 + { + const __m128i lo_2_14 = _mm_unpacklo_epi64(in[1], in[1]); + stp1_4 = _mm_mulhrs_epi16(lo_2_14, stg3_01); + + stp1_13 = _mm_unpackhi_epi64(stp2_11, zero); + stp1_14 = _mm_unpackhi_epi64(stp2_8, zero); + } + + // Stage4 + { + const __m128i lo_0_8 = _mm_unpacklo_epi64(in[0], in[0]); + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13); + + tmp0 = _mm_mulhrs_epi16(lo_0_8, stg4_01); + tmp1 = _mm_madd_epi16(lo_9_14, 
stg4_4); + tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); + tmp2 = _mm_madd_epi16(lo_10_13, stg4_6); + tmp4 = _mm_madd_epi16(lo_10_13, stg4_7); + + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + + stp1_0 = _mm_unpacklo_epi64(tmp0, tmp0); + stp1_1 = _mm_unpackhi_epi64(tmp0, tmp0); + stp2_9 = _mm_packs_epi32(tmp1, tmp3); + stp2_10 = _mm_packs_epi32(tmp2, tmp4); + + stp2_6 = _mm_unpackhi_epi64(stp1_4, zero); + } + + // Stage5 and Stage6 + { + tmp0 = _mm_add_epi16(stp2_8, stp2_11); + tmp1 = _mm_sub_epi16(stp2_8, stp2_11); + tmp2 = _mm_add_epi16(stp2_9, stp2_10); + tmp3 = _mm_sub_epi16(stp2_9, stp2_10); + + stp1_9 = _mm_unpacklo_epi64(tmp2, zero); + stp1_10 = _mm_unpacklo_epi64(tmp3, zero); + stp1_8 = _mm_unpacklo_epi64(tmp0, zero); + stp1_11 = _mm_unpacklo_epi64(tmp1, zero); + + stp1_13 = _mm_unpackhi_epi64(tmp3, zero); + stp1_14 = _mm_unpackhi_epi64(tmp2, zero); + stp1_12 = _mm_unpackhi_epi64(tmp1, zero); + stp1_15 = _mm_unpackhi_epi64(tmp0, zero); + } + + // Stage6 + { + const __m128i lo_6_5 = _mm_add_epi16(stp2_6, stp1_4); + const __m128i lo_6_6 = _mm_sub_epi16(stp2_6, stp1_4); + const __m128i lo_10_13 = _mm_sub_epi16(stp1_13, stp1_10); + const __m128i lo_10_14 = _mm_add_epi16(stp1_13, stp1_10); + const __m128i lo_11_12 = _mm_sub_epi16(stp1_12, stp1_11); + const __m128i lo_11_13 = _mm_add_epi16(stp1_12, stp1_11); + + tmp1 = _mm_unpacklo_epi64(lo_6_5, lo_6_6); + tmp0 = _mm_unpacklo_epi64(lo_10_13, lo_10_14); + tmp4 = _mm_unpacklo_epi64(lo_11_12, lo_11_13); + + stp1_6 = _mm_mulhrs_epi16(tmp1, stg4_01); + tmp0 = _mm_mulhrs_epi16(tmp0, stg4_01); + tmp4 = _mm_mulhrs_epi16(tmp4, stg4_01); + + stp2_10 = _mm_unpacklo_epi64(tmp0, zero); + stp2_13 = _mm_unpackhi_epi64(tmp0, zero); + 
stp2_11 = _mm_unpacklo_epi64(tmp4, zero); + stp2_12 = _mm_unpackhi_epi64(tmp4, zero); + + tmp0 = _mm_add_epi16(stp1_0, stp1_4); + tmp1 = _mm_sub_epi16(stp1_0, stp1_4); + tmp2 = _mm_add_epi16(stp1_1, stp1_6); + tmp3 = _mm_sub_epi16(stp1_1, stp1_6); + + stp2_0 = _mm_unpackhi_epi64(tmp0, zero); + stp2_1 = _mm_unpacklo_epi64(tmp2, zero); + stp2_2 = _mm_unpackhi_epi64(tmp2, zero); + stp2_3 = _mm_unpacklo_epi64(tmp0, zero); + stp2_4 = _mm_unpacklo_epi64(tmp1, zero); + stp2_5 = _mm_unpackhi_epi64(tmp3, zero); + stp2_6 = _mm_unpacklo_epi64(tmp3, zero); + stp2_7 = _mm_unpackhi_epi64(tmp1, zero); + } + + // Stage7. Left 8x16 only. + l[0] = _mm_add_epi16(stp2_0, stp1_15); + l[1] = _mm_add_epi16(stp2_1, stp1_14); + l[2] = _mm_add_epi16(stp2_2, stp2_13); + l[3] = _mm_add_epi16(stp2_3, stp2_12); + l[4] = _mm_add_epi16(stp2_4, stp2_11); + l[5] = _mm_add_epi16(stp2_5, stp2_10); + l[6] = _mm_add_epi16(stp2_6, stp1_9); + l[7] = _mm_add_epi16(stp2_7, stp1_8); + l[8] = _mm_sub_epi16(stp2_7, stp1_8); + l[9] = _mm_sub_epi16(stp2_6, stp1_9); + l[10] = _mm_sub_epi16(stp2_5, stp2_10); + l[11] = _mm_sub_epi16(stp2_4, stp2_11); + l[12] = _mm_sub_epi16(stp2_3, stp2_12); + l[13] = _mm_sub_epi16(stp2_2, stp2_13); + l[14] = _mm_sub_epi16(stp2_1, stp1_14); + l[15] = _mm_sub_epi16(stp2_0, stp1_15); +} + +static void idct16_10_r2(__m128i *in) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + + const __m128i stg2_0 = dual_set_epi16(3212, 3212); + const __m128i stg2_1 = dual_set_epi16(32610, 32610); + const __m128i stg2_6 = dual_set_epi16(-9512, -9512); + const __m128i stg2_7 = dual_set_epi16(31358, 31358); + const __m128i stg3_0 = dual_set_epi16(6392, 6392); + const __m128i stg3_1 = dual_set_epi16(32138, 32138); + const __m128i stg4_01 = dual_set_epi16(23170, 23170); + + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i 
stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); + + __m128i stp1_0, stp1_2, stp1_3, stp1_5, stp1_6, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_8_0, stp1_12_0; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + /* Stage2 */ + { + stp1_8_0 = _mm_mulhrs_epi16(in[1], stg2_0); + stp1_15 = _mm_mulhrs_epi16(in[1], stg2_1); + stp1_11 = _mm_mulhrs_epi16(in[3], stg2_6); + stp1_12_0 = _mm_mulhrs_epi16(in[3], stg2_7); + } + + /* Stage3 */ + { + stp2_4 = _mm_mulhrs_epi16(in[2], stg3_0); + stp2_7 = _mm_mulhrs_epi16(in[2], stg3_1); + + stp1_9 = stp1_8_0; + stp1_10 = stp1_11; + + stp1_13 = stp1_12_0; + stp1_14 = stp1_15; + } + + /* Stage4 */ + { + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); + + stp1_0 = _mm_mulhrs_epi16(in[0], stg4_01); + + stp2_5 = stp2_4; + stp2_6 = stp2_7; + + + tmp0 = _mm_madd_epi16(lo_9_14, stg4_4); + tmp1 = _mm_madd_epi16(hi_9_14, stg4_4); + tmp2 = _mm_madd_epi16(lo_9_14, stg4_5); + tmp3 = _mm_madd_epi16(hi_9_14, stg4_5); + tmp4 = _mm_madd_epi16(lo_10_13, stg4_6); + tmp5 = _mm_madd_epi16(hi_10_13, stg4_6); + tmp6 = _mm_madd_epi16(lo_10_13, stg4_7); + tmp7 = _mm_madd_epi16(hi_10_13, stg4_7); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp5 = _mm_add_epi32(tmp5, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + tmp7 = _mm_add_epi32(tmp7, rounding); + + tmp0 = _mm_srai_epi32(tmp0, 14); + tmp1 = _mm_srai_epi32(tmp1, 14); + tmp2 = _mm_srai_epi32(tmp2, 14); + tmp3 = _mm_srai_epi32(tmp3, 14); + tmp4 = _mm_srai_epi32(tmp4, 
14); + tmp5 = _mm_srai_epi32(tmp5, 14); + tmp6 = _mm_srai_epi32(tmp6, 14); + tmp7 = _mm_srai_epi32(tmp7, 14); + + stp2_9 = _mm_packs_epi32(tmp0, tmp1); + stp2_14 = _mm_packs_epi32(tmp2, tmp3); + stp2_10 = _mm_packs_epi32(tmp4, tmp5); + stp2_13 = _mm_packs_epi32(tmp6, tmp7); + } + + /* Stage5 */ + { + stp1_2 = stp1_0; + stp1_3 = stp1_0; + + tmp0 = _mm_sub_epi16(stp2_6, stp2_5); + tmp1 = _mm_add_epi16(stp2_6, stp2_5); + + stp1_5 = _mm_mulhrs_epi16(tmp0, stg4_01); + stp1_6 = _mm_mulhrs_epi16(tmp1, stg4_01); + + stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); + stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); + + stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); + stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); + } + + /* Stage6 */ + { + stp2_0 = _mm_add_epi16(stp1_0, stp2_7); + stp2_1 = _mm_add_epi16(stp1_0, stp1_6); + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); + stp2_3 = _mm_add_epi16(stp1_3, stp2_4); + + tmp0 = _mm_sub_epi16(stp1_13, stp1_10); + tmp1 = _mm_add_epi16(stp1_13, stp1_10); + tmp2 = _mm_sub_epi16(stp1_12, stp1_11); + tmp3 = _mm_add_epi16(stp1_12, stp1_11); + + stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_0, stp1_6); + stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); + + stp2_10 = _mm_mulhrs_epi16(tmp0, stg4_01); + stp2_13 = _mm_mulhrs_epi16(tmp1, stg4_01); + stp2_11 = _mm_mulhrs_epi16(tmp2, stg4_01); + stp2_12 = _mm_mulhrs_epi16(tmp3, stg4_01); + } + + // Stage7 + in[0] = _mm_add_epi16(stp2_0, stp1_15); + in[1] = _mm_add_epi16(stp2_1, stp1_14); + in[2] = _mm_add_epi16(stp2_2, stp2_13); + in[3] = _mm_add_epi16(stp2_3, stp2_12); + in[4] = _mm_add_epi16(stp2_4, stp2_11); + in[5] = _mm_add_epi16(stp2_5, stp2_10); + in[6] = _mm_add_epi16(stp2_6, stp1_9); + in[7] = _mm_add_epi16(stp2_7, stp1_8); + in[8] = _mm_sub_epi16(stp2_7, stp1_8); + in[9] = 
_mm_sub_epi16(stp2_6, stp1_9); + in[10] = _mm_sub_epi16(stp2_5, stp2_10); + in[11] = _mm_sub_epi16(stp2_4, stp2_11); + in[12] = _mm_sub_epi16(stp2_3, stp2_12); + in[13] = _mm_sub_epi16(stp2_2, stp2_13); + in[14] = _mm_sub_epi16(stp2_1, stp1_14); + in[15] = _mm_sub_epi16(stp2_0, stp1_15); +} + +void vp9_idct16x16_10_add_ssse3(const int16_t *input, uint8_t *dest, + int stride) { + const __m128i final_rounding = _mm_set1_epi16(1<<5); + const __m128i zero = _mm_setzero_si128(); + __m128i in[16], l[16]; + + int i; + // First 1-D inverse DCT + // Load input data. + in[0] = _mm_load_si128((const __m128i *)input); + in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); + in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); + in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); + + TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]); + + idct16_10_r1(in, l); + + // Second 1-D inverse transform, performed per 8x16 block + for (i = 0; i < 2; i++) { + array_transpose_4X8(l + 8*i, in); + + idct16_10_r2(in); + + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 
6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); + + RECON_AND_STORE(dest, in[0]); + RECON_AND_STORE(dest, in[1]); + RECON_AND_STORE(dest, in[2]); + RECON_AND_STORE(dest, in[3]); + RECON_AND_STORE(dest, in[4]); + RECON_AND_STORE(dest, in[5]); + RECON_AND_STORE(dest, in[6]); + RECON_AND_STORE(dest, in[7]); + RECON_AND_STORE(dest, in[8]); + RECON_AND_STORE(dest, in[9]); + RECON_AND_STORE(dest, in[10]); + RECON_AND_STORE(dest, in[11]); + RECON_AND_STORE(dest, in[12]); + RECON_AND_STORE(dest, in[13]); + RECON_AND_STORE(dest, in[14]); + RECON_AND_STORE(dest, in[15]); + + dest += 8 - (stride * 16); + } +} diff --git a/libvpx/vp9/common/x86/vp9_idct_ssse3_x86_64.asm b/libvpx/vp9/common/x86/vp9_idct_ssse3_x86_64.asm new file mode 100644 index 000000000..2c1060710 --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_idct_ssse3_x86_64.asm @@ -0,0 +1,300 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +%include "third_party/x86inc/x86inc.asm" + +; This file provides SSSE3 version of the inverse transformation. Part +; of the functions are originally derived from the ffmpeg project. +; Note that the current version applies to x86 64-bit only. 
+ +SECTION_RODATA + +pw_11585x2: times 8 dw 23170 +pd_8192: times 4 dd 8192 +pw_16: times 8 dw 16 + +%macro TRANSFORM_COEFFS 2 +pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2 +pw_m%2_%1: dw -%2, %1, -%2, %1, -%2, %1, -%2, %1 +%endmacro + +TRANSFORM_COEFFS 6270, 15137 +TRANSFORM_COEFFS 3196, 16069 +TRANSFORM_COEFFS 13623, 9102 + +%macro PAIR_PP_COEFFS 2 +dpw_%1_%2: dw %1, %1, %1, %1, %2, %2, %2, %2 +%endmacro + +%macro PAIR_MP_COEFFS 2 +dpw_m%1_%2: dw -%1, -%1, -%1, -%1, %2, %2, %2, %2 +%endmacro + +%macro PAIR_MM_COEFFS 2 +dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2 +%endmacro + +PAIR_PP_COEFFS 30274, 12540 +PAIR_PP_COEFFS 6392, 32138 +PAIR_MP_COEFFS 18204, 27246 + +PAIR_PP_COEFFS 12540, 12540 +PAIR_PP_COEFFS 30274, 30274 +PAIR_PP_COEFFS 6392, 6392 +PAIR_PP_COEFFS 32138, 32138 +PAIR_MM_COEFFS 18204, 18204 +PAIR_PP_COEFFS 27246, 27246 + +SECTION .text + +%if ARCH_X86_64 +%macro SUM_SUB 3 + psubw m%3, m%1, m%2 + paddw m%1, m%2 + SWAP %2, %3 +%endmacro + +; butterfly operation +%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2 + pmaddwd m%1, m%3, %5 + pmaddwd m%2, m%3, %6 + paddd m%1, %4 + paddd m%2, %4 + psrad m%1, 14 + psrad m%2, 14 +%endmacro + +%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 + punpckhwd m%6, m%2, m%1 + MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_%3_%4] + punpcklwd m%2, m%1 + MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_%3_%4] + packssdw m%1, m%7 + packssdw m%2, m%6 +%endmacro + +; matrix transpose +%macro INTERLEAVE_2X 4 + punpckh%1 m%4, m%2, m%3 + punpckl%1 m%2, m%3 + SWAP %3, %4 +%endmacro + +%macro TRANSPOSE8X8 9 + INTERLEAVE_2X wd, %1, %2, %9 + INTERLEAVE_2X wd, %3, %4, %9 + INTERLEAVE_2X wd, %5, %6, %9 + INTERLEAVE_2X wd, %7, %8, %9 + + INTERLEAVE_2X dq, %1, %3, %9 + INTERLEAVE_2X dq, %2, %4, %9 + INTERLEAVE_2X dq, %5, %7, %9 + INTERLEAVE_2X dq, %6, %8, %9 + + INTERLEAVE_2X qdq, %1, %5, %9 + INTERLEAVE_2X qdq, %3, %7, %9 + INTERLEAVE_2X qdq, %2, %6, %9 + INTERLEAVE_2X qdq, %4, %8, %9 + + SWAP %2, %5 + 
SWAP %4, %7 +%endmacro + +%macro IDCT8_1D 0 + SUM_SUB 0, 4, 9 + BUTTERFLY_4X 2, 6, 6270, 15137, m8, 9, 10 + pmulhrsw m0, m12 + pmulhrsw m4, m12 + BUTTERFLY_4X 1, 7, 3196, 16069, m8, 9, 10 + BUTTERFLY_4X 5, 3, 13623, 9102, m8, 9, 10 + + SUM_SUB 1, 5, 9 + SUM_SUB 7, 3, 9 + SUM_SUB 0, 6, 9 + SUM_SUB 4, 2, 9 + SUM_SUB 3, 5, 9 + pmulhrsw m3, m12 + pmulhrsw m5, m12 + + SUM_SUB 0, 7, 9 + SUM_SUB 4, 3, 9 + SUM_SUB 2, 5, 9 + SUM_SUB 6, 1, 9 + + SWAP 3, 6 + SWAP 1, 4 +%endmacro + +; This macro handles 8 pixels per line +%macro ADD_STORE_8P_2X 5; src1, src2, tmp1, tmp2, zero + paddw m%1, m11 + paddw m%2, m11 + psraw m%1, 5 + psraw m%2, 5 + + movh m%3, [outputq] + movh m%4, [outputq + strideq] + punpcklbw m%3, m%5 + punpcklbw m%4, m%5 + paddw m%3, m%1 + paddw m%4, m%2 + packuswb m%3, m%5 + packuswb m%4, m%5 + movh [outputq], m%3 + movh [outputq + strideq], m%4 +%endmacro + +INIT_XMM ssse3 +; full inverse 8x8 2D-DCT transform +cglobal idct8x8_64_add, 3, 5, 13, input, output, stride + mova m8, [pd_8192] + mova m11, [pw_16] + mova m12, [pw_11585x2] + + lea r3, [2 * strideq] + + mova m0, [inputq + 0] + mova m1, [inputq + 16] + mova m2, [inputq + 32] + mova m3, [inputq + 48] + mova m4, [inputq + 64] + mova m5, [inputq + 80] + mova m6, [inputq + 96] + mova m7, [inputq + 112] + + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + IDCT8_1D + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + IDCT8_1D + + pxor m12, m12 + ADD_STORE_8P_2X 0, 1, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 2, 3, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 4, 5, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 6, 7, 9, 10, 12 + + RET + +; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero +cglobal idct8x8_12_add, 3, 5, 13, input, output, stride + mova m8, [pd_8192] + mova m11, [pw_16] + mova m12, [pw_11585x2] + + lea r3, [2 * strideq] + + mova m0, [inputq + 0] + mova m1, [inputq + 16] + mova m2, [inputq + 32] + mova m3, [inputq + 48] + + punpcklwd m0, m1 + punpcklwd m2, m3 + 
punpckhdq m9, m0, m2 + punpckldq m0, m2 + SWAP 2, 9 + + ; m0 -> [0], [0] + ; m1 -> [1], [1] + ; m2 -> [2], [2] + ; m3 -> [3], [3] + punpckhqdq m10, m0, m0 + punpcklqdq m0, m0 + punpckhqdq m9, m2, m2 + punpcklqdq m2, m2 + SWAP 1, 10 + SWAP 3, 9 + + pmulhrsw m0, m12 + pmulhrsw m2, [dpw_30274_12540] + pmulhrsw m1, [dpw_6392_32138] + pmulhrsw m3, [dpw_m18204_27246] + + SUM_SUB 0, 2, 9 + SUM_SUB 1, 3, 9 + + punpcklqdq m9, m3, m3 + punpckhqdq m5, m3, m9 + + SUM_SUB 3, 5, 9 + punpckhqdq m5, m3 + pmulhrsw m5, m12 + + punpckhqdq m9, m1, m5 + punpcklqdq m1, m5 + SWAP 5, 9 + + SUM_SUB 0, 5, 9 + SUM_SUB 2, 1, 9 + + punpckhqdq m3, m0, m0 + punpckhqdq m4, m1, m1 + punpckhqdq m6, m5, m5 + punpckhqdq m7, m2, m2 + + punpcklwd m0, m3 + punpcklwd m7, m2 + punpcklwd m1, m4 + punpcklwd m6, m5 + + punpckhdq m4, m0, m7 + punpckldq m0, m7 + punpckhdq m10, m1, m6 + punpckldq m5, m1, m6 + + punpckhqdq m1, m0, m5 + punpcklqdq m0, m5 + punpckhqdq m3, m4, m10 + punpcklqdq m2, m4, m10 + + + pmulhrsw m0, m12 + pmulhrsw m6, m2, [dpw_30274_30274] + pmulhrsw m4, m2, [dpw_12540_12540] + + pmulhrsw m7, m1, [dpw_32138_32138] + pmulhrsw m1, [dpw_6392_6392] + pmulhrsw m5, m3, [dpw_m18204_m18204] + pmulhrsw m3, [dpw_27246_27246] + + mova m2, m0 + SUM_SUB 0, 6, 9 + SUM_SUB 2, 4, 9 + SUM_SUB 1, 5, 9 + SUM_SUB 7, 3, 9 + + SUM_SUB 3, 5, 9 + pmulhrsw m3, m12 + pmulhrsw m5, m12 + + SUM_SUB 0, 7, 9 + SUM_SUB 2, 3, 9 + SUM_SUB 4, 5, 9 + SUM_SUB 6, 1, 9 + + SWAP 3, 6 + SWAP 1, 2 + SWAP 2, 4 + + + pxor m12, m12 + ADD_STORE_8P_2X 0, 1, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 2, 3, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 4, 5, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 6, 7, 9, 10, 12 + + RET + +%endif diff --git a/libvpx/vp9/common/x86/vp9_postproc_x86.h b/libvpx/vp9/common/x86/vp9_postproc_x86.h deleted file mode 100644 index cab9d34f2..000000000 --- a/libvpx/vp9/common/x86/vp9_postproc_x86.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) 2010 The WebM 
project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_COMMON_X86_VP9_POSTPROC_X86_H_ -#define VP9_COMMON_X86_VP9_POSTPROC_X86_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -/* Note: - * - * This platform is commonly built for runtime CPU detection. If you modify - * any of the function mappings present in this file, be sure to also update - * them in the function pointer initialization code - */ - -#if HAVE_MMX -extern prototype_postproc_inplace(vp9_mbpost_proc_down_mmx); -extern prototype_postproc(vp9_post_proc_down_and_across_mmx); -extern prototype_postproc_addnoise(vp9_plane_add_noise_mmx); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_postproc_down -#define vp9_postproc_down vp9_mbpost_proc_down_mmx - -#undef vp9_postproc_downacross -#define vp9_postproc_downacross vp9_post_proc_down_and_across_mmx - -#undef vp9_postproc_addnoise -#define vp9_postproc_addnoise vp9_plane_add_noise_mmx - -#endif -#endif - - -#if HAVE_SSE2 -extern prototype_postproc_inplace(vp9_mbpost_proc_down_xmm); -extern prototype_postproc_inplace(vp9_mbpost_proc_across_ip_xmm); -extern prototype_postproc(vp9_post_proc_down_and_across_xmm); -extern prototype_postproc_addnoise(vp9_plane_add_noise_wmt); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_postproc_down -#define vp9_postproc_down vp9_mbpost_proc_down_xmm - -#undef vp9_postproc_across -#define vp9_postproc_across vp9_mbpost_proc_across_ip_xmm - -#undef vp9_postproc_downacross -#define vp9_postproc_downacross vp9_post_proc_down_and_across_xmm - -#undef vp9_postproc_addnoise -#define vp9_postproc_addnoise vp9_plane_add_noise_wmt - - -#endif -#endif - -#ifdef __cplusplus -} // extern "C" -#endif - 
-#endif // VP9_COMMON_X86_VP9_POSTPROC_X86_H_ diff --git a/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c b/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c index b84db970e..d109e136a 100644 --- a/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c +++ b/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c @@ -111,21 +111,21 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, // filter the source buffer srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); - srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); // add and saturate the results together srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); // filter the source buffer - srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); + srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters); + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); // add and saturate the results together @@ -146,21 +146,21 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, // filter the source buffer srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); - srcRegFilt32b2 = 
_mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); // add and saturate the results together srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); // filter the source buffer - srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt4Reg); + srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg); srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg); // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters); + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); // add and saturate the results together @@ -208,26 +208,26 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); srcRegFilt2 = _mm_shuffle_epi8(srcReg1, - _mm256_castsi256_si128(filt2Reg)); + _mm256_castsi256_si128(filt4Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, - _mm256_castsi256_si128(secondFilters)); + _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); // filter the source buffer srcRegFilt3= _mm_shuffle_epi8(srcReg1, - _mm256_castsi256_si128(filt4Reg)); + _mm256_castsi256_si128(filt2Reg)); srcRegFilt2= _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, - _mm256_castsi256_si128(forthFilters)); + _mm256_castsi256_si128(secondFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); @@ -247,26 +247,26 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, 
srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg)); srcRegFilt2 = _mm_shuffle_epi8(srcReg2, - _mm256_castsi256_si128(filt2Reg)); + _mm256_castsi256_si128(filt4Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, - _mm256_castsi256_si128(secondFilters)); + _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); // filter the source buffer srcRegFilt3 = _mm_shuffle_epi8(srcReg2, - _mm256_castsi256_si128(filt4Reg)); + _mm256_castsi256_si128(filt2Reg)); srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, - _mm256_castsi256_si128(forthFilters)); + _mm256_castsi256_si128(secondFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); diff --git a/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c index cf28d8d2b..c4efa6565 100644 --- a/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c +++ b/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c @@ -44,7 +44,7 @@ void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, unsigned int output_pitch, unsigned int output_height, int16_t *filter) { - __m128i firstFilters, secondFilters, thirdFilters, forthFilters; + __m128i firstFilters, secondFilters, shuffle1, shuffle2; __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; __m128i addFilterReg64, filtersReg, srcReg, minReg; unsigned int i; @@ -61,20 +61,22 @@ void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, // duplicate only the third 16 bit in the filter into the first lane secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); // duplicate only the seconds 16 
bits in the filter into the second lane + // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3 firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); // duplicate only the forth 16 bits in the filter into the second lane + // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7 secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); // loading the local filters - thirdFilters =_mm_load_si128((__m128i const *)filt1_4_h8); - forthFilters = _mm_load_si128((__m128i const *)filt2_4_h8); + shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8); + shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); for (i = 0; i < output_height; i++) { srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); // filter the source buffer - srcRegFilt1= _mm_shuffle_epi8(srcReg, thirdFilters); - srcRegFilt2= _mm_shuffle_epi8(srcReg, forthFilters); + srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1); + srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); @@ -164,12 +166,12 @@ void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); // add and saturate all the results together - minReg = _mm_min_epi16(srcRegFilt4, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); + minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - srcRegFilt4= _mm_max_epi16(srcRegFilt4, srcRegFilt3); + srcRegFilt2= _mm_max_epi16(srcRegFilt2, srcRegFilt3); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); // shift by 7 bit each 16 bits @@ -229,21 +231,21 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, // filter the source buffer srcRegFilt1_1= 
_mm_shuffle_epi8(srcReg1, filt1Reg); - srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt2Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); // filter the source buffer - srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt4Reg); + srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg); srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg); // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); // add and saturate the results together @@ -260,21 +262,21 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, // filter the source buffer srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg); - srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt2Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters); // add and saturate the results together srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); // filter the source buffer - srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt4Reg); + srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt2Reg); srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg); // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); srcRegFilt2 = 
_mm_maddubs_epi16(srcRegFilt2, thirdFilters); // add and saturate the results together diff --git a/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm index 634fa7746..fd781d4bc 100644 --- a/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm +++ b/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm @@ -272,22 +272,23 @@ punpcklbw xmm2, xmm3 ;C D punpcklbw xmm4, xmm5 ;E F - movq xmm6, [rsi + rbx + 8] ;G movq xmm7, [rax + rbx + 8] ;H punpcklbw xmm6, xmm7 ;G H - pmaddubsw xmm0, k0k1 pmaddubsw xmm2, k2k3 pmaddubsw xmm4, k4k5 pmaddubsw xmm6, k6k7 paddsw xmm0, xmm6 - paddsw xmm0, xmm2 + movdqa xmm1, xmm2 + pmaxsw xmm2, xmm4 + pminsw xmm4, xmm1 paddsw xmm0, xmm4 - paddsw xmm0, krd + paddsw xmm0, xmm2 + paddsw xmm0, krd psraw xmm0, 7 packuswb xmm0, xmm0 diff --git a/libvpx/vp9/decoder/vp9_decodeframe.c b/libvpx/vp9/decoder/vp9_decodeframe.c index 9b63961f0..07971687c 100644 --- a/libvpx/vp9/decoder/vp9_decodeframe.c +++ b/libvpx/vp9/decoder/vp9_decodeframe.c @@ -28,6 +28,7 @@ #include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_seg_common.h" +#include "vp9/common/vp9_thread.h" #include "vp9/common/vp9_tile_common.h" #include "vp9/decoder/vp9_decodeframe.h" @@ -38,7 +39,8 @@ #include "vp9/decoder/vp9_dthread.h" #include "vp9/decoder/vp9_read_bit_buffer.h" #include "vp9/decoder/vp9_reader.h" -#include "vp9/decoder/vp9_thread.h" + +#define MAX_VP9_HEADER_SIZE 80 static int is_compound_reference_allowed(const VP9_COMMON *cm) { int i; @@ -192,31 +194,33 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block, int eob) { struct macroblockd_plane *const pd = &xd->plane[plane]; if (eob > 0) { - TX_TYPE tx_type; - const PLANE_TYPE plane_type = pd->plane_type; + TX_TYPE tx_type = DCT_DCT; int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - switch (tx_size) { - case TX_4X4: - tx_type = get_tx_type_4x4(plane_type, xd, block); - if (tx_type == DCT_DCT) - 
xd->itxm_add(dqcoeff, dst, stride, eob); - else - vp9_iht4x4_16_add(dqcoeff, dst, stride, tx_type); - break; - case TX_8X8: - tx_type = get_tx_type(plane_type, xd); - vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob); - break; - case TX_16X16: - tx_type = get_tx_type(plane_type, xd); - vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob); - break; - case TX_32X32: - tx_type = DCT_DCT; - vp9_idct32x32_add(dqcoeff, dst, stride, eob); - break; - default: - assert(0 && "Invalid transform size"); + if (xd->lossless) { + tx_type = DCT_DCT; + vp9_iwht4x4_add(dqcoeff, dst, stride, eob); + } else { + const PLANE_TYPE plane_type = pd->plane_type; + switch (tx_size) { + case TX_4X4: + tx_type = get_tx_type_4x4(plane_type, xd, block); + vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob); + break; + case TX_8X8: + tx_type = get_tx_type(plane_type, xd); + vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob); + break; + case TX_16X16: + tx_type = get_tx_type(plane_type, xd); + vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob); + break; + case TX_32X32: + tx_type = DCT_DCT; + vp9_idct32x32_add(dqcoeff, dst, stride, eob); + break; + default: + assert(0 && "Invalid transform size"); + } } if (eob == 1) { @@ -246,8 +250,8 @@ static void predict_and_reconstruct_intra_block(int plane, int block, MACROBLOCKD *const xd = args->xd; struct macroblockd_plane *const pd = &xd->plane[plane]; MODE_INFO *const mi = xd->mi[0]; - const MB_PREDICTION_MODE mode = (plane == 0) ? get_y_mode(mi, block) - : mi->mbmi.uv_mode; + const PREDICTION_MODE mode = (plane == 0) ? 
get_y_mode(mi, block) + : mi->mbmi.uv_mode; int x, y; uint8_t *dst; txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y); @@ -314,7 +318,7 @@ static MB_MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd, // as they are always compared to values that are in 1/8th pel units set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols); - vp9_setup_dst_planes(xd, get_frame_new_buffer(cm), mi_row, mi_col); + vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col); return &xd->mi[0]->mbmi; } @@ -406,13 +410,17 @@ static void decode_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd, vp9_reader* r, BLOCK_SIZE bsize) { const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2; PARTITION_TYPE partition; - BLOCK_SIZE subsize; + BLOCK_SIZE subsize, uv_subsize; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r); subsize = get_subsize(bsize, partition); + uv_subsize = ss_size_lookup[subsize][cm->subsampling_x][cm->subsampling_y]; + if (subsize >= BLOCK_8X8 && uv_subsize == BLOCK_INVALID) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Invalid block size."); if (subsize < BLOCK_8X8) { decode_block(cm, xd, tile, mi_row, mi_col, r, subsize); } else { @@ -451,7 +459,9 @@ static void setup_token_decoder(const uint8_t *data, const uint8_t *data_end, size_t read_size, struct vpx_internal_error_info *error_info, - vp9_reader *r) { + vp9_reader *r, + vpx_decrypt_cb decrypt_cb, + void *decrypt_state) { // Validate the calculated partition length. If the buffer // described by the partition can't be fully read, then restrict // it to the portion that can be (for EC mode) or throw an error. 
@@ -459,7 +469,7 @@ static void setup_token_decoder(const uint8_t *data, vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME, "Truncated packet or corrupt tile length"); - if (vp9_reader_init(r, data, read_size)) + if (vp9_reader_init(r, data, read_size, decrypt_cb, decrypt_state)) vpx_internal_error(error_info, VPX_CODEC_MEM_ERROR, "Failed to allocate bool decoder %d", 1); } @@ -584,8 +594,6 @@ static void setup_quantization(VP9_COMMON *const cm, MACROBLOCKD *const xd, cm->y_dc_delta_q == 0 && cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0; - - xd->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; } static INTERP_FILTER read_interp_filter(struct vp9_read_bit_buffer *rb) { @@ -597,8 +605,8 @@ static INTERP_FILTER read_interp_filter(struct vp9_read_bit_buffer *rb) { : literal_to_filter[vp9_rb_read_literal(rb, 2)]; } -static void read_frame_size(struct vp9_read_bit_buffer *rb, - int *width, int *height) { +void vp9_read_frame_size(struct vp9_read_bit_buffer *rb, + int *width, int *height) { const int w = vp9_rb_read_literal(rb, 16) + 1; const int h = vp9_rb_read_literal(rb, 16) + 1; *width = w; @@ -609,25 +617,40 @@ static void setup_display_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { cm->display_width = cm->width; cm->display_height = cm->height; if (vp9_rb_read_bit(rb)) - read_frame_size(rb, &cm->display_width, &cm->display_height); + vp9_read_frame_size(rb, &cm->display_width, &cm->display_height); } -static void apply_frame_size(VP9_COMMON *cm, int width, int height) { +static void resize_context_buffers(VP9_COMMON *cm, int width, int height) { +#if CONFIG_SIZE_LIMIT + if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Width and height beyond allowed size."); +#endif if (cm->width != width || cm->height != height) { - // Change in frame size. - // TODO(agrange) Don't test width/height, check overall size. 
- if (width > cm->width || height > cm->height) { - // Rescale frame buffers only if they're not big enough already. - if (vp9_resize_frame_buffers(cm, width, height)) + const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2); + const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2); + + // Change in frame size (assumption: color format does not change). + if (cm->width == 0 || cm->height == 0 || + aligned_width > cm->width || + aligned_width * aligned_height > cm->width * cm->height) { + if (vp9_alloc_context_buffers(cm, width, height)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffers"); + } else { + vp9_set_mb_mi(cm, width, height); } - + vp9_init_context_buffers(cm); cm->width = width; cm->height = height; - - vp9_update_frame_size(cm); } +} + +static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { + int width, height; + vp9_read_frame_size(rb, &width, &height); + resize_context_buffers(cm, width, height); + setup_display_size(cm, rb); if (vp9_realloc_frame_buffer( get_frame_new_buffer(cm), cm->width, cm->height, @@ -639,17 +662,11 @@ static void apply_frame_size(VP9_COMMON *cm, int width, int height) { } } -static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { - int width, height; - read_frame_size(rb, &width, &height); - apply_frame_size(cm, width, height); - setup_display_size(cm, rb); -} - static void setup_frame_size_with_refs(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { int width, height; int found = 0, i; + int has_valid_ref_frame = 0; for (i = 0; i < REFS_PER_FRAME; ++i) { if (vp9_rb_read_bit(rb)) { YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf; @@ -661,71 +678,34 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm, } if (!found) - read_frame_size(rb, &width, &height); + vp9_read_frame_size(rb, &width, &height); - if (width <= 0 || height <= 0) + if (width <=0 || height <= 0) vpx_internal_error(&cm->error, 
VPX_CODEC_CORRUPT_FRAME, - "Referenced frame with invalid size"); - - apply_frame_size(cm, width, height); - setup_display_size(cm, rb); -} - -static void decode_tile(VP9Decoder *pbi, const TileInfo *const tile, - vp9_reader *r) { - const int num_threads = pbi->oxcf.max_threads; - VP9_COMMON *const cm = &pbi->common; - int mi_row, mi_col; - MACROBLOCKD *xd = &pbi->mb; - - if (pbi->do_loopfilter_inline) { - LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; - lf_data->frame_buffer = get_frame_new_buffer(cm); - lf_data->cm = cm; - lf_data->xd = pbi->mb; - lf_data->stop = 0; - lf_data->y_only = 0; - vp9_loop_filter_frame_init(cm, cm->lf.filter_level); - } - - for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; - mi_row += MI_BLOCK_SIZE) { - // For a SB there are 2 left contexts, each pertaining to a MB row within - vp9_zero(xd->left_context); - vp9_zero(xd->left_seg_context); - for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; - mi_col += MI_BLOCK_SIZE) { - decode_partition(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64); - } - - if (pbi->do_loopfilter_inline) { - const int lf_start = mi_row - MI_BLOCK_SIZE; - LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; - - // delay the loopfilter by 1 macroblock row. - if (lf_start < 0) continue; + "Invalid frame size"); - // decoding has completed: finish up the loop filter in this thread. - if (mi_row + MI_BLOCK_SIZE >= tile->mi_row_end) continue; - - vp9_worker_sync(&pbi->lf_worker); - lf_data->start = lf_start; - lf_data->stop = mi_row; - if (num_threads > 1) { - vp9_worker_launch(&pbi->lf_worker); - } else { - vp9_worker_execute(&pbi->lf_worker); - } - } + // Check to make sure at least one of frames that this frame references + // has valid dimensions. 
+ for (i = 0; i < REFS_PER_FRAME; ++i) { + RefBuffer *const ref_frame = &cm->frame_refs[i]; + has_valid_ref_frame |= valid_ref_frame_size(ref_frame->buf->y_crop_width, + ref_frame->buf->y_crop_height, + width, height); } + if (!has_valid_ref_frame) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Referenced frame has invalid size"); - if (pbi->do_loopfilter_inline) { - LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; + resize_context_buffers(cm, width, height); + setup_display_size(cm, rb); - vp9_worker_sync(&pbi->lf_worker); - lf_data->start = lf_data->stop; - lf_data->stop = cm->mi_rows; - vp9_worker_execute(&pbi->lf_worker); + if (vp9_realloc_frame_buffer( + get_frame_new_buffer(cm), cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, VP9_DEC_BORDER_IN_PIXELS, + &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb, + cm->cb_priv)) { + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); } } @@ -739,18 +719,30 @@ static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { while (max_ones-- && vp9_rb_read_bit(rb)) cm->log2_tile_cols++; + if (cm->log2_tile_cols > 6) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Invalid number of tile columns"); + // rows cm->log2_tile_rows = vp9_rb_read_bit(rb); if (cm->log2_tile_rows) cm->log2_tile_rows += vp9_rb_read_bit(rb); } +typedef struct TileBuffer { + const uint8_t *data; + size_t size; + int col; // only used with multi-threaded decoding +} TileBuffer; + // Reads the next tile returning its size and adjusting '*data' accordingly // based on 'is_last'. 
-static size_t get_tile(const uint8_t *const data_end, - int is_last, - struct vpx_internal_error_info *error_info, - const uint8_t **data) { +static void get_tile_buffer(const uint8_t *const data_end, + int is_last, + struct vpx_internal_error_info *error_info, + const uint8_t **data, + vpx_decrypt_cb decrypt_cb, void *decrypt_state, + TileBuffer *buf) { size_t size; if (!is_last) { @@ -758,7 +750,13 @@ static size_t get_tile(const uint8_t *const data_end, vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME, "Truncated packet or corrupt tile length"); - size = mem_get_be32(*data); + if (decrypt_cb) { + uint8_t be_data[4]; + decrypt_cb(decrypt_state, *data, be_data, 4); + size = mem_get_be32(be_data); + } else { + size = mem_get_be32(*data); + } *data += 4; if (size > (size_t)(data_end - *data)) @@ -767,26 +765,62 @@ static size_t get_tile(const uint8_t *const data_end, } else { size = data_end - *data; } - return size; + + buf->data = *data; + buf->size = size; + + *data += size; } -typedef struct TileBuffer { - const uint8_t *data; - size_t size; - int col; // only used with multi-threaded decoding -} TileBuffer; +static void get_tile_buffers(VP9Decoder *pbi, + const uint8_t *data, const uint8_t *data_end, + int tile_cols, int tile_rows, + TileBuffer (*tile_buffers)[1 << 6]) { + int r, c; + + for (r = 0; r < tile_rows; ++r) { + for (c = 0; c < tile_cols; ++c) { + const int is_last = (r == tile_rows - 1) && (c == tile_cols - 1); + TileBuffer *const buf = &tile_buffers[r][c]; + buf->col = c; + get_tile_buffer(data_end, is_last, &pbi->common.error, &data, + pbi->decrypt_cb, pbi->decrypt_state, buf); + } + } +} static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, const uint8_t *data_end) { VP9_COMMON *const cm = &pbi->common; + const VP9WorkerInterface *const winterface = vp9_get_worker_interface(); const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols); const int tile_cols = 1 << cm->log2_tile_cols; const int tile_rows = 1 << 
cm->log2_tile_rows; TileBuffer tile_buffers[4][1 << 6]; int tile_row, tile_col; - const uint8_t *end = NULL; - vp9_reader r; + int mi_row, mi_col; + TileData *tile_data = NULL; + + if (cm->lf.filter_level && pbi->lf_worker.data1 == NULL) { + CHECK_MEM_ERROR(cm, pbi->lf_worker.data1, + vpx_memalign(32, sizeof(LFWorkerData))); + pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker; + if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Loop filter thread creation failed"); + } + } + + if (cm->lf.filter_level) { + LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; + lf_data->frame_buffer = get_frame_new_buffer(cm); + lf_data->cm = cm; + vp9_copy(lf_data->planes, pbi->mb.plane); + lf_data->stop = 0; + lf_data->y_only = 0; + vp9_loop_filter_frame_init(cm, cm->lf.filter_level); + } assert(tile_rows <= 4); assert(tile_cols <= (1 << 6)); @@ -799,39 +833,91 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, vpx_memset(cm->above_seg_context, 0, sizeof(*cm->above_seg_context) * aligned_cols); - // Load tile data into tile_buffers - for (tile_row = 0; tile_row < tile_rows; ++tile_row) { - for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - const int last_tile = tile_row == tile_rows - 1 && - tile_col == tile_cols - 1; - const size_t size = get_tile(data_end, last_tile, &cm->error, &data); - TileBuffer *const buf = &tile_buffers[tile_row][tile_col]; - buf->data = data; - buf->size = size; - data += size; - } + get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers); + + if (pbi->tile_data == NULL || + (tile_cols * tile_rows) != pbi->total_tiles) { + vpx_free(pbi->tile_data); + CHECK_MEM_ERROR( + cm, + pbi->tile_data, + vpx_memalign(32, tile_cols * tile_rows * (sizeof(*pbi->tile_data)))); + pbi->total_tiles = tile_rows * tile_cols; } - // Decode tiles using data from tile_buffers + // Load all tile information into tile_data. 
for (tile_row = 0; tile_row < tile_rows; ++tile_row) { for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - const int col = pbi->oxcf.inv_tile_order ? tile_cols - tile_col - 1 - : tile_col; - const int last_tile = tile_row == tile_rows - 1 && - col == tile_cols - 1; - const TileBuffer *const buf = &tile_buffers[tile_row][col]; TileInfo tile; + const TileBuffer *const buf = &tile_buffers[tile_row][tile_col]; + tile_data = pbi->tile_data + tile_cols * tile_row + tile_col; + tile_data->cm = cm; + tile_data->xd = pbi->mb; + tile_data->xd.corrupted = 0; + vp9_tile_init(&tile, tile_data->cm, tile_row, tile_col); + setup_token_decoder(buf->data, data_end, buf->size, &cm->error, + &tile_data->bit_reader, pbi->decrypt_cb, + pbi->decrypt_state); + init_macroblockd(cm, &tile_data->xd); + vp9_zero(tile_data->xd.dqcoeff); + } + } - vp9_tile_init(&tile, cm, tile_row, col); - setup_token_decoder(buf->data, data_end, buf->size, &cm->error, &r); - decode_tile(pbi, &tile, &r); - - if (last_tile) - end = vp9_reader_find_end(&r); + for (tile_row = 0; tile_row < tile_rows; ++tile_row) { + TileInfo tile; + vp9_tile_set_row(&tile, cm, tile_row); + for (mi_row = tile.mi_row_start; mi_row < tile.mi_row_end; + mi_row += MI_BLOCK_SIZE) { + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + const int col = pbi->inv_tile_order ? + tile_cols - tile_col - 1 : tile_col; + tile_data = pbi->tile_data + tile_cols * tile_row + col; + vp9_tile_set_col(&tile, tile_data->cm, col); + vp9_zero(tile_data->xd.left_context); + vp9_zero(tile_data->xd.left_seg_context); + for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end; + mi_col += MI_BLOCK_SIZE) { + decode_partition(tile_data->cm, &tile_data->xd, &tile, mi_row, mi_col, + &tile_data->bit_reader, BLOCK_64X64); + } + pbi->mb.corrupted |= tile_data->xd.corrupted; + } + // Loopfilter one row. 
+ if (cm->lf.filter_level) { + const int lf_start = mi_row - MI_BLOCK_SIZE; + LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; + + // delay the loopfilter by 1 macroblock row. + if (lf_start < 0) continue; + + // decoding has completed: finish up the loop filter in this thread. + if (mi_row + MI_BLOCK_SIZE >= cm->mi_rows) continue; + + winterface->sync(&pbi->lf_worker); + lf_data->start = lf_start; + lf_data->stop = mi_row; + if (pbi->max_threads > 1) { + winterface->launch(&pbi->lf_worker); + } else { + winterface->execute(&pbi->lf_worker); + } + } } } - return end; + // Loopfilter remaining rows in the frame. + if (cm->lf.filter_level) { + LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; + winterface->sync(&pbi->lf_worker); + lf_data->start = lf_data->stop; + lf_data->stop = cm->mi_rows; + winterface->execute(&pbi->lf_worker); + } + + // Get last tile data. + tile_data = pbi->tile_data + tile_cols * tile_rows - 1; + + return vp9_reader_find_end(&tile_data->bit_reader); } static int tile_worker_hook(void *arg1, void *arg2) { @@ -869,12 +955,13 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data, const uint8_t *data_end) { VP9_COMMON *const cm = &pbi->common; + const VP9WorkerInterface *const winterface = vp9_get_worker_interface(); const uint8_t *bit_reader_end = NULL; const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); const int tile_cols = 1 << cm->log2_tile_cols; const int tile_rows = 1 << cm->log2_tile_rows; - const int num_workers = MIN(pbi->oxcf.max_threads & ~1, tile_cols); - TileBuffer tile_buffers[1 << 6]; + const int num_workers = MIN(pbi->max_threads & ~1, tile_cols); + TileBuffer tile_buffers[1][1 << 6]; int n; int final_worker = -1; @@ -885,7 +972,7 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, // TODO(jzern): See if we can remove the restriction of passing in max // threads to the decoder. 
if (pbi->num_tile_workers == 0) { - const int num_threads = pbi->oxcf.max_threads & ~1; + const int num_threads = pbi->max_threads & ~1; int i; // TODO(jzern): Allocate one less worker, as in the current code we only // use num_threads - 1 workers. @@ -895,11 +982,11 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, VP9Worker *const worker = &pbi->tile_workers[i]; ++pbi->num_tile_workers; - vp9_worker_init(worker); + winterface->init(worker); CHECK_MEM_ERROR(cm, worker->data1, vpx_memalign(32, sizeof(TileWorkerData))); CHECK_MEM_ERROR(cm, worker->data2, vpx_malloc(sizeof(TileInfo))); - if (i < num_threads - 1 && !vp9_worker_reset(worker)) { + if (i < num_threads - 1 && !winterface->reset(worker)) { vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Tile decoder thread creation failed"); } @@ -919,18 +1006,11 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, sizeof(*cm->above_seg_context) * aligned_mi_cols); // Load tile data into tile_buffers - for (n = 0; n < tile_cols; ++n) { - const size_t size = - get_tile(data_end, n == tile_cols - 1, &cm->error, &data); - TileBuffer *const buf = &tile_buffers[n]; - buf->data = data; - buf->size = size; - buf->col = n; - data += size; - } + get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers); // Sort the buffers based on size in descending order. - qsort(tile_buffers, tile_cols, sizeof(tile_buffers[0]), compare_tile_buffers); + qsort(tile_buffers[0], tile_cols, sizeof(tile_buffers[0][0]), + compare_tile_buffers); // Rearrange the tile buffers such that per-tile group the largest, and // presumably the most difficult, tile will be decoded in the main thread. 
@@ -939,11 +1019,11 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, { int group_start = 0; while (group_start < tile_cols) { - const TileBuffer largest = tile_buffers[group_start]; + const TileBuffer largest = tile_buffers[0][group_start]; const int group_end = MIN(group_start + num_workers, tile_cols) - 1; - memmove(tile_buffers + group_start, tile_buffers + group_start + 1, - (group_end - group_start) * sizeof(tile_buffers[0])); - tile_buffers[group_end] = largest; + memmove(tile_buffers[0] + group_start, tile_buffers[0] + group_start + 1, + (group_end - group_start) * sizeof(tile_buffers[0][0])); + tile_buffers[0][group_end] = largest; group_start = group_end + 1; } } @@ -955,22 +1035,23 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, VP9Worker *const worker = &pbi->tile_workers[i]; TileWorkerData *const tile_data = (TileWorkerData*)worker->data1; TileInfo *const tile = (TileInfo*)worker->data2; - TileBuffer *const buf = &tile_buffers[n]; + TileBuffer *const buf = &tile_buffers[0][n]; tile_data->cm = cm; tile_data->xd = pbi->mb; tile_data->xd.corrupted = 0; vp9_tile_init(tile, tile_data->cm, 0, buf->col); setup_token_decoder(buf->data, data_end, buf->size, &cm->error, - &tile_data->bit_reader); + &tile_data->bit_reader, pbi->decrypt_cb, + pbi->decrypt_state); init_macroblockd(cm, &tile_data->xd); vp9_zero(tile_data->xd.dqcoeff); worker->had_error = 0; if (i == num_workers - 1 || n == tile_cols - 1) { - vp9_worker_execute(worker); + winterface->execute(worker); } else { - vp9_worker_launch(worker); + winterface->launch(worker); } if (buf->col == tile_cols - 1) { @@ -982,7 +1063,7 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, for (; i > 0; --i) { VP9Worker *const worker = &pbi->tile_workers[i - 1]; - pbi->mb.corrupted |= !vp9_worker_sync(worker); + pbi->mb.corrupted |= !winterface->sync(worker); } if (final_worker > -1) { TileWorkerData *const tile_data = @@ -995,26 +1076,59 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, 
return bit_reader_end; } -static void check_sync_code(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { - if (vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_0 || - vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_1 || - vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_2) { - vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, - "Invalid frame sync code"); - } -} - static void error_handler(void *data) { VP9_COMMON *const cm = (VP9_COMMON *)data; vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet"); } -static BITSTREAM_PROFILE read_profile(struct vp9_read_bit_buffer *rb) { +int vp9_read_sync_code(struct vp9_read_bit_buffer *const rb) { + return vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_0 && + vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_1 && + vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_2; +} + +BITSTREAM_PROFILE vp9_read_profile(struct vp9_read_bit_buffer *rb) { int profile = vp9_rb_read_bit(rb); profile |= vp9_rb_read_bit(rb) << 1; + if (profile > 2) + profile += vp9_rb_read_bit(rb); return (BITSTREAM_PROFILE) profile; } +static void read_bitdepth_colorspace_sampling( + VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { + if (cm->profile >= PROFILE_2) + cm->bit_depth = vp9_rb_read_bit(rb) ? 
BITS_12 : BITS_10; + cm->color_space = (COLOR_SPACE)vp9_rb_read_literal(rb, 3); + if (cm->color_space != SRGB) { + vp9_rb_read_bit(rb); // [16,235] (including xvycc) vs [0,255] range + if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) { + cm->subsampling_x = vp9_rb_read_bit(rb); + cm->subsampling_y = vp9_rb_read_bit(rb); + if (cm->subsampling_x == 1 && cm->subsampling_y == 1) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "4:2:0 color not supported in profile 1 or 3"); + if (vp9_rb_read_bit(rb)) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Reserved bit set"); + } else { + cm->subsampling_y = cm->subsampling_x = 1; + } + } else { + if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) { + // Note if colorspace is SRGB then 4:4:4 chroma sampling is assumed. + // 4:2:2 or 4:4:0 chroma sampling is not allowed. + cm->subsampling_y = cm->subsampling_x = 0; + if (vp9_rb_read_bit(rb)) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Reserved bit set"); + } else { + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "4:4:4 color not supported in profile 0 or 2"); + } + } +} + static size_t read_uncompressed_header(VP9Decoder *pbi, struct vp9_read_bit_buffer *rb) { VP9_COMMON *const cm = &pbi->common; @@ -1027,7 +1141,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, "Invalid frame marker"); - cm->profile = read_profile(rb); + cm->profile = vp9_read_profile(rb); if (cm->profile >= MAX_PROFILES) vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, "Unsupported bitstream profile"); @@ -1037,7 +1151,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, // Show an existing frame directly. 
const int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)]; - if (cm->frame_bufs[frame_to_show].ref_count < 1) + if (frame_to_show < 0 || cm->frame_bufs[frame_to_show].ref_count < 1) vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, "Buffer %d does not contain a decoded frame", frame_to_show); @@ -1054,34 +1168,16 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, cm->error_resilient_mode = vp9_rb_read_bit(rb); if (cm->frame_type == KEY_FRAME) { - check_sync_code(cm, rb); - if (cm->profile > PROFILE_1) - cm->bit_depth = vp9_rb_read_bit(rb) ? BITS_12 : BITS_10; - cm->color_space = (COLOR_SPACE)vp9_rb_read_literal(rb, 3); - if (cm->color_space != SRGB) { - vp9_rb_read_bit(rb); // [16,235] (including xvycc) vs [0,255] range - if (cm->profile >= PROFILE_1) { - cm->subsampling_x = vp9_rb_read_bit(rb); - cm->subsampling_y = vp9_rb_read_bit(rb); - vp9_rb_read_bit(rb); // has extra plane - } else { - cm->subsampling_y = cm->subsampling_x = 1; - } - } else { - if (cm->profile >= PROFILE_1) { - cm->subsampling_y = cm->subsampling_x = 0; - vp9_rb_read_bit(rb); // has extra plane - } else { - vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, - "RGB not supported in profile 0"); - } - } + if (!vp9_read_sync_code(rb)) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Invalid frame sync code"); + read_bitdepth_colorspace_sampling(cm, rb); pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1; for (i = 0; i < REFS_PER_FRAME; ++i) { - cm->frame_refs[i].idx = cm->new_fb_idx; - cm->frame_refs[i].buf = get_frame_new_buffer(cm); + cm->frame_refs[i].idx = -1; + cm->frame_refs[i].buf = NULL; } setup_frame_size(cm, rb); @@ -1092,18 +1188,30 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, 0 : vp9_rb_read_literal(rb, 2); if (cm->intra_only) { - check_sync_code(cm, rb); + if (!vp9_read_sync_code(rb)) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Invalid frame sync code"); + if (cm->profile > PROFILE_0) { + 
read_bitdepth_colorspace_sampling(cm, rb); + } else { + // NOTE: The intra-only frame header does not include the specification + // of either the color format or color sub-sampling in profile 0. VP9 + // specifies that the default color space should be YUV 4:2:0 in this + // case (normative). + cm->color_space = BT_601; + cm->subsampling_y = cm->subsampling_x = 1; + } pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES); setup_frame_size(cm, rb); } else { pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES); - for (i = 0; i < REFS_PER_FRAME; ++i) { const int ref = vp9_rb_read_literal(rb, REF_FRAMES_LOG2); const int idx = cm->ref_frame_map[ref]; - cm->frame_refs[i].idx = idx; - cm->frame_refs[i].buf = &cm->frame_bufs[idx].buf; + RefBuffer *const ref_frame = &cm->frame_refs[i]; + ref_frame->idx = idx; + ref_frame->buf = &cm->frame_bufs[idx].buf; cm->ref_frame_sign_bias[LAST_FRAME + i] = vp9_rb_read_bit(rb); } @@ -1125,11 +1233,9 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, } if (!cm->error_resilient_mode) { - cm->coding_use_prev_mi = 1; cm->refresh_frame_context = vp9_rb_read_bit(rb); cm->frame_parallel_decoding_mode = vp9_rb_read_bit(rb); } else { - cm->coding_use_prev_mi = 0; cm->refresh_frame_context = 0; cm->frame_parallel_decoding_mode = 1; } @@ -1163,7 +1269,8 @@ static int read_compressed_header(VP9Decoder *pbi, const uint8_t *data, vp9_reader r; int k; - if (vp9_reader_init(&r, data, partition_size)) + if (vp9_reader_init(&r, data, partition_size, pbi->decrypt_cb, + pbi->decrypt_state)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate bool decoder 0"); @@ -1255,48 +1362,59 @@ static void debug_check_frame_counts(const VP9_COMMON *const cm) { } #endif // NDEBUG -int vp9_decode_frame(VP9Decoder *pbi, - const uint8_t *data, const uint8_t *data_end, - const uint8_t **p_data_end) { +static struct vp9_read_bit_buffer* init_read_bit_buffer( + VP9Decoder *pbi, + struct vp9_read_bit_buffer *rb, + const uint8_t 
*data, + const uint8_t *data_end, + uint8_t *clear_data /* buffer size MAX_VP9_HEADER_SIZE */) { + rb->bit_offset = 0; + rb->error_handler = error_handler; + rb->error_handler_data = &pbi->common; + if (pbi->decrypt_cb) { + const int n = (int)MIN(MAX_VP9_HEADER_SIZE, data_end - data); + pbi->decrypt_cb(pbi->decrypt_state, data, clear_data, n); + rb->bit_buffer = clear_data; + rb->bit_buffer_end = clear_data + n; + } else { + rb->bit_buffer = data; + rb->bit_buffer_end = data_end; + } + return rb; +} + +void vp9_decode_frame(VP9Decoder *pbi, + const uint8_t *data, const uint8_t *data_end, + const uint8_t **p_data_end) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; + struct vp9_read_bit_buffer rb = { NULL, NULL, 0, NULL, 0}; - struct vp9_read_bit_buffer rb = { data, data_end, 0, cm, error_handler }; - const size_t first_partition_size = read_uncompressed_header(pbi, &rb); - const int keyframe = cm->frame_type == KEY_FRAME; + uint8_t clear_data[MAX_VP9_HEADER_SIZE]; + const size_t first_partition_size = read_uncompressed_header(pbi, + init_read_bit_buffer(pbi, &rb, data, data_end, clear_data)); const int tile_rows = 1 << cm->log2_tile_rows; const int tile_cols = 1 << cm->log2_tile_cols; YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm); xd->cur_buf = new_fb; if (!first_partition_size) { - // showing a frame directly - *p_data_end = data + 1; - return 0; + // showing a frame directly + *p_data_end = data + 1; + return; } - if (!pbi->decoded_key_frame && !keyframe) - return -1; - data += vp9_rb_bytes_read(&rb); if (!read_is_valid(data, first_partition_size, data_end)) vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet or corrupt header length"); - pbi->do_loopfilter_inline = - (cm->log2_tile_rows | cm->log2_tile_cols) == 0 && cm->lf.filter_level; - if (pbi->do_loopfilter_inline && pbi->lf_worker.data1 == NULL) { - CHECK_MEM_ERROR(cm, pbi->lf_worker.data1, - vpx_memalign(32, sizeof(LFWorkerData))); - 
pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker; - if (pbi->oxcf.max_threads > 1 && !vp9_worker_reset(&pbi->lf_worker)) { - vpx_internal_error(&cm->error, VPX_CODEC_ERROR, - "Loop filter thread creation failed"); - } - } - init_macroblockd(cm, &pbi->mb); - cm->prev_mi = get_prev_mi(cm); + + if (!cm->error_resilient_mode) + set_prev_mi(cm); + else + cm->prev_mi = NULL; setup_plane_dequants(cm, xd, cm->base_qindex); vp9_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y); @@ -1310,36 +1428,34 @@ int vp9_decode_frame(VP9Decoder *pbi, // TODO(jzern): remove frame_parallel_decoding_mode restriction for // single-frame tile decoding. - if (pbi->oxcf.max_threads > 1 && tile_rows == 1 && tile_cols > 1 && + if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1 && cm->frame_parallel_decoding_mode) { *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end); + // If multiple threads are used to decode tiles, then we use those threads + // to do parallel loopfiltering. 
+ vp9_loop_filter_frame_mt(new_fb, pbi, cm, cm->lf.filter_level, 0); } else { *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end); } new_fb->corrupted |= xd->corrupted; - if (!pbi->decoded_key_frame) { - if (keyframe && !new_fb->corrupted) - pbi->decoded_key_frame = 1; - else - vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, - "A stream must start with a complete key frame"); - } - - if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) { - vp9_adapt_coef_probs(cm); + if (!new_fb->corrupted) { + if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) { + vp9_adapt_coef_probs(cm); - if (!frame_is_intra_only(cm)) { - vp9_adapt_mode_probs(cm); - vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv); + if (!frame_is_intra_only(cm)) { + vp9_adapt_mode_probs(cm); + vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv); + } + } else { + debug_check_frame_counts(cm); } } else { - debug_check_frame_counts(cm); + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Decode failed. 
Frame data is corrupted."); } if (cm->refresh_frame_context) cm->frame_contexts[cm->frame_context_idx] = cm->fc; - - return 0; } diff --git a/libvpx/vp9/decoder/vp9_decodeframe.h b/libvpx/vp9/decoder/vp9_decodeframe.h index 8a19dafc5..10a9e3462 100644 --- a/libvpx/vp9/decoder/vp9_decodeframe.h +++ b/libvpx/vp9/decoder/vp9_decodeframe.h @@ -18,12 +18,18 @@ extern "C" { struct VP9Common; struct VP9Decoder; +struct vp9_read_bit_buffer; void vp9_init_dequantizer(struct VP9Common *cm); -int vp9_decode_frame(struct VP9Decoder *pbi, - const uint8_t *data, const uint8_t *data_end, - const uint8_t **p_data_end); +void vp9_decode_frame(struct VP9Decoder *pbi, + const uint8_t *data, const uint8_t *data_end, + const uint8_t **p_data_end); + +int vp9_read_sync_code(struct vp9_read_bit_buffer *const rb); +void vp9_read_frame_size(struct vp9_read_bit_buffer *rb, + int *width, int *height); +BITSTREAM_PROFILE vp9_read_profile(struct vp9_read_bit_buffer *rb); #ifdef __cplusplus } // extern "C" diff --git a/libvpx/vp9/decoder/vp9_decodemv.c b/libvpx/vp9/decoder/vp9_decodemv.c index 3618f12d0..32e80f93b 100644 --- a/libvpx/vp9/decoder/vp9_decodemv.c +++ b/libvpx/vp9/decoder/vp9_decodemv.c @@ -23,30 +23,29 @@ #include "vp9/decoder/vp9_decodeframe.h" #include "vp9/decoder/vp9_reader.h" -static MB_PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) { - return (MB_PREDICTION_MODE)vp9_read_tree(r, vp9_intra_mode_tree, p); +static PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) { + return (PREDICTION_MODE)vp9_read_tree(r, vp9_intra_mode_tree, p); } -static MB_PREDICTION_MODE read_intra_mode_y(VP9_COMMON *cm, vp9_reader *r, +static PREDICTION_MODE read_intra_mode_y(VP9_COMMON *cm, vp9_reader *r, int size_group) { - const MB_PREDICTION_MODE y_mode = read_intra_mode(r, - cm->fc.y_mode_prob[size_group]); + const PREDICTION_MODE y_mode = + read_intra_mode(r, cm->fc.y_mode_prob[size_group]); if (!cm->frame_parallel_decoding_mode) 
++cm->counts.y_mode[size_group][y_mode]; return y_mode; } -static MB_PREDICTION_MODE read_intra_mode_uv(VP9_COMMON *cm, vp9_reader *r, - MB_PREDICTION_MODE y_mode) { - const MB_PREDICTION_MODE uv_mode = read_intra_mode(r, +static PREDICTION_MODE read_intra_mode_uv(VP9_COMMON *cm, vp9_reader *r, + PREDICTION_MODE y_mode) { + const PREDICTION_MODE uv_mode = read_intra_mode(r, cm->fc.uv_mode_prob[y_mode]); if (!cm->frame_parallel_decoding_mode) ++cm->counts.uv_mode[y_mode][uv_mode]; return uv_mode; } -static MB_PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, vp9_reader *r, - int ctx) { +static PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, vp9_reader *r, int ctx) { const int mode = vp9_read_tree(r, vp9_inter_mode_tree, cm->fc.inter_mode_probs[ctx]); if (!cm->frame_parallel_decoding_mode) @@ -362,7 +361,7 @@ static INLINE int is_mv_valid(const MV *mv) { mv->col > MV_LOW && mv->col < MV_UPP; } -static INLINE int assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode, +static INLINE int assign_mv(VP9_COMMON *cm, PREDICTION_MODE mode, int_mv mv[2], int_mv ref_mv[2], int_mv nearest_mv[2], int_mv near_mv[2], int is_compound, int allow_hp, vp9_reader *r) { @@ -436,6 +435,11 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm, for (ref = 0; ref < 1 + is_compound; ++ref) { const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref]; + const int ref_idx = frame - LAST_FRAME; + if (cm->frame_refs[ref_idx].sf.x_scale_fp == REF_INVALID_SCALE || + cm->frame_refs[ref_idx].sf.y_scale_fp == REF_INVALID_SCALE ) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Reference frame has invalid dimensions"); vp9_find_mv_refs(cm, xd, tile, mi, frame, mbmi->ref_mvs[frame], mi_row, mi_col); } @@ -469,7 +473,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm, const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; // 1 or 2 const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; // 1 or 2 int idx, idy; - MB_PREDICTION_MODE b_mode; + PREDICTION_MODE b_mode; int_mv 
nearest_sub8x8[2], near_sub8x8[2]; for (idy = 0; idy < 2; idy += num_4x4_h) { for (idx = 0; idx < 2; idx += num_4x4_w) { diff --git a/libvpx/vp9/decoder/vp9_decoder.c b/libvpx/vp9/decoder/vp9_decoder.c index fd74478e9..2a2f0f5fa 100644 --- a/libvpx/vp9/decoder/vp9_decoder.c +++ b/libvpx/vp9/decoder/vp9_decoder.c @@ -32,85 +32,16 @@ #include "vp9/decoder/vp9_detokenize.h" #include "vp9/decoder/vp9_dthread.h" -#define WRITE_RECON_BUFFER 0 -#if WRITE_RECON_BUFFER == 1 -static void recon_write_yuv_frame(const char *name, - const YV12_BUFFER_CONFIG *s, - int w, int _h) { - FILE *yuv_file = fopen(name, "ab"); - const uint8_t *src = s->y_buffer; - int h = _h; - - do { - fwrite(src, w, 1, yuv_file); - src += s->y_stride; - } while (--h); - - src = s->u_buffer; - h = (_h + 1) >> 1; - w = (w + 1) >> 1; - - do { - fwrite(src, w, 1, yuv_file); - src += s->uv_stride; - } while (--h); - - src = s->v_buffer; - h = (_h + 1) >> 1; - - do { - fwrite(src, w, 1, yuv_file); - src += s->uv_stride; - } while (--h); - - fclose(yuv_file); -} -#endif -#if WRITE_RECON_BUFFER == 2 -void write_dx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) { - // write the frame - FILE *yframe; - int i; - char filename[255]; - - snprintf(filename, sizeof(filename)-1, "dx\\y%04d.raw", this_frame); - yframe = fopen(filename, "wb"); - - for (i = 0; i < frame->y_height; i++) - fwrite(frame->y_buffer + i * frame->y_stride, - frame->y_width, 1, yframe); - - fclose(yframe); - snprintf(filename, sizeof(filename)-1, "dx\\u%04d.raw", this_frame); - yframe = fopen(filename, "wb"); - - for (i = 0; i < frame->uv_height; i++) - fwrite(frame->u_buffer + i * frame->uv_stride, - frame->uv_width, 1, yframe); - - fclose(yframe); - snprintf(filename, sizeof(filename)-1, "dx\\v%04d.raw", this_frame); - yframe = fopen(filename, "wb"); - - for (i = 0; i < frame->uv_height; i++) - fwrite(frame->v_buffer + i * frame->uv_stride, - frame->uv_width, 1, yframe); - - fclose(yframe); -} -#endif - -void vp9_initialize_dec() { 
+static void initialize_dec() { static int init_done = 0; if (!init_done) { vp9_init_neighbors(); - vp9_init_quant_tables(); init_done = 1; } } -VP9Decoder *vp9_decoder_create(const VP9D_CONFIG *oxcf) { +VP9Decoder *vp9_decoder_create() { VP9Decoder *const pbi = vpx_memalign(32, sizeof(*pbi)); VP9_COMMON *const cm = pbi ? &pbi->common : NULL; @@ -126,7 +57,7 @@ VP9Decoder *vp9_decoder_create(const VP9D_CONFIG *oxcf) { } cm->error.setjmp = 1; - vp9_initialize_dec(); + initialize_dec(); vp9_rtcd(); @@ -134,9 +65,7 @@ VP9Decoder *vp9_decoder_create(const VP9D_CONFIG *oxcf) { vpx_memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map)); cm->current_video_frame = 0; - pbi->oxcf = *oxcf; pbi->ready_for_new_data = 1; - pbi->decoded_key_frame = 0; // vp9_init_dequantizer() is first called here. Add check in // frame_init_dequantizer() to avoid unnecessary calling of @@ -147,7 +76,7 @@ VP9Decoder *vp9_decoder_create(const VP9D_CONFIG *oxcf) { cm->error.setjmp = 0; - vp9_worker_init(&pbi->lf_worker); + vp9_get_worker_interface()->init(&pbi->lf_worker); return pbi; } @@ -156,12 +85,12 @@ void vp9_decoder_remove(VP9Decoder *pbi) { VP9_COMMON *const cm = &pbi->common; int i; - vp9_remove_common(cm); - vp9_worker_end(&pbi->lf_worker); + vp9_get_worker_interface()->end(&pbi->lf_worker); vpx_free(pbi->lf_worker.data1); + vpx_free(pbi->tile_data); for (i = 0; i < pbi->num_tile_workers; ++i) { VP9Worker *const worker = &pbi->tile_workers[i]; - vp9_worker_end(worker); + vp9_get_worker_interface()->end(worker); vpx_free(worker->data1); vpx_free(worker->data2); } @@ -173,6 +102,7 @@ void vp9_decoder_remove(VP9Decoder *pbi) { vp9_loop_filter_dealloc(&pbi->lf_row_sync, sb_rows); } + vp9_remove_common(cm); vpx_free(pbi); } @@ -251,17 +181,6 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm, return cm->error.error_code; } - -int vp9_get_reference_dec(VP9Decoder *pbi, int index, YV12_BUFFER_CONFIG **fb) { - VP9_COMMON *cm = &pbi->common; - - if (index < 0 || index >= REF_FRAMES) - 
return -1; - - *fb = &cm->frame_bufs[cm->ref_frame_map[index]].buf; - return 0; -} - /* If any buffer updating is signaled it should be done here. */ static void swap_frame_buffers(VP9Decoder *pbi) { int ref_index = 0, mask; @@ -288,8 +207,7 @@ static void swap_frame_buffers(VP9Decoder *pbi) { } int vp9_receive_compressed_data(VP9Decoder *pbi, - size_t size, const uint8_t **psource, - int64_t time_stamp) { + size_t size, const uint8_t **psource) { VP9_COMMON *const cm = &pbi->common; const uint8_t *source = *psource; int retcode = 0; @@ -317,6 +235,7 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, if (setjmp(cm->error.jmp)) { cm->error.setjmp = 0; + vp9_clear_system_state(); // We do not know if the missing frame(s) was supposed to update // any of the reference buffers, but we act conservative and @@ -325,10 +244,10 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, // TODO(jkoleszar): Error concealment is undefined and non-normative // at this point, but if it becomes so, [0] may not always be the correct // thing to do here. 
- if (cm->frame_refs[0].idx != INT_MAX) + if (cm->frame_refs[0].idx != INT_MAX && cm->frame_refs[0].buf != NULL) cm->frame_refs[0].buf->corrupted = 1; - if (cm->frame_bufs[cm->new_fb_idx].ref_count > 0) + if (cm->new_fb_idx > 0 && cm->frame_bufs[cm->new_fb_idx].ref_count > 0) cm->frame_bufs[cm->new_fb_idx].ref_count--; return -1; @@ -336,52 +255,10 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, cm->error.setjmp = 1; - retcode = vp9_decode_frame(pbi, source, source + size, psource); - - if (retcode < 0) { - cm->error.error_code = VPX_CODEC_ERROR; - cm->error.setjmp = 0; - if (cm->frame_bufs[cm->new_fb_idx].ref_count > 0) - cm->frame_bufs[cm->new_fb_idx].ref_count--; - return retcode; - } + vp9_decode_frame(pbi, source, source + size, psource); swap_frame_buffers(pbi); -#if WRITE_RECON_BUFFER == 2 - if (cm->show_frame) - write_dx_frame_to_file(cm->frame_to_show, - cm->current_video_frame); - else - write_dx_frame_to_file(cm->frame_to_show, - cm->current_video_frame + 1000); -#endif - - if (!pbi->do_loopfilter_inline) { - // If multiple threads are used to decode tiles, then we use those threads - // to do parallel loopfiltering. 
- if (pbi->num_tile_workers) { - vp9_loop_filter_frame_mt(pbi, cm, &pbi->mb, cm->lf.filter_level, 0, 0); - } else { - vp9_loop_filter_frame(cm, &pbi->mb, cm->lf.filter_level, 0, 0); - } - } - -#if WRITE_RECON_BUFFER == 2 - if (cm->show_frame) - write_dx_frame_to_file(cm->frame_to_show, - cm->current_video_frame + 2000); - else - write_dx_frame_to_file(cm->frame_to_show, - cm->current_video_frame + 3000); -#endif - -#if WRITE_RECON_BUFFER == 1 - if (cm->show_frame) - recon_write_yuv_frame("recon.yuv", cm->frame_to_show, - cm->width, cm->height); -#endif - vp9_clear_system_state(); cm->last_width = cm->width; @@ -397,37 +274,38 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, } pbi->ready_for_new_data = 0; - pbi->last_time_stamp = time_stamp; cm->error.setjmp = 0; return retcode; } int vp9_get_raw_frame(VP9Decoder *pbi, YV12_BUFFER_CONFIG *sd, - int64_t *time_stamp, int64_t *time_end_stamp, vp9_ppflags_t *flags) { + VP9_COMMON *const cm = &pbi->common; int ret = -1; +#if !CONFIG_VP9_POSTPROC + (void)*flags; +#endif if (pbi->ready_for_new_data == 1) return ret; - /* ie no raw frame to show!!! */ - if (pbi->common.show_frame == 0) + /* no raw frame to show!!! 
*/ + if (!cm->show_frame) return ret; pbi->ready_for_new_data = 1; - *time_stamp = pbi->last_time_stamp; - *time_end_stamp = 0; #if CONFIG_VP9_POSTPROC - ret = vp9_post_proc_frame(&pbi->common, sd, flags); -#else - *sd = *pbi->common.frame_to_show; - sd->y_width = pbi->common.width; - sd->y_height = pbi->common.height; - sd->uv_width = sd->y_width >> pbi->common.subsampling_x; - sd->uv_height = sd->y_height >> pbi->common.subsampling_y; + if (!cm->show_existing_frame) { + ret = vp9_post_proc_frame(cm, sd, flags); + } else { + *sd = *cm->frame_to_show; ret = 0; + } +#else + *sd = *cm->frame_to_show; + ret = 0; #endif /*!CONFIG_POSTPROC*/ vp9_clear_system_state(); return ret; diff --git a/libvpx/vp9/decoder/vp9_decoder.h b/libvpx/vp9/decoder/vp9_decoder.h index c9dc25191..223b66fc7 100644 --- a/libvpx/vp9/decoder/vp9_decoder.h +++ b/libvpx/vp9/decoder/vp9_decoder.h @@ -18,58 +18,52 @@ #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_ppflags.h" +#include "vp9/common/vp9_thread.h" -#include "vp9/decoder/vp9_decoder.h" #include "vp9/decoder/vp9_dthread.h" -#include "vp9/decoder/vp9_thread.h" #ifdef __cplusplus extern "C" { #endif -typedef struct { - int width; - int height; - int version; - int max_threads; - int inv_tile_order; -} VP9D_CONFIG; +// TODO(hkuang): combine this with TileWorkerData. +typedef struct TileData { + VP9_COMMON *cm; + vp9_reader bit_reader; + DECLARE_ALIGNED(16, MACROBLOCKD, xd); +} TileData; typedef struct VP9Decoder { DECLARE_ALIGNED(16, MACROBLOCKD, mb); DECLARE_ALIGNED(16, VP9_COMMON, common); - VP9D_CONFIG oxcf; - - int64_t last_time_stamp; int ready_for_new_data; int refresh_frame_flags; - int decoded_key_frame; - - int initial_width; - int initial_height; + int frame_parallel_decode; // frame-based threading. 
- int do_loopfilter_inline; // apply loopfilter to available rows immediately VP9Worker lf_worker; - VP9Worker *tile_workers; int num_tile_workers; + TileData *tile_data; + int total_tiles; + VP9LfSync lf_row_sync; -} VP9Decoder; -void vp9_initialize_dec(); + vpx_decrypt_cb decrypt_cb; + void *decrypt_state; + + int max_threads; + int inv_tile_order; +} VP9Decoder; int vp9_receive_compressed_data(struct VP9Decoder *pbi, - size_t size, const uint8_t **dest, - int64_t time_stamp); + size_t size, const uint8_t **dest); -int vp9_get_raw_frame(struct VP9Decoder *pbi, - YV12_BUFFER_CONFIG *sd, - int64_t *time_stamp, int64_t *time_end_stamp, +int vp9_get_raw_frame(struct VP9Decoder *pbi, YV12_BUFFER_CONFIG *sd, vp9_ppflags_t *flags); vpx_codec_err_t vp9_copy_reference_dec(struct VP9Decoder *pbi, @@ -80,11 +74,7 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm, VP9_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); -int vp9_get_reference_dec(struct VP9Decoder *pbi, - int index, YV12_BUFFER_CONFIG **fb); - - -struct VP9Decoder *vp9_decoder_create(const VP9D_CONFIG *oxcf); +struct VP9Decoder *vp9_decoder_create(); void vp9_decoder_remove(struct VP9Decoder *pbi); diff --git a/libvpx/vp9/decoder/vp9_detokenize.c b/libvpx/vp9/decoder/vp9_detokenize.c index 860da532a..91cdf3860 100644 --- a/libvpx/vp9/decoder/vp9_detokenize.c +++ b/libvpx/vp9/decoder/vp9_detokenize.c @@ -28,35 +28,6 @@ #define CAT_THREE_CONTEXT_NODE 6 #define CAT_FIVE_CONTEXT_NODE 7 -#define CAT1_MIN_VAL 5 -#define CAT2_MIN_VAL 7 -#define CAT3_MIN_VAL 11 -#define CAT4_MIN_VAL 19 -#define CAT5_MIN_VAL 35 -#define CAT6_MIN_VAL 67 -#define CAT1_PROB0 159 -#define CAT2_PROB0 145 -#define CAT2_PROB1 165 - -#define CAT3_PROB0 140 -#define CAT3_PROB1 148 -#define CAT3_PROB2 173 - -#define CAT4_PROB0 135 -#define CAT4_PROB1 140 -#define CAT4_PROB2 155 -#define CAT4_PROB3 176 - -#define CAT5_PROB0 130 -#define CAT5_PROB1 134 -#define CAT5_PROB2 141 -#define CAT5_PROB3 157 -#define CAT5_PROB4 180 - -static const 
vp9_prob cat6_prob[15] = { - 254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 -}; - #define INCREMENT_COUNT(token) \ do { \ if (!cm->frame_parallel_decoding_mode) \ @@ -96,7 +67,6 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type, unsigned int (*eob_branch_count)[COEFF_CONTEXTS] = counts->eob_branch[tx_size][type][ref]; uint8_t token_cache[32 * 32]; - const uint8_t *cat6; const uint8_t *band_translate = get_band_translate(tx_size); const int dq_shift = (tx_size == TX_32X32); int v; @@ -148,44 +118,55 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type, if (!vp9_read(r, prob[HIGH_LOW_CONTEXT_NODE])) { if (!vp9_read(r, prob[CAT_ONE_CONTEXT_NODE])) { val = CAT1_MIN_VAL; - ADJUST_COEF(CAT1_PROB0, 0); + ADJUST_COEF(vp9_cat1_prob[0], 0); WRITE_COEF_CONTINUE(val, CATEGORY1_TOKEN); } val = CAT2_MIN_VAL; - ADJUST_COEF(CAT2_PROB1, 1); - ADJUST_COEF(CAT2_PROB0, 0); + ADJUST_COEF(vp9_cat2_prob[0], 1); + ADJUST_COEF(vp9_cat2_prob[1], 0); WRITE_COEF_CONTINUE(val, CATEGORY2_TOKEN); } if (!vp9_read(r, prob[CAT_THREEFOUR_CONTEXT_NODE])) { if (!vp9_read(r, prob[CAT_THREE_CONTEXT_NODE])) { val = CAT3_MIN_VAL; - ADJUST_COEF(CAT3_PROB2, 2); - ADJUST_COEF(CAT3_PROB1, 1); - ADJUST_COEF(CAT3_PROB0, 0); + ADJUST_COEF(vp9_cat3_prob[0], 2); + ADJUST_COEF(vp9_cat3_prob[1], 1); + ADJUST_COEF(vp9_cat3_prob[2], 0); WRITE_COEF_CONTINUE(val, CATEGORY3_TOKEN); } val = CAT4_MIN_VAL; - ADJUST_COEF(CAT4_PROB3, 3); - ADJUST_COEF(CAT4_PROB2, 2); - ADJUST_COEF(CAT4_PROB1, 1); - ADJUST_COEF(CAT4_PROB0, 0); + ADJUST_COEF(vp9_cat4_prob[0], 3); + ADJUST_COEF(vp9_cat4_prob[1], 2); + ADJUST_COEF(vp9_cat4_prob[2], 1); + ADJUST_COEF(vp9_cat4_prob[3], 0); WRITE_COEF_CONTINUE(val, CATEGORY4_TOKEN); } if (!vp9_read(r, prob[CAT_FIVE_CONTEXT_NODE])) { val = CAT5_MIN_VAL; - ADJUST_COEF(CAT5_PROB4, 4); - ADJUST_COEF(CAT5_PROB3, 3); - ADJUST_COEF(CAT5_PROB2, 2); - ADJUST_COEF(CAT5_PROB1, 1); - ADJUST_COEF(CAT5_PROB0, 0); + 
ADJUST_COEF(vp9_cat5_prob[0], 4); + ADJUST_COEF(vp9_cat5_prob[1], 3); + ADJUST_COEF(vp9_cat5_prob[2], 2); + ADJUST_COEF(vp9_cat5_prob[3], 1); + ADJUST_COEF(vp9_cat5_prob[4], 0); WRITE_COEF_CONTINUE(val, CATEGORY5_TOKEN); } val = 0; - cat6 = cat6_prob; - while (*cat6) - val = (val << 1) | vp9_read(r, *cat6++); + val = (val << 1) | vp9_read(r, vp9_cat6_prob[0]); + val = (val << 1) | vp9_read(r, vp9_cat6_prob[1]); + val = (val << 1) | vp9_read(r, vp9_cat6_prob[2]); + val = (val << 1) | vp9_read(r, vp9_cat6_prob[3]); + val = (val << 1) | vp9_read(r, vp9_cat6_prob[4]); + val = (val << 1) | vp9_read(r, vp9_cat6_prob[5]); + val = (val << 1) | vp9_read(r, vp9_cat6_prob[6]); + val = (val << 1) | vp9_read(r, vp9_cat6_prob[7]); + val = (val << 1) | vp9_read(r, vp9_cat6_prob[8]); + val = (val << 1) | vp9_read(r, vp9_cat6_prob[9]); + val = (val << 1) | vp9_read(r, vp9_cat6_prob[10]); + val = (val << 1) | vp9_read(r, vp9_cat6_prob[11]); + val = (val << 1) | vp9_read(r, vp9_cat6_prob[12]); + val = (val << 1) | vp9_read(r, vp9_cat6_prob[13]); val += CAT6_MIN_VAL; WRITE_COEF_CONTINUE(val, CATEGORY6_TOKEN); diff --git a/libvpx/vp9/decoder/vp9_dsubexp.c b/libvpx/vp9/decoder/vp9_dsubexp.c index e67b37240..c22617edb 100644 --- a/libvpx/vp9/decoder/vp9_dsubexp.c +++ b/libvpx/vp9/decoder/vp9_dsubexp.c @@ -26,22 +26,6 @@ static int decode_uniform(vp9_reader *r) { return v < m ? 
v : (v << 1) - m + vp9_read_bit(r); } - -static int merge_index(int v, int n, int modulus) { - int max1 = (n - 1 - modulus / 2) / modulus + 1; - if (v < max1) { - v = v * modulus + modulus / 2; - } else { - int w; - v -= max1; - w = v; - v += (v + modulus - modulus / 2) / modulus; - while (v % modulus == modulus / 2 || - w != v - (v + modulus - modulus / 2) / modulus) v++; - } - return v; -} - static int inv_remap_prob(int v, int m) { static int inv_map_table[MAX_PROB - 1] = { 6, 19, 32, 45, 58, 71, 84, 97, 110, 123, 136, 149, 162, 175, 188, diff --git a/libvpx/vp9/decoder/vp9_dthread.c b/libvpx/vp9/decoder/vp9_dthread.c index 9b124c9d9..5dda49a0f 100644 --- a/libvpx/vp9/decoder/vp9_dthread.c +++ b/libvpx/vp9/decoder/vp9_dthread.c @@ -40,13 +40,13 @@ static INLINE void sync_read(VP9LfSync *const lf_sync, int r, int c) { const int nsync = lf_sync->sync_range; if (r && !(c & (nsync - 1))) { - mutex_lock(&lf_sync->mutex_[r - 1]); + pthread_mutex_t *const mutex = &lf_sync->mutex_[r - 1]; + mutex_lock(mutex); while (c > lf_sync->cur_sb_col[r - 1] - nsync) { - pthread_cond_wait(&lf_sync->cond_[r - 1], - &lf_sync->mutex_[r - 1]); + pthread_cond_wait(&lf_sync->cond_[r - 1], mutex); } - pthread_mutex_unlock(&lf_sync->mutex_[r - 1]); + pthread_mutex_unlock(mutex); } #else (void)lf_sync; @@ -89,29 +89,30 @@ static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c, // Implement row loopfiltering for each thread. static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer, - VP9_COMMON *const cm, MACROBLOCKD *const xd, + VP9_COMMON *const cm, + struct macroblockd_plane planes[MAX_MB_PLANE], int start, int stop, int y_only, VP9LfSync *const lf_sync, int num_lf_workers) { const int num_planes = y_only ? 
1 : MAX_MB_PLANE; int r, c; // SB row and col - LOOP_FILTER_MASK lfm; const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2; for (r = start; r < stop; r += num_lf_workers) { const int mi_row = r << MI_BLOCK_SIZE_LOG2; - MODE_INFO **mi_8x8 = cm->mi_grid_visible + mi_row * cm->mi_stride; + MODE_INFO **const mi = cm->mi_grid_visible + mi_row * cm->mi_stride; for (c = 0; c < sb_cols; ++c) { const int mi_col = c << MI_BLOCK_SIZE_LOG2; + LOOP_FILTER_MASK lfm; int plane; sync_read(lf_sync, r, c); - vp9_setup_dst_planes(xd, frame_buffer, mi_row, mi_col); - vp9_setup_mask(cm, mi_row, mi_col, mi_8x8 + mi_col, cm->mi_stride, &lfm); + vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col); + vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm); for (plane = 0; plane < num_planes; ++plane) { - vp9_filter_block_plane(cm, &xd->plane[plane], mi_row, &lfm); + vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm); } sync_write(lf_sync, r, c, sb_cols); @@ -123,8 +124,8 @@ static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer, static int loop_filter_row_worker(void *arg1, void *arg2) { TileWorkerData *const tile_data = (TileWorkerData*)arg1; LFWorkerData *const lf_data = &tile_data->lfdata; - - loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, &lf_data->xd, + (void) arg2; + loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->start, lf_data->stop, lf_data->y_only, lf_data->lf_sync, lf_data->num_lf_workers); return 1; @@ -132,22 +133,21 @@ static int loop_filter_row_worker(void *arg1, void *arg2) { // VP9 decoder: Implement multi-threaded loopfilter that uses the tile // threads. 
-void vp9_loop_filter_frame_mt(VP9Decoder *pbi, - VP9_COMMON *cm, - MACROBLOCKD *xd, +void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, + VP9Decoder *pbi, VP9_COMMON *cm, int frame_filter_level, - int y_only, int partial_frame) { + int y_only) { + VP9LfSync *const lf_sync = &pbi->lf_row_sync; + const VP9WorkerInterface *const winterface = vp9_get_worker_interface(); // Number of superblock rows and cols const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; const int tile_cols = 1 << cm->log2_tile_cols; - const int num_workers = MIN(pbi->oxcf.max_threads & ~1, tile_cols); + const int num_workers = MIN(pbi->max_threads & ~1, tile_cols); int i; // Allocate memory used in thread synchronization. // This always needs to be done even if frame_filter_level is 0. if (!cm->current_video_frame || cm->last_height != cm->height) { - VP9LfSync *const lf_sync = &pbi->lf_row_sync; - if (cm->last_height != cm->height) { const int aligned_last_height = ALIGN_POWER_OF_TWO(cm->last_height, MI_SIZE_LOG2); @@ -166,8 +166,7 @@ void vp9_loop_filter_frame_mt(VP9Decoder *pbi, vp9_loop_filter_frame_init(cm, frame_filter_level); // Initialize cur_sb_col to -1 for all SB rows. - vpx_memset(pbi->lf_row_sync.cur_sb_col, -1, - sizeof(*pbi->lf_row_sync.cur_sb_col) * sb_rows); + vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); // Set up loopfilter thread data. 
// The decoder is using num_workers instead of pbi->num_tile_workers @@ -187,27 +186,27 @@ void vp9_loop_filter_frame_mt(VP9Decoder *pbi, worker->hook = (VP9WorkerHook)loop_filter_row_worker; // Loopfilter data - lf_data->frame_buffer = get_frame_new_buffer(cm); + lf_data->frame_buffer = frame; lf_data->cm = cm; - lf_data->xd = pbi->mb; + vp9_copy(lf_data->planes, pbi->mb.plane); lf_data->start = i; lf_data->stop = sb_rows; lf_data->y_only = y_only; // always do all planes in decoder - lf_data->lf_sync = &pbi->lf_row_sync; + lf_data->lf_sync = lf_sync; lf_data->num_lf_workers = num_workers; // Start loopfiltering if (i == num_workers - 1) { - vp9_worker_execute(worker); + winterface->execute(worker); } else { - vp9_worker_launch(worker); + winterface->launch(worker); } } // Wait till all rows are finished for (i = 0; i < num_workers; ++i) { - vp9_worker_sync(&pbi->tile_workers[i]); + winterface->sync(&pbi->tile_workers[i]); } } @@ -253,8 +252,12 @@ void vp9_loop_filter_alloc(VP9_COMMON *cm, VP9LfSync *lf_sync, int rows, // Deallocate lf synchronization related mutex and data void vp9_loop_filter_dealloc(VP9LfSync *lf_sync, int rows) { -#if CONFIG_MULTITHREAD +#if !CONFIG_MULTITHREAD + (void)rows; +#endif // !CONFIG_MULTITHREAD + if (lf_sync != NULL) { +#if CONFIG_MULTITHREAD int i; if (lf_sync->mutex_ != NULL) { @@ -269,17 +272,10 @@ void vp9_loop_filter_dealloc(VP9LfSync *lf_sync, int rows) { } vpx_free(lf_sync->cond_); } - +#endif // CONFIG_MULTITHREAD vpx_free(lf_sync->cur_sb_col); // clear the structure as the source of this call may be a resize in which // case this call will be followed by an _alloc() which may fail. 
- vpx_memset(lf_sync, 0, sizeof(*lf_sync)); + vp9_zero(*lf_sync); } -#else - (void)rows; - if (lf_sync != NULL) { - vpx_free(lf_sync->cur_sb_col); - vpx_memset(lf_sync, 0, sizeof(*lf_sync)); - } -#endif // CONFIG_MULTITHREAD } diff --git a/libvpx/vp9/decoder/vp9_dthread.h b/libvpx/vp9/decoder/vp9_dthread.h index 005bd7bbd..423bd8808 100644 --- a/libvpx/vp9/decoder/vp9_dthread.h +++ b/libvpx/vp9/decoder/vp9_dthread.h @@ -12,11 +12,9 @@ #define VP9_DECODER_VP9_DTHREAD_H_ #include "./vpx_config.h" -#include "vp9/common/vp9_loopfilter.h" +#include "vp9/common/vp9_thread.h" #include "vp9/decoder/vp9_reader.h" -#include "vp9/decoder/vp9_thread.h" -struct macroblockd; struct VP9Common; struct VP9Decoder; @@ -43,17 +41,17 @@ typedef struct VP9LfSyncData { } VP9LfSync; // Allocate memory for loopfilter row synchronization. -void vp9_loop_filter_alloc(struct VP9Common *cm, struct VP9LfSyncData *lf_sync, +void vp9_loop_filter_alloc(struct VP9Common *cm, VP9LfSync *lf_sync, int rows, int width); // Deallocate loopfilter synchronization related mutex and data. -void vp9_loop_filter_dealloc(struct VP9LfSyncData *lf_sync, int rows); +void vp9_loop_filter_dealloc(VP9LfSync *lf_sync, int rows); // Multi-threaded loopfilter that uses the tile threads. 
-void vp9_loop_filter_frame_mt(struct VP9Decoder *pbi, +void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, + struct VP9Decoder *pbi, struct VP9Common *cm, - struct macroblockd *xd, int frame_filter_level, - int y_only, int partial_frame); + int y_only); #endif // VP9_DECODER_VP9_DTHREAD_H_ diff --git a/libvpx/vp9/decoder/vp9_read_bit_buffer.c b/libvpx/vp9/decoder/vp9_read_bit_buffer.c index 778a635e3..3eef72844 100644 --- a/libvpx/vp9/decoder/vp9_read_bit_buffer.c +++ b/libvpx/vp9/decoder/vp9_read_bit_buffer.c @@ -10,7 +10,7 @@ #include "vp9/decoder/vp9_read_bit_buffer.h" size_t vp9_rb_bytes_read(struct vp9_read_bit_buffer *rb) { - return rb->bit_offset / CHAR_BIT + (rb->bit_offset % CHAR_BIT > 0); + return (rb->bit_offset + CHAR_BIT - 1) / CHAR_BIT; } int vp9_rb_read_bit(struct vp9_read_bit_buffer *rb) { diff --git a/libvpx/vp9/decoder/vp9_reader.c b/libvpx/vp9/decoder/vp9_reader.c index fb44c8898..6bb4f9f73 100644 --- a/libvpx/vp9/decoder/vp9_reader.c +++ b/libvpx/vp9/decoder/vp9_reader.c @@ -18,7 +18,11 @@ // Even relatively modest values like 100 would work fine. 
#define LOTS_OF_BITS 0x40000000 -int vp9_reader_init(vp9_reader *r, const uint8_t *buffer, size_t size) { +int vp9_reader_init(vp9_reader *r, + const uint8_t *buffer, + size_t size, + vpx_decrypt_cb decrypt_cb, + void *decrypt_state) { if (size && !buffer) { return 1; } else { @@ -27,6 +31,8 @@ int vp9_reader_init(vp9_reader *r, const uint8_t *buffer, size_t size) { r->value = 0; r->count = -8; r->range = 255; + r->decrypt_cb = decrypt_cb; + r->decrypt_state = decrypt_state; vp9_reader_fill(r); return vp9_read_bit(r) != 0; // marker bit } @@ -35,12 +41,21 @@ int vp9_reader_init(vp9_reader *r, const uint8_t *buffer, size_t size) { void vp9_reader_fill(vp9_reader *r) { const uint8_t *const buffer_end = r->buffer_end; const uint8_t *buffer = r->buffer; + const uint8_t *buffer_start = buffer; BD_VALUE value = r->value; int count = r->count; int shift = BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT); int loop_end = 0; - const int bits_left = (int)((buffer_end - buffer) * CHAR_BIT); - const int x = shift + CHAR_BIT - bits_left; + const size_t bytes_left = buffer_end - buffer; + const size_t bits_left = bytes_left * CHAR_BIT; + const int x = (int)(shift + CHAR_BIT - bits_left); + + if (r->decrypt_cb) { + size_t n = MIN(sizeof(r->clear_buffer), bytes_left); + r->decrypt_cb(r->decrypt_state, buffer, r->clear_buffer, (int)n); + buffer = r->clear_buffer; + buffer_start = r->clear_buffer; + } if (x >= 0) { count += LOTS_OF_BITS; @@ -55,7 +70,10 @@ void vp9_reader_fill(vp9_reader *r) { } } - r->buffer = buffer; + // NOTE: Variable 'buffer' may not relate to 'r->buffer' after decryption, + // so we increase 'r->buffer' by the amount that 'buffer' moved, rather than + // assign 'buffer' to 'r->buffer'. 
+ r->buffer += buffer - buffer_start; r->value = value; r->count = count; } diff --git a/libvpx/vp9/decoder/vp9_reader.h b/libvpx/vp9/decoder/vp9_reader.h index 8fe6acbc2..2d9eccfbf 100644 --- a/libvpx/vp9/decoder/vp9_reader.h +++ b/libvpx/vp9/decoder/vp9_reader.h @@ -16,6 +16,7 @@ #include "./vpx_config.h" #include "vpx_ports/mem.h" +#include "vpx/vp8dx.h" #include "vpx/vpx_integer.h" #include "vp9/common/vp9_prob.h" @@ -31,12 +32,19 @@ typedef size_t BD_VALUE; typedef struct { const uint8_t *buffer_end; const uint8_t *buffer; + uint8_t clear_buffer[sizeof(BD_VALUE) + 1]; BD_VALUE value; int count; unsigned int range; + vpx_decrypt_cb decrypt_cb; + void *decrypt_state; } vp9_reader; -int vp9_reader_init(vp9_reader *r, const uint8_t *buffer, size_t size); +int vp9_reader_init(vp9_reader *r, + const uint8_t *buffer, + size_t size, + vpx_decrypt_cb decrypt_cb, + void *decrypt_state); void vp9_reader_fill(vp9_reader *r); @@ -44,7 +52,7 @@ int vp9_reader_has_error(vp9_reader *r); const uint8_t *vp9_reader_find_end(vp9_reader *r); -static int vp9_read(vp9_reader *r, int prob) { +static INLINE int vp9_read(vp9_reader *r, int prob) { unsigned int bit = 0; BD_VALUE value; BD_VALUE bigsplit; @@ -81,11 +89,11 @@ static int vp9_read(vp9_reader *r, int prob) { return bit; } -static int vp9_read_bit(vp9_reader *r) { +static INLINE int vp9_read_bit(vp9_reader *r) { return vp9_read(r, 128); // vp9_prob_half } -static int vp9_read_literal(vp9_reader *r, int bits) { +static INLINE int vp9_read_literal(vp9_reader *r, int bits) { int literal = 0, bit; for (bit = bits - 1; bit >= 0; bit--) @@ -94,8 +102,8 @@ static int vp9_read_literal(vp9_reader *r, int bits) { return literal; } -static int vp9_read_tree(vp9_reader *r, const vp9_tree_index *tree, - const vp9_prob *probs) { +static INLINE int vp9_read_tree(vp9_reader *r, const vp9_tree_index *tree, + const vp9_prob *probs) { vp9_tree_index i = 0; while ((i = tree[i + vp9_read(r, probs[i >> 1])]) > 0) diff --git 
a/libvpx/vp9/decoder/vp9_thread.c b/libvpx/vp9/decoder/vp9_thread.c deleted file mode 100644 index 5d31d3d98..000000000 --- a/libvpx/vp9/decoder/vp9_thread.c +++ /dev/null @@ -1,141 +0,0 @@ -// Copyright 2013 Google Inc. All Rights Reserved. -// -// Use of this source code is governed by a BSD-style license -// that can be found in the COPYING file in the root of the source -// tree. An additional intellectual property rights grant can be found -// in the file PATENTS. All contributing project authors may -// be found in the AUTHORS file in the root of the source tree. -// ----------------------------------------------------------------------------- -// -// Multi-threaded worker -// -// Original source: -// http://git.chromium.org/webm/libwebp.git -// 100644 blob eff8f2a8c20095aade3c292b0e9292dac6cb3587 src/utils/thread.c - - -#include <assert.h> -#include <string.h> // for memset() -#include "./vp9_thread.h" - -#if defined(__cplusplus) || defined(c_plusplus) -extern "C" { -#endif - -#if CONFIG_MULTITHREAD - -//------------------------------------------------------------------------------ - -static THREADFN thread_loop(void *ptr) { // thread loop - VP9Worker* const worker = (VP9Worker*)ptr; - int done = 0; - while (!done) { - pthread_mutex_lock(&worker->mutex_); - while (worker->status_ == OK) { // wait in idling mode - pthread_cond_wait(&worker->condition_, &worker->mutex_); - } - if (worker->status_ == WORK) { - vp9_worker_execute(worker); - worker->status_ = OK; - } else if (worker->status_ == NOT_OK) { // finish the worker - done = 1; - } - // signal to the main thread that we're done (for Sync()) - pthread_cond_signal(&worker->condition_); - pthread_mutex_unlock(&worker->mutex_); - } - return THREAD_RETURN(NULL); // Thread is finished -} - -// main thread state control -static void change_state(VP9Worker* const worker, - VP9WorkerStatus new_status) { - // no-op when attempting to change state on a thread that didn't come up - if (worker->status_ < OK) return; 
- - pthread_mutex_lock(&worker->mutex_); - // wait for the worker to finish - while (worker->status_ != OK) { - pthread_cond_wait(&worker->condition_, &worker->mutex_); - } - // assign new status and release the working thread if needed - if (new_status != OK) { - worker->status_ = new_status; - pthread_cond_signal(&worker->condition_); - } - pthread_mutex_unlock(&worker->mutex_); -} - -#endif // CONFIG_MULTITHREAD - -//------------------------------------------------------------------------------ - -void vp9_worker_init(VP9Worker* const worker) { - memset(worker, 0, sizeof(*worker)); - worker->status_ = NOT_OK; -} - -int vp9_worker_sync(VP9Worker* const worker) { -#if CONFIG_MULTITHREAD - change_state(worker, OK); -#endif - assert(worker->status_ <= OK); - return !worker->had_error; -} - -int vp9_worker_reset(VP9Worker* const worker) { - int ok = 1; - worker->had_error = 0; - if (worker->status_ < OK) { -#if CONFIG_MULTITHREAD - if (pthread_mutex_init(&worker->mutex_, NULL) || - pthread_cond_init(&worker->condition_, NULL)) { - return 0; - } - pthread_mutex_lock(&worker->mutex_); - ok = !pthread_create(&worker->thread_, NULL, thread_loop, worker); - if (ok) worker->status_ = OK; - pthread_mutex_unlock(&worker->mutex_); -#else - worker->status_ = OK; -#endif - } else if (worker->status_ > OK) { - ok = vp9_worker_sync(worker); - } - assert(!ok || (worker->status_ == OK)); - return ok; -} - -void vp9_worker_execute(VP9Worker* const worker) { - if (worker->hook != NULL) { - worker->had_error |= !worker->hook(worker->data1, worker->data2); - } -} - -void vp9_worker_launch(VP9Worker* const worker) { -#if CONFIG_MULTITHREAD - change_state(worker, WORK); -#else - vp9_worker_execute(worker); -#endif -} - -void vp9_worker_end(VP9Worker* const worker) { - if (worker->status_ >= OK) { -#if CONFIG_MULTITHREAD - change_state(worker, NOT_OK); - pthread_join(worker->thread_, NULL); - pthread_mutex_destroy(&worker->mutex_); - pthread_cond_destroy(&worker->condition_); -#else - 
worker->status_ = NOT_OK; -#endif - } - assert(worker->status_ == NOT_OK); -} - -//------------------------------------------------------------------------------ - -#if defined(__cplusplus) || defined(c_plusplus) -} // extern "C" -#endif diff --git a/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c b/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c new file mode 100644 index 000000000..6c66f5d5b --- /dev/null +++ b/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "./vp9_rtcd.h" +#include "./vpx_config.h" + +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" + +void vp9_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) { + int r; + int16x8_t sum = vld1q_s16(&input[0]); + for (r = 1; r < 8; ++r) { + const int16x8_t input_00 = vld1q_s16(&input[r * stride]); + sum = vaddq_s16(sum, input_00); + } + { + const int32x4_t a = vpaddlq_s16(sum); + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0); + output[1] = 0; + } +} + +void vp9_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { + int i; + // stage 1 + int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2); + int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2); + int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2); + int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2); + int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * 
stride]), 2); + int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2); + int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2); + int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2); + for (i = 0; i < 2; ++i) { + int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7; + const int16x8_t v_s0 = vaddq_s16(input_0, input_7); + const int16x8_t v_s1 = vaddq_s16(input_1, input_6); + const int16x8_t v_s2 = vaddq_s16(input_2, input_5); + const int16x8_t v_s3 = vaddq_s16(input_3, input_4); + const int16x8_t v_s4 = vsubq_s16(input_3, input_4); + const int16x8_t v_s5 = vsubq_s16(input_2, input_5); + const int16x8_t v_s6 = vsubq_s16(input_1, input_6); + const int16x8_t v_s7 = vsubq_s16(input_0, input_7); + // fdct4(step, step); + int16x8_t v_x0 = vaddq_s16(v_s0, v_s3); + int16x8_t v_x1 = vaddq_s16(v_s1, v_s2); + int16x8_t v_x2 = vsubq_s16(v_s1, v_s2); + int16x8_t v_x3 = vsubq_s16(v_s0, v_s3); + // fdct4(step, step); + int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); + int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); + int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); + int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); + int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64); + int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64); + int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64); + int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64); + v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64); + v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64); + v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64); + v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64); + v_t1_lo = vmulq_n_s32(v_t1_lo, 
cospi_16_64); + v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); + const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); + const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); + const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); + out_0 = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43 + out_2 = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63 + out_4 = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47 + out_6 = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67 + } + // Stage 2 + v_x0 = vsubq_s16(v_s6, v_s5); + v_x1 = vaddq_s16(v_s6, v_s5); + v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x8_t ab = vcombine_s16(a, b); + const int16x8_t cd = vcombine_s16(c, d); + // Stage 3 + v_x0 = vaddq_s16(v_s4, ab); + v_x1 = vsubq_s16(v_s4, ab); + v_x2 = vsubq_s16(v_s7, cd); + v_x3 = vaddq_s16(v_s7, cd); + } + // Stage 4 + v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64); + v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64); + v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64); + v_t1_hi = 
vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64); + v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64); + v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64); + v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64); + v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64); + v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64); + v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64); + v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64); + v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); + const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); + const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); + const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); + out_1 = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53 + out_3 = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73 + out_5 = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57 + out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77 + } + // transpose 8x8 + { + // 00 01 02 03 40 41 42 43 + // 10 11 12 13 50 51 52 53 + // 20 21 22 23 60 61 62 63 + // 30 31 32 33 70 71 72 73 + // 04 05 06 07 44 45 46 47 + // 14 15 16 17 54 55 56 57 + // 24 25 26 27 64 65 66 67 + // 34 35 36 37 74 75 76 77 + const int32x4x2_t r02_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_0), + vreinterpretq_s32_s16(out_2)); + const int32x4x2_t r13_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_1), + vreinterpretq_s32_s16(out_3)); + const int32x4x2_t r46_s32 = 
vtrnq_s32(vreinterpretq_s32_s16(out_4), + vreinterpretq_s32_s16(out_6)); + const int32x4x2_t r57_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_5), + vreinterpretq_s32_s16(out_7)); + const int16x8x2_t r01_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]), + vreinterpretq_s16_s32(r13_s32.val[0])); + const int16x8x2_t r23_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]), + vreinterpretq_s16_s32(r13_s32.val[1])); + const int16x8x2_t r45_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]), + vreinterpretq_s16_s32(r57_s32.val[0])); + const int16x8x2_t r67_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]), + vreinterpretq_s16_s32(r57_s32.val[1])); + input_0 = r01_s16.val[0]; + input_1 = r01_s16.val[1]; + input_2 = r23_s16.val[0]; + input_3 = r23_s16.val[1]; + input_4 = r45_s16.val[0]; + input_5 = r45_s16.val[1]; + input_6 = r67_s16.val[0]; + input_7 = r67_s16.val[1]; + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + } + } // for + { + // from vp9_dct_sse2.c + // Post-condition (division by two) + // division of two 16 bits signed numbers using shifts + // n / 2 = (n - (n >> 15)) >> 1 + const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15); + const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15); + const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15); + const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15); + const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15); + const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15); + const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15); + const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15); + input_0 = vhsubq_s16(input_0, sign_in0); + input_1 = vhsubq_s16(input_1, sign_in1); + input_2 = vhsubq_s16(input_2, sign_in2); + input_3 = vhsubq_s16(input_3, sign_in3); + input_4 = vhsubq_s16(input_4, sign_in4); + input_5 = vhsubq_s16(input_5, sign_in5); + 
input_6 = vhsubq_s16(input_6, sign_in6); + input_7 = vhsubq_s16(input_7, sign_in7); + // store results + vst1q_s16(&final_output[0 * 8], input_0); + vst1q_s16(&final_output[1 * 8], input_1); + vst1q_s16(&final_output[2 * 8], input_2); + vst1q_s16(&final_output[3 * 8], input_3); + vst1q_s16(&final_output[4 * 8], input_4); + vst1q_s16(&final_output[5 * 8], input_5); + vst1q_s16(&final_output[6 * 8], input_6); + vst1q_s16(&final_output[7 * 8], input_7); + } +} + diff --git a/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c b/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c new file mode 100644 index 000000000..2d5ec79b3 --- /dev/null +++ b/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include <math.h> + +#include "vpx_mem/vpx_mem.h" + +#include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_seg_common.h" + +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_quantize.h" +#include "vp9/encoder/vp9_rd.h" + +void vp9_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, + int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, + int zbin_oq_value, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int i; + // TODO(jingning) Decide the need of these arguments after the + // quantization process is completed. 
+ (void)zbin_ptr; + (void)quant_shift_ptr; + (void)zbin_oq_value; + (void)scan; + + if (!skip_block) { + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + + const int16x8_t v_zero = vdupq_n_s16(0); + const int16x8_t v_one = vdupq_n_s16(1); + int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); + int16x8_t v_round = vmovq_n_s16(round_ptr[1]); + int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]); + int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]); + // adjust for dc + v_round = vsetq_lane_s16(round_ptr[0], v_round, 0); + v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0); + v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0); + + for (i = 0; i < count; i += 8) { + const int16x8_t v_iscan = vld1q_s16(&iscan[i]); + const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[i]); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_abs_coeff = vabsq_s16(v_coeff); + const int16x8_t v_tmp = vqaddq_s16(v_abs_coeff, v_round); + const int32x4_t v_tmp_lo = vmull_s16(vget_low_s16(v_tmp), + vget_low_s16(v_quant)); + const int32x4_t v_tmp_hi = vmull_s16(vget_high_s16(v_tmp), + vget_high_s16(v_quant)); + const int16x8_t v_tmp2 = vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), + vshrn_n_s32(v_tmp_hi, 16)); + const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); + const int16x8_t v_nz_iscan = + vandq_s16(vmvnq_s16(vreinterpretq_s16_u16(v_nz_mask)), v_iscan_plus1); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); + + v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); + + vst1q_s16(&qcoeff_ptr[i], v_qcoeff); + vst1q_s16(&dqcoeff_ptr[i], v_dqcoeff); + v_round = vmovq_n_s16(round_ptr[1]); + v_quant = vmovq_n_s16(quant_ptr[1]); + v_dequant = vmovq_n_s16(dequant_ptr[1]); + } + { + const int16x4_t v_eobmax_3210 = 
+ vmax_s16(vget_low_s16(v_eobmax_76543210), + vget_high_s16(v_eobmax_76543210)); + const int64x1_t v_eobmax_xx32 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); + const int16x4_t v_eobmax_tmp = + vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); + const int64x1_t v_eobmax_xxx3 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); + const int16x4_t v_eobmax_final = + vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); + + *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0); + } + } else { + vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t)); + vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t)); + *eob_ptr = 0; + } +} diff --git a/libvpx/vp9/encoder/arm/neon/vp9_sad_neon.c b/libvpx/vp9/encoder/arm/neon/vp9_sad_neon.c new file mode 100644 index 000000000..c4cd85680 --- /dev/null +++ b/libvpx/vp9/encoder/arm/neon/vp9_sad_neon.c @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> +#include "./vp9_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" + +static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, + const uint16x8_t vec_hi) { + const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo), + vget_high_u16(vec_lo)); + const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi), + vget_high_u16(vec_hi)); + const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +} +static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) { + const uint32x4_t a = vpaddlq_u16(vec_16x8); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +} + +unsigned int vp9_sad64x64_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + int i; + uint16x8_t vec_accum_lo = vdupq_n_u16(0); + uint16x8_t vec_accum_hi = vdupq_n_u16(0); + for (i = 0; i < 64; ++i) { + const uint8x16_t vec_src_00 = vld1q_u8(src); + const uint8x16_t vec_src_16 = vld1q_u8(src + 16); + const uint8x16_t vec_src_32 = vld1q_u8(src + 32); + const uint8x16_t vec_src_48 = vld1q_u8(src + 48); + const uint8x16_t vec_ref_00 = vld1q_u8(ref); + const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); + const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32); + const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48); + src += src_stride; + ref += ref_stride; + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), + vget_low_u8(vec_ref_00)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), + vget_high_u8(vec_ref_00)); + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), + vget_low_u8(vec_ref_16)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), + 
vget_high_u8(vec_ref_16)); + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32), + vget_low_u8(vec_ref_32)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32), + vget_high_u8(vec_ref_32)); + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48), + vget_low_u8(vec_ref_48)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48), + vget_high_u8(vec_ref_48)); + } + return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi); +} + +unsigned int vp9_sad32x32_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + int i; + uint16x8_t vec_accum_lo = vdupq_n_u16(0); + uint16x8_t vec_accum_hi = vdupq_n_u16(0); + + for (i = 0; i < 32; ++i) { + const uint8x16_t vec_src_00 = vld1q_u8(src); + const uint8x16_t vec_src_16 = vld1q_u8(src + 16); + const uint8x16_t vec_ref_00 = vld1q_u8(ref); + const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); + src += src_stride; + ref += ref_stride; + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), + vget_low_u8(vec_ref_00)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), + vget_high_u8(vec_ref_00)); + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), + vget_low_u8(vec_ref_16)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), + vget_high_u8(vec_ref_16)); + } + return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); +} + +unsigned int vp9_sad16x16_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + int i; + uint16x8_t vec_accum_lo = vdupq_n_u16(0); + uint16x8_t vec_accum_hi = vdupq_n_u16(0); + + for (i = 0; i < 16; ++i) { + const uint8x16_t vec_src = vld1q_u8(src); + const uint8x16_t vec_ref = vld1q_u8(ref); + src += src_stride; + ref += ref_stride; + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src), + vget_low_u8(vec_ref)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src), + vget_high_u8(vec_ref)); + } + return 
horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); +} + +unsigned int vp9_sad8x8_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + int i; + uint16x8_t vec_accum = vdupq_n_u16(0); + + for (i = 0; i < 8; ++i) { + const uint8x8_t vec_src = vld1_u8(src); + const uint8x8_t vec_ref = vld1_u8(ref); + src += src_stride; + ref += ref_stride; + vec_accum = vabal_u8(vec_accum, vec_src, vec_ref); + } + return horizontal_add_16x8(vec_accum); +} diff --git a/libvpx/vp9/encoder/arm/neon/vp9_subtract_neon.c b/libvpx/vp9/encoder/arm/neon/vp9_subtract_neon.c new file mode 100644 index 000000000..b4bf567db --- /dev/null +++ b/libvpx/vp9/encoder/arm/neon/vp9_subtract_neon.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> +#include "./vp9_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" + +void vp9_subtract_block_neon(int rows, int cols, + int16_t *diff, ptrdiff_t diff_stride, + const uint8_t *src, ptrdiff_t src_stride, + const uint8_t *pred, ptrdiff_t pred_stride) { + int r, c; + + if (cols > 16) { + for (r = 0; r < rows; ++r) { + for (c = 0; c < cols; c += 32) { + const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]); + const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]); + const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]); + const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]); + const uint16x8_t v_diff_lo_00 = vsubl_u8(vget_low_u8(v_src_00), + vget_low_u8(v_pred_00)); + const uint16x8_t v_diff_hi_00 = vsubl_u8(vget_high_u8(v_src_00), + vget_high_u8(v_pred_00)); + const uint16x8_t v_diff_lo_16 = vsubl_u8(vget_low_u8(v_src_16), + vget_low_u8(v_pred_16)); + const uint16x8_t v_diff_hi_16 = vsubl_u8(vget_high_u8(v_src_16), + vget_high_u8(v_pred_16)); + vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00)); + vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00)); + vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16)); + vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16)); + } + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } else if (cols > 8) { + for (r = 0; r < rows; ++r) { + const uint8x16_t v_src = vld1q_u8(&src[0]); + const uint8x16_t v_pred = vld1q_u8(&pred[0]); + const uint16x8_t v_diff_lo = vsubl_u8(vget_low_u8(v_src), + vget_low_u8(v_pred)); + const uint16x8_t v_diff_hi = vsubl_u8(vget_high_u8(v_src), + vget_high_u8(v_pred)); + vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo)); + vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi)); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } else if (cols > 4) { + for (r = 0; r < rows; ++r) { + const uint8x8_t v_src = vld1_u8(&src[0]); + const uint8x8_t v_pred = vld1_u8(&pred[0]); + const uint16x8_t 
v_diff = vsubl_u8(v_src, v_pred); + vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff)); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } else { + for (r = 0; r < rows; ++r) { + for (c = 0; c < cols; ++c) + diff[c] = src[c] - pred[c]; + + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } +} diff --git a/libvpx/vp9/encoder/arm/neon/vp9_variance_neon.c b/libvpx/vp9/encoder/arm/neon/vp9_variance_neon.c new file mode 100644 index 000000000..816fbda1f --- /dev/null +++ b/libvpx/vp9/encoder/arm/neon/vp9_variance_neon.c @@ -0,0 +1,227 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "./vp9_rtcd.h" + +#include "vpx_ports/mem.h" +#include "vpx/vpx_integer.h" + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_filter.h" + +#include "vp9/encoder/vp9_variance.h" + +enum { kWidth8 = 8 }; +enum { kHeight8 = 8 }; +enum { kHeight8PlusOne = 9 }; +enum { kWidth16 = 16 }; +enum { kHeight16 = 16 }; +enum { kHeight16PlusOne = 17 }; +enum { kWidth32 = 32 }; +enum { kHeight32 = 32 }; +enum { kHeight32PlusOne = 33 }; +enum { kPixelStepOne = 1 }; +enum { kAlign16 = 16 }; + +static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { + const int32x4_t a = vpaddlq_s16(v_16x8); + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { + const int64x2_t b = vpaddlq_s32(v_32x4); + const int32x2_t c = 
vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +static void variance_neon_w8(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + int i, j; + int16x8_t v_sum = vdupq_n_s16(0); + int32x4_t v_sse_lo = vdupq_n_s32(0); + int32x4_t v_sse_hi = vdupq_n_s32(0); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const uint8x8_t v_a = vld1_u8(&a[j]); + const uint8x8_t v_b = vld1_u8(&b[j]); + const uint16x8_t v_diff = vsubl_u8(v_a, v_b); + const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); + v_sum = vaddq_s16(v_sum, sv_diff); + v_sse_lo = vmlal_s16(v_sse_lo, + vget_low_s16(sv_diff), + vget_low_s16(sv_diff)); + v_sse_hi = vmlal_s16(v_sse_hi, + vget_high_s16(sv_diff), + vget_high_s16(sv_diff)); + } + a += a_stride; + b += b_stride; + } + + *sum = horizontal_add_s16x8(v_sum); + *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); +} + +void vp9_get8x8var_neon(const uint8_t *src_ptr, int source_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth8, + kHeight8, sse, sum); +} + +unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, kWidth8, kHeight8, sse, &sum); + return *sse - (((int64_t)sum * sum) / (kWidth8 * kHeight8)); +} + +void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth16, + kHeight16, sse, sum); +} + +unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, kWidth16, kHeight16, sse, &sum); 
+ return *sse - (((int64_t)sum * sum) / (kWidth16 * kHeight16)); +} + +static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, + uint8_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const int16_t *vp9_filter) { + const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]); + const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]); + unsigned int i; + for (i = 0; i < output_height; ++i) { + const uint8x8_t src_0 = vld1_u8(&src_ptr[0]); + const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]); + const uint16x8_t a = vmull_u8(src_0, f0); + const uint16x8_t b = vmlal_u8(a, src_1, f1); + const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); + vst1_u8(&output_ptr[0], out); + // Next row... + src_ptr += src_pixels_per_line; + output_ptr += output_width; + } +} + +static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, + uint8_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const int16_t *vp9_filter) { + const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]); + const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]); + unsigned int i, j; + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 16) { + const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]); + const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]); + const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0); + const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1); + const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS); + const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0); + const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1); + const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS); + vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi)); + } + // Next row... 
+ src_ptr += src_pixels_per_line; + output_ptr += output_width; + } +} + +unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src, + int src_stride, + int xoffset, + int yoffset, + const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight8 * kWidth8); + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight8PlusOne * kWidth8); + + var_filter_block2d_bil_w8(src, fdata3, src_stride, kPixelStepOne, + kHeight8PlusOne, kWidth8, + BILINEAR_FILTERS_2TAP(xoffset)); + var_filter_block2d_bil_w8(fdata3, temp2, kWidth8, kWidth8, kHeight8, + kWidth8, BILINEAR_FILTERS_2TAP(yoffset)); + return vp9_variance8x8_neon(temp2, kWidth8, dst, dst_stride, sse); +} + +unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src, + int src_stride, + int xoffset, + int yoffset, + const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight16 * kWidth16); + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight16PlusOne * kWidth16); + + var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne, + kHeight16PlusOne, kWidth16, + BILINEAR_FILTERS_2TAP(xoffset)); + var_filter_block2d_bil_w16(fdata3, temp2, kWidth16, kWidth16, kHeight16, + kWidth16, BILINEAR_FILTERS_2TAP(yoffset)); + return vp9_variance16x16_neon(temp2, kWidth16, dst, dst_stride, sse); +} + +void vp9_get32x32var_neon(const uint8_t *src_ptr, int source_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth32, + kHeight32, sse, sum); +} + +unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, kWidth32, kHeight32, sse, &sum); + return *sse - (((int64_t)sum * sum) / (kWidth32 * kHeight32)); +} + +unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src, + int 
src_stride, + int xoffset, + int yoffset, + const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight32 * kWidth32); + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight32PlusOne * kWidth32); + + var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne, + kHeight32PlusOne, kWidth32, + BILINEAR_FILTERS_2TAP(xoffset)); + var_filter_block2d_bil_w16(fdata3, temp2, kWidth32, kWidth32, kHeight32, + kWidth32, BILINEAR_FILTERS_2TAP(yoffset)); + return vp9_variance32x32_neon(temp2, kWidth32, dst, dst_stride, sse); +} diff --git a/libvpx/vp9/encoder/vp9_aq_complexity.c b/libvpx/vp9/encoder/vp9_aq_complexity.c index 47ad8d8cc..33f92393c 100644 --- a/libvpx/vp9/encoder/vp9_aq_complexity.c +++ b/libvpx/vp9/encoder/vp9_aq_complexity.c @@ -15,8 +15,19 @@ #include "vp9/encoder/vp9_segmentation.h" -static const double in_frame_q_adj_ratio[MAX_SEGMENTS] = - {1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; +#define AQ_C_SEGMENTS 3 +#define AQ_C_STRENGTHS 3 +static const int aq_c_active_segments[AQ_C_STRENGTHS] = {1, 2, 3}; +static const double aq_c_q_adj_factor[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = + {{1.0, 1.0, 1.0}, {1.0, 2.0, 1.0}, {1.0, 1.5, 2.5}}; +static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = + {{1.0, 1.0, 1.0}, {1.0, 0.25, 0.0}, {1.0, 0.5, 0.25}}; + +static int get_aq_c_strength(int q_index) { + // Approximate base quatizer (truncated to int) + int base_quant = vp9_ac_quant(q_index, 0) / 4; + return (base_quant > 20) + (base_quant > 45); +} void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; @@ -29,6 +40,8 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) { cpi->refresh_alt_ref_frame || (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { int segment; + const int aq_strength = get_aq_c_strength(cm->base_qindex); + const int active_segments = aq_c_active_segments[aq_strength]; // Clear down the segment map. 
vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); @@ -36,9 +49,17 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) { // Clear down the complexity map used for rd. vpx_memset(cpi->complexity_map, 0, cm->mi_rows * cm->mi_cols); - vp9_enable_segmentation(seg); vp9_clearall_segfeatures(seg); + // Segmentation only makes sense if the target bits per SB is above a + // threshold. Below this the overheads will usually outweigh any benefit. + if (cpi->rc.sb64_target_rate < 256) { + vp9_disable_segmentation(seg); + return; + } + + vp9_enable_segmentation(seg); + // Select delta coding method. seg->abs_delta = SEGMENT_DELTADATA; @@ -46,20 +67,35 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) { vp9_disable_segfeature(seg, 0, SEG_LVL_ALT_Q); // Use some of the segments for in frame Q adjustment. - for (segment = 1; segment < 2; segment++) { - const int qindex_delta = + for (segment = 1; segment < active_segments; ++segment) { + int qindex_delta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex, - in_frame_q_adj_ratio[segment]); - vp9_enable_segfeature(seg, segment, SEG_LVL_ALT_Q); - vp9_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta); + aq_c_q_adj_factor[aq_strength][segment]); + + // For AQ complexity mode, we dont allow Q0 in a segment if the base + // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment + // Q delta is sometimes applied without going back around the rd loop. + // This could lead to an illegal combination of partition size and q. + if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) { + qindex_delta = -cm->base_qindex + 1; + } + if ((cm->base_qindex + qindex_delta) > 0) { + vp9_enable_segfeature(seg, segment, SEG_LVL_ALT_Q); + vp9_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta); + } } } } -// Select a segment for the current SB64 +// Select a segment for the current SB64 block. 
+// The choice of segment for a block depends on the ratio of the projected +// bits for the block vs a target average. +// An "aq_strength" value determines how many segments are supported, +// the set of transition points to use and the extent of the quantizer +// adjustment for each segment (configured in vp9_setup_in_frame_q_adj()). void vp9_select_in_frame_q_segment(VP9_COMP *cpi, - int mi_row, int mi_col, - int output_enabled, int projected_rate) { + int mi_row, int mi_col, + int output_enabled, int projected_rate) { VP9_COMMON *const cm = &cpi->common; const int mi_offset = mi_row * cm->mi_cols + mi_col; @@ -79,11 +115,22 @@ void vp9_select_in_frame_q_segment(VP9_COMP *cpi, // It is converted to bits * 256 units. const int target_rate = (cpi->rc.sb64_target_rate * xmis * ymis * 256) / (bw * bh); - - if (projected_rate < (target_rate / 4)) { - segment = 1; - } else { - segment = 0; + const int aq_strength = get_aq_c_strength(cm->base_qindex); + const int active_segments = aq_c_active_segments[aq_strength]; + + // The number of segments considered and the transition points used to + // select them is determined by the "aq_strength" value. + // Currently this loop only supports segments that reduce Q (i.e. where + // there is undershoot. + // The loop counts down towards segment 0 which is the default segment + // with no Q adjustment. 
+ segment = active_segments - 1; + while (segment > 0) { + if (projected_rate < + (target_rate * aq_c_transitions[aq_strength][segment])) { + break; + } + --segment; } if (target_rate > 0) { diff --git a/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c b/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c index 787909142..e7f0daac3 100644 --- a/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -16,7 +16,6 @@ #include "vp9/common/vp9_seg_common.h" #include "vp9/encoder/vp9_ratectrl.h" -#include "vp9/encoder/vp9_rdopt.h" #include "vp9/encoder/vp9_segmentation.h" struct CYCLIC_REFRESH { @@ -72,7 +71,7 @@ static int apply_cyclic_refresh_bitrate(const VP9_COMMON *cm, // Turn off cyclic refresh if bits available per frame is not sufficiently // larger than bit cost of segmentation. Segment map bit cost should scale // with number of seg blocks, so compare available bits to number of blocks. - // Average bits available per frame = av_per_frame_bandwidth + // Average bits available per frame = avg_frame_bandwidth // Number of (8x8) blocks in frame = mi_rows * mi_cols; const float factor = 0.5; const int number_blocks = cm->mi_rows * cm->mi_cols; @@ -80,7 +79,7 @@ static int apply_cyclic_refresh_bitrate(const VP9_COMMON *cm, // ~24kbps for CIF, 72kbps for VGA (at 30fps). // Also turn off at very small frame sizes, to avoid too large fraction of // superblocks to be refreshed per frame. Threshold below is less than QCIF. 
- if (rc->av_per_frame_bandwidth < factor * number_blocks || + if (rc->avg_frame_bandwidth < factor * number_blocks || number_blocks / 64 < 5) return 0; else @@ -136,7 +135,8 @@ void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi, const int xmis = MIN(cm->mi_cols - mi_col, bw); const int ymis = MIN(cm->mi_rows - mi_row, bh); const int block_index = mi_row * cm->mi_cols + mi_col; - const int refresh_this_block = candidate_refresh_aq(cr, mbmi, bsize, use_rd); + const int refresh_this_block = cpi->mb.in_static_area || + candidate_refresh_aq(cr, mbmi, bsize, use_rd); // Default is to not update the refresh map. int new_map_value = cr->map[block_index]; int x = 0; int y = 0; @@ -200,6 +200,7 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) { // Rate target ratio to set q delta. const float rate_ratio_qdelta = 2.0; + const double q = vp9_convert_qindex_to_q(cm->base_qindex); vp9_clear_system_state(); // Some of these parameters may be set via codec-control function later. cr->max_sbs_perframe = 10; @@ -209,14 +210,12 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) { // Set rate threshold to some fraction of target (and scaled by 256). cr->thresh_rate_sb = (rc->sb64_target_rate * 256) >> 2; // Distortion threshold, quadratic in Q, scale factor to be adjusted. - cr->thresh_dist_sb = 8 * (int)(vp9_convert_qindex_to_q(cm->base_qindex) * - vp9_convert_qindex_to_q(cm->base_qindex)); + cr->thresh_dist_sb = 8 * (int)(q * q); if (cpi->sf.use_nonrd_pick_mode) { // May want to be more conservative with thresholds in non-rd mode for now // as rate/distortion are derived from model based on prediction residual. 
cr->thresh_rate_sb = (rc->sb64_target_rate * 256) >> 3; - cr->thresh_dist_sb = 4 * (int)(vp9_convert_qindex_to_q(cm->base_qindex) * - vp9_convert_qindex_to_q(cm->base_qindex)); + cr->thresh_dist_sb = 4 * (int)(q * q); } cr->num_seg_blocks = 0; diff --git a/libvpx/vp9/encoder/vp9_aq_variance.c b/libvpx/vp9/encoder/vp9_aq_variance.c index ae2a163b1..56db95eb7 100644 --- a/libvpx/vp9/encoder/vp9_aq_variance.c +++ b/libvpx/vp9/encoder/vp9_aq_variance.c @@ -15,7 +15,7 @@ #include "vp9/common/vp9_seg_common.h" #include "vp9/encoder/vp9_ratectrl.h" -#include "vp9/encoder/vp9_rdopt.h" +#include "vp9/encoder/vp9_rd.h" #include "vp9/encoder/vp9_segmentation.h" #include "vp9/common/vp9_systemdependent.h" diff --git a/libvpx/vp9/encoder/vp9_aq_variance.h b/libvpx/vp9/encoder/vp9_aq_variance.h index 381fe50cf..d1a459fe9 100644 --- a/libvpx/vp9/encoder/vp9_aq_variance.h +++ b/libvpx/vp9/encoder/vp9_aq_variance.h @@ -12,7 +12,7 @@ #ifndef VP9_ENCODER_VP9_AQ_VARIANCE_H_ #define VP9_ENCODER_VP9_AQ_VARIANCE_H_ -#include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/encoder/vp9_encoder.h" #ifdef __cplusplus extern "C" { diff --git a/libvpx/vp9/encoder/vp9_bitstream.c b/libvpx/vp9/encoder/vp9_bitstream.c index 8d2afb991..b0ff0fa81 100644 --- a/libvpx/vp9/encoder/vp9_bitstream.c +++ b/libvpx/vp9/encoder/vp9_bitstream.c @@ -20,7 +20,6 @@ #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_mvref_common.h" -#include "vp9/common/vp9_pragmas.h" #include "vp9/common/vp9_pred_common.h" #include "vp9/common/vp9_seg_common.h" #include "vp9/common/vp9_systemdependent.h" @@ -47,12 +46,12 @@ void vp9_entropy_mode_init() { vp9_tokens_from_tree(inter_mode_encodings, vp9_inter_mode_tree); } -static void write_intra_mode(vp9_writer *w, MB_PREDICTION_MODE mode, +static void write_intra_mode(vp9_writer *w, PREDICTION_MODE mode, const vp9_prob *probs) { vp9_write_token(w, vp9_intra_mode_tree, probs, &intra_mode_encodings[mode]); } -static void 
write_inter_mode(vp9_writer *w, MB_PREDICTION_MODE mode, +static void write_inter_mode(vp9_writer *w, PREDICTION_MODE mode, const vp9_prob *probs) { assert(is_inter_mode(mode)); vp9_write_token(w, vp9_inter_mode_tree, probs, @@ -79,13 +78,13 @@ static void prob_diff_update(const vp9_tree_index *tree, vp9_cond_prob_diff_update(w, &probs[i], branch_ct[i]); } -static void write_selected_tx_size(const VP9_COMP *cpi, +static void write_selected_tx_size(const VP9_COMMON *cm, + const MACROBLOCKD *xd, TX_SIZE tx_size, BLOCK_SIZE bsize, vp9_writer *w) { const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; - const MACROBLOCKD *const xd = &cpi->mb.e_mbd; const vp9_prob *const tx_probs = get_tx_probs2(max_tx_size, xd, - &cpi->common.fc.tx_probs); + &cm->fc.tx_probs); vp9_write(w, tx_size != TX_4X4, tx_probs[0]); if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) { vp9_write(w, tx_size != TX_8X8, tx_probs[1]); @@ -94,14 +93,13 @@ static void write_selected_tx_size(const VP9_COMP *cpi, } } -static int write_skip(const VP9_COMP *cpi, int segment_id, const MODE_INFO *mi, - vp9_writer *w) { - const MACROBLOCKD *const xd = &cpi->mb.e_mbd; - if (vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) { +static int write_skip(const VP9_COMMON *cm, const MACROBLOCKD *xd, + int segment_id, const MODE_INFO *mi, vp9_writer *w) { + if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { return 1; } else { const int skip = mi->mbmi.skip; - vp9_write(w, skip, vp9_get_skip_prob(&cpi->common, xd)); + vp9_write(w, skip, vp9_get_skip_prob(cm, xd)); return skip; } } @@ -122,7 +120,7 @@ static void update_switchable_interp_probs(VP9_COMMON *cm, vp9_writer *w) { } static void pack_mb_tokens(vp9_writer *w, - TOKENEXTRA **tp, const TOKENEXTRA *stop) { + TOKENEXTRA **tp, const TOKENEXTRA *const stop) { TOKENEXTRA *p = *tp; while (p < stop && p->token != EOSB_TOKEN) { @@ -189,9 +187,8 @@ static void write_segment_id(vp9_writer *w, const struct segmentation *seg, } // This function 
encodes the reference frame -static void write_ref_frames(const VP9_COMP *cpi, vp9_writer *w) { - const VP9_COMMON *const cm = &cpi->common; - const MACROBLOCKD *const xd = &cpi->mb.e_mbd; +static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *xd, + vp9_writer *w) { const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; const int is_compound = has_second_ref(mbmi); const int segment_id = mbmi->segment_id; @@ -233,7 +230,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, const MACROBLOCKD *const xd = &x->e_mbd; const struct segmentation *const seg = &cm->seg; const MB_MODE_INFO *const mbmi = &mi->mbmi; - const MB_PREDICTION_MODE mode = mbmi->mode; + const PREDICTION_MODE mode = mbmi->mode; const int segment_id = mbmi->segment_id; const BLOCK_SIZE bsize = mbmi->sb_type; const int allow_hp = cm->allow_high_precision_mv; @@ -253,7 +250,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, } } - skip = write_skip(cpi, segment_id, mi, w); + skip = write_skip(cm, xd, segment_id, mi, w); if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) vp9_write(w, is_inter, vp9_get_intra_inter_prob(cm, xd)); @@ -261,7 +258,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT && !(is_inter && (skip || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) { - write_selected_tx_size(cpi, mbmi->tx_size, bsize, w); + write_selected_tx_size(cm, xd, mbmi->tx_size, bsize, w); } if (!is_inter) { @@ -273,7 +270,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; for (idy = 0; idy < 2; idy += num_4x4_h) { for (idx = 0; idx < 2; idx += num_4x4_w) { - const MB_PREDICTION_MODE b_mode = mi->bmi[idy * 2 + idx].as_mode; + const PREDICTION_MODE b_mode = mi->bmi[idy * 2 + idx].as_mode; write_intra_mode(w, b_mode, cm->fc.y_mode_prob[0]); } } @@ -282,7 +279,7 @@ static void 
pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, } else { const int mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]]; const vp9_prob *const inter_probs = cm->fc.inter_mode_probs[mode_ctx]; - write_ref_frames(cpi, w); + write_ref_frames(cm, xd, w); // If segment skip is not enabled code the mode. if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { @@ -308,7 +305,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, for (idy = 0; idy < 2; idy += num_4x4_h) { for (idx = 0; idx < 2; idx += num_4x4_w) { const int j = idy * 2 + idx; - const MB_PREDICTION_MODE b_mode = mi->bmi[j].as_mode; + const PREDICTION_MODE b_mode = mi->bmi[j].as_mode; write_inter_mode(w, b_mode, inter_probs); ++cm->counts.inter_mode[mode_ctx][INTER_OFFSET(b_mode)]; if (b_mode == NEWMV) { @@ -330,10 +327,8 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, } } -static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8, - vp9_writer *w) { - const VP9_COMMON *const cm = &cpi->common; - const MACROBLOCKD *const xd = &cpi->mb.e_mbd; +static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd, + MODE_INFO **mi_8x8, vp9_writer *w) { const struct segmentation *const seg = &cm->seg; const MODE_INFO *const mi = mi_8x8[0]; const MODE_INFO *const above_mi = mi_8x8[-xd->mi_stride]; @@ -344,10 +339,10 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8, if (seg->update_map) write_segment_id(w, seg, mbmi->segment_id); - write_skip(cpi, mbmi->segment_id, mi, w); + write_skip(cm, xd, mbmi->segment_id, mi, w); if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT) - write_selected_tx_size(cpi, mbmi->tx_size, bsize, w); + write_selected_tx_size(cm, xd, mbmi->tx_size, bsize, w); if (bsize >= BLOCK_8X8) { write_intra_mode(w, mbmi->mode, get_y_mode_probs(mi, above_mi, left_mi, 0)); @@ -369,9 +364,10 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8, } static void write_modes_b(VP9_COMP *cpi, 
const TileInfo *const tile, - vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end, + vp9_writer *w, TOKENEXTRA **tok, + const TOKENEXTRA *const tok_end, int mi_row, int mi_col) { - VP9_COMMON *const cm = &cpi->common; + const VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->mb.e_mbd; MODE_INFO *m; @@ -383,7 +379,7 @@ static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile, mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type], cm->mi_rows, cm->mi_cols); if (frame_is_intra_only(cm)) { - write_mb_modes_kf(cpi, xd->mi, w); + write_mb_modes_kf(cm, xd, xd->mi, w); } else { pack_inter_mode_mvs(cpi, m, w); } @@ -392,7 +388,8 @@ static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile, pack_mb_tokens(w, tok, tok_end); } -static void write_partition(VP9_COMMON *cm, MACROBLOCKD *xd, +static void write_partition(const VP9_COMMON *const cm, + const MACROBLOCKD *const xd, int hbs, int mi_row, int mi_col, PARTITION_TYPE p, BLOCK_SIZE bsize, vp9_writer *w) { const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize); @@ -414,17 +411,17 @@ static void write_partition(VP9_COMMON *cm, MACROBLOCKD *xd, } static void write_modes_sb(VP9_COMP *cpi, - const TileInfo *const tile, - vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end, + const TileInfo *const tile, vp9_writer *w, + TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, int mi_row, int mi_col, BLOCK_SIZE bsize) { - VP9_COMMON *const cm = &cpi->common; + const VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->mb.e_mbd; const int bsl = b_width_log2(bsize); const int bs = (1 << bsl) / 4; PARTITION_TYPE partition; BLOCK_SIZE subsize; - MODE_INFO *m = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]; + const MODE_INFO *m = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; @@ -470,8 +467,8 @@ static void write_modes_sb(VP9_COMP *cpi, } static void write_modes(VP9_COMP *cpi, - const TileInfo *const tile, 
- vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end) { + const TileInfo *const tile, vp9_writer *w, + TOKENEXTRA **tok, const TOKENEXTRA *const tok_end) { int mi_row, mi_col; for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; @@ -485,8 +482,8 @@ static void write_modes(VP9_COMP *cpi, } static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size, - vp9_coeff_stats *coef_branch_ct) { - vp9_coeff_probs_model *coef_probs = cpi->frame_coef_probs[tx_size]; + vp9_coeff_stats *coef_branch_ct, + vp9_coeff_probs_model *coef_probs) { vp9_coeff_count *coef_counts = cpi->coef_counts[tx_size]; unsigned int (*eob_branch_ct)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] = cpi->common.counts.eob_branch[tx_size]; @@ -513,16 +510,15 @@ static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size, static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, TX_SIZE tx_size, - vp9_coeff_stats *frame_branch_ct) { - vp9_coeff_probs_model *new_frame_coef_probs = cpi->frame_coef_probs[tx_size]; - vp9_coeff_probs_model *old_frame_coef_probs = - cpi->common.fc.coef_probs[tx_size]; + vp9_coeff_stats *frame_branch_ct, + vp9_coeff_probs_model *new_coef_probs) { + vp9_coeff_probs_model *old_coef_probs = cpi->common.fc.coef_probs[tx_size]; const vp9_prob upd = DIFF_UPDATE_PROB; const int entropy_nodes_update = UNCONSTRAINED_NODES; int i, j, k, l, t; switch (cpi->sf.use_fast_coef_updates) { case TWO_LOOP: { - /* dry run to see if there is any udpate at all needed */ + /* dry run to see if there is any update at all needed */ int savings = 0; int update[2] = {0, 0}; for (i = 0; i < PLANE_TYPES; ++i) { @@ -530,14 +526,14 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, for (k = 0; k < COEF_BANDS; ++k) { for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { for (t = 0; t < entropy_nodes_update; ++t) { - vp9_prob newp = new_frame_coef_probs[i][j][k][l][t]; - const vp9_prob oldp = old_frame_coef_probs[i][j][k][l][t]; + vp9_prob newp = 
new_coef_probs[i][j][k][l][t]; + const vp9_prob oldp = old_coef_probs[i][j][k][l][t]; int s; int u = 0; if (t == PIVOT_NODE) s = vp9_prob_diff_update_savings_search_model( frame_branch_ct[i][j][k][l][0], - old_frame_coef_probs[i][j][k][l], &newp, upd); + old_coef_probs[i][j][k][l], &newp, upd); else s = vp9_prob_diff_update_savings_search( frame_branch_ct[i][j][k][l][t], oldp, &newp, upd); @@ -567,15 +563,15 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { // calc probs and branch cts for this frame only for (t = 0; t < entropy_nodes_update; ++t) { - vp9_prob newp = new_frame_coef_probs[i][j][k][l][t]; - vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t; + vp9_prob newp = new_coef_probs[i][j][k][l][t]; + vp9_prob *oldp = old_coef_probs[i][j][k][l] + t; const vp9_prob upd = DIFF_UPDATE_PROB; int s; int u = 0; if (t == PIVOT_NODE) s = vp9_prob_diff_update_savings_search_model( frame_branch_ct[i][j][k][l][0], - old_frame_coef_probs[i][j][k][l], &newp, upd); + old_coef_probs[i][j][k][l], &newp, upd); else s = vp9_prob_diff_update_savings_search( frame_branch_ct[i][j][k][l][t], @@ -612,8 +608,8 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { // calc probs and branch cts for this frame only for (t = 0; t < entropy_nodes_update; ++t) { - vp9_prob newp = new_frame_coef_probs[i][j][k][l][t]; - vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t; + vp9_prob newp = new_coef_probs[i][j][k][l][t]; + vp9_prob *oldp = old_coef_probs[i][j][k][l] + t; int s; int u = 0; if (l >= prev_coef_contexts_to_update || @@ -623,7 +619,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, if (t == PIVOT_NODE) s = vp9_prob_diff_update_savings_search_model( frame_branch_ct[i][j][k][l][0], - old_frame_coef_probs[i][j][k][l], &newp, upd); + old_coef_probs[i][j][k][l], &newp, upd); else s = 
vp9_prob_diff_update_savings_search( frame_branch_ct[i][j][k][l][t], @@ -670,14 +666,17 @@ static void update_coef_probs(VP9_COMP *cpi, vp9_writer* w) { const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode]; TX_SIZE tx_size; vp9_coeff_stats frame_branch_ct[TX_SIZES][PLANE_TYPES]; + vp9_coeff_probs_model frame_coef_probs[TX_SIZES][PLANE_TYPES]; vp9_clear_system_state(); for (tx_size = TX_4X4; tx_size <= TX_32X32; ++tx_size) - build_tree_distribution(cpi, tx_size, frame_branch_ct[tx_size]); + build_tree_distribution(cpi, tx_size, frame_branch_ct[tx_size], + frame_coef_probs[tx_size]); for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) - update_coef_probs_common(w, cpi, tx_size, frame_branch_ct[tx_size]); + update_coef_probs_common(w, cpi, tx_size, frame_branch_ct[tx_size], + frame_coef_probs[tx_size]); } static void encode_loopfilter(struct loopfilter *lf, @@ -730,7 +729,7 @@ static void write_delta_q(struct vp9_write_bit_buffer *wb, int delta_q) { } } -static void encode_quantization(VP9_COMMON *cm, +static void encode_quantization(const VP9_COMMON *const cm, struct vp9_write_bit_buffer *wb) { vp9_wb_write_literal(wb, cm->base_qindex, QINDEX_BITS); write_delta_q(wb, cm->y_dc_delta_q); @@ -738,12 +737,11 @@ static void encode_quantization(VP9_COMMON *cm, write_delta_q(wb, cm->uv_ac_delta_q); } - -static void encode_segmentation(VP9_COMP *cpi, +static void encode_segmentation(VP9_COMMON *cm, MACROBLOCKD *xd, struct vp9_write_bit_buffer *wb) { int i, j; - struct segmentation *seg = &cpi->common.seg; + const struct segmentation *seg = &cm->seg; vp9_wb_write_bit(wb, seg->enabled); if (!seg->enabled) @@ -753,7 +751,7 @@ static void encode_segmentation(VP9_COMP *cpi, vp9_wb_write_bit(wb, seg->update_map); if (seg->update_map) { // Select the coding strategy (temporal or spatial) - vp9_choose_segmap_coding_method(cpi); + vp9_choose_segmap_coding_method(cm, xd); // Write out probabilities used to decode unpredicted macro-block segments for (i = 0; i < 
SEG_TREE_PROBS; i++) { const int prob = seg->tree_probs[i]; @@ -801,7 +799,6 @@ static void encode_segmentation(VP9_COMP *cpi, } } - static void encode_txfm_probs(VP9_COMMON *cm, vp9_writer *w) { // Mode vp9_write_literal(w, MIN(cm->tx_mode, ALLOW_32X32), 2); @@ -870,7 +867,8 @@ static void fix_interp_filter(VP9_COMMON *cm) { } } -static void write_tile_info(VP9_COMMON *cm, struct vp9_write_bit_buffer *wb) { +static void write_tile_info(const VP9_COMMON *const cm, + struct vp9_write_bit_buffer *wb) { int min_log2_tile_cols, max_log2_tile_cols, ones; vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); @@ -889,39 +887,29 @@ static void write_tile_info(VP9_COMMON *cm, struct vp9_write_bit_buffer *wb) { } static int get_refresh_mask(VP9_COMP *cpi) { - // Should the GF or ARF be updated using the transmitted frame or buffer -#if CONFIG_MULTIPLE_ARF - if (!cpi->multi_arf_enabled && cpi->refresh_golden_frame && - !cpi->refresh_alt_ref_frame) { -#else - if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame && - !cpi->use_svc) { -#endif - // Preserve the previously existing golden frame and update the frame in - // the alt ref slot instead. This is highly specific to the use of - // alt-ref as a forward reference, and this needs to be generalized as - // other uses are implemented (like RTC/temporal scaling) - // - // gld_fb_idx and alt_fb_idx need to be swapped for future frames, but - // that happens in vp9_onyx_if.c:update_reference_frames() so that it can - // be done outside of the recode loop. - return (cpi->refresh_last_frame << cpi->lst_fb_idx) | - (cpi->refresh_golden_frame << cpi->alt_fb_idx); - } else { - int arf_idx = cpi->alt_fb_idx; -#if CONFIG_MULTIPLE_ARF - // Determine which ARF buffer to use to encode this ARF frame. - if (cpi->multi_arf_enabled) { - int sn = cpi->sequence_number; - arf_idx = (cpi->frame_coding_order[sn] < 0) ? 
- cpi->arf_buffer_idx[sn + 1] : - cpi->arf_buffer_idx[sn]; - } -#endif - return (cpi->refresh_last_frame << cpi->lst_fb_idx) | - (cpi->refresh_golden_frame << cpi->gld_fb_idx) | - (cpi->refresh_alt_ref_frame << arf_idx); + if (vp9_preserve_existing_gf(cpi)) { + // We have decided to preserve the previously existing golden frame as our + // new ARF frame. However, in the short term we leave it in the GF slot and, + // if we're updating the GF with the current decoded frame, we save it + // instead to the ARF slot. + // Later, in the function vp9_encoder.c:vp9_update_reference_frames() we + // will swap gld_fb_idx and alt_fb_idx to achieve our objective. We do it + // there so that it can be done outside of the recode loop. + // Note: This is highly specific to the use of ARF as a forward reference, + // and this needs to be generalized as other uses are implemented + // (like RTC/temporal scalability). + return (cpi->refresh_last_frame << cpi->lst_fb_idx) | + (cpi->refresh_golden_frame << cpi->alt_fb_idx); + } else { + int arf_idx = cpi->alt_fb_idx; + if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + arf_idx = gf_group->arf_update_idx[gf_group->index]; } + return (cpi->refresh_last_frame << cpi->lst_fb_idx) | + (cpi->refresh_golden_frame << cpi->gld_fb_idx) | + (cpi->refresh_alt_ref_frame << arf_idx); + } } static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { @@ -1006,9 +994,10 @@ static void write_frame_size_with_refs(VP9_COMP *cpi, found = cm->width == cfg->y_crop_width && cm->height == cfg->y_crop_height; - // TODO(ivan): This prevents a bug while more than 3 buffers are used. Do it - // in a better way. 
- if (cpi->use_svc) { + // Set "found" to 0 for temporal svc and for spatial svc key frame + if (cpi->use_svc && + (cpi->svc.number_spatial_layers == 1 || + cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame)) { found = 0; } vp9_wb_write_bit(wb, found); @@ -1033,9 +1022,45 @@ static void write_sync_code(struct vp9_write_bit_buffer *wb) { static void write_profile(BITSTREAM_PROFILE profile, struct vp9_write_bit_buffer *wb) { - assert(profile < MAX_PROFILES); - vp9_wb_write_bit(wb, profile & 1); - vp9_wb_write_bit(wb, profile >> 1); + switch (profile) { + case PROFILE_0: + vp9_wb_write_literal(wb, 0, 2); + break; + case PROFILE_1: + vp9_wb_write_literal(wb, 2, 2); + break; + case PROFILE_2: + vp9_wb_write_literal(wb, 1, 2); + break; + case PROFILE_3: + vp9_wb_write_literal(wb, 6, 3); + break; + default: + assert(0); + } +} + +static void write_bitdepth_colorspace_sampling( + VP9_COMMON *const cm, struct vp9_write_bit_buffer *wb) { + if (cm->profile >= PROFILE_2) { + assert(cm->bit_depth > BITS_8); + vp9_wb_write_bit(wb, cm->bit_depth - BITS_10); + } + vp9_wb_write_literal(wb, cm->color_space, 3); + if (cm->color_space != SRGB) { + vp9_wb_write_bit(wb, 0); // 0: [16, 235] (i.e. 
xvYCC), 1: [0, 255] + if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) { + assert(cm->subsampling_x != 1 || cm->subsampling_y != 1); + vp9_wb_write_bit(wb, cm->subsampling_x); + vp9_wb_write_bit(wb, cm->subsampling_y); + vp9_wb_write_bit(wb, 0); // unused + } else { + assert(cm->subsampling_x == 1 && cm->subsampling_y == 1); + } + } else { + assert(cm->profile == PROFILE_1 || cm->profile == PROFILE_3); + vp9_wb_write_bit(wb, 0); // unused + } } static void write_uncompressed_header(VP9_COMP *cpi, @@ -1052,25 +1077,8 @@ static void write_uncompressed_header(VP9_COMP *cpi, vp9_wb_write_bit(wb, cm->error_resilient_mode); if (cm->frame_type == KEY_FRAME) { - const COLOR_SPACE cs = UNKNOWN; write_sync_code(wb); - if (cm->profile > PROFILE_1) { - assert(cm->bit_depth > BITS_8); - vp9_wb_write_bit(wb, cm->bit_depth - BITS_10); - } - vp9_wb_write_literal(wb, cs, 3); - if (cs != SRGB) { - vp9_wb_write_bit(wb, 0); // 0: [16, 235] (i.e. xvYCC), 1: [0, 255] - if (cm->profile >= PROFILE_1) { - vp9_wb_write_bit(wb, cm->subsampling_x); - vp9_wb_write_bit(wb, cm->subsampling_y); - vp9_wb_write_bit(wb, 0); // has extra plane - } - } else { - assert(cm->profile == PROFILE_1); - vp9_wb_write_bit(wb, 0); // has extra plane - } - + write_bitdepth_colorspace_sampling(cm, wb); write_frame_size(cm, wb); } else { if (!cm->show_frame) @@ -1082,6 +1090,11 @@ static void write_uncompressed_header(VP9_COMP *cpi, if (cm->intra_only) { write_sync_code(wb); + // Note for profile 0, 420 8bpp is assumed. 
+ if (cm->profile > PROFILE_0) { + write_bitdepth_colorspace_sampling(cm, wb); + } + vp9_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES); write_frame_size(cm, wb); } else { @@ -1111,7 +1124,7 @@ static void write_uncompressed_header(VP9_COMP *cpi, encode_loopfilter(&cm->lf, wb); encode_quantization(cm, wb); - encode_segmentation(cpi, wb); + encode_segmentation(cm, &cpi->mb.e_mbd, wb); write_tile_info(cm, wb); } @@ -1203,11 +1216,9 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size) { saved_wb = wb; vp9_wb_write_literal(&wb, 0, 16); // don't know in advance first part. size - uncompressed_hdr_size = vp9_rb_bytes_written(&wb); + uncompressed_hdr_size = vp9_wb_bytes_written(&wb); data += uncompressed_hdr_size; - vp9_compute_update_table(); - vp9_clear_system_state(); first_part_size = write_compressed_header(cpi, data); @@ -1219,4 +1230,3 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size) { *size = data - dest; } - diff --git a/libvpx/vp9/encoder/vp9_bitstream.h b/libvpx/vp9/encoder/vp9_bitstream.h index ddfd0ed4f..8e82d1c97 100644 --- a/libvpx/vp9/encoder/vp9_bitstream.h +++ b/libvpx/vp9/encoder/vp9_bitstream.h @@ -16,11 +16,21 @@ extern "C" { #endif -struct VP9_COMP; +#include "vp9/encoder/vp9_encoder.h" void vp9_entropy_mode_init(); -void vp9_pack_bitstream(struct VP9_COMP *cpi, uint8_t *dest, size_t *size); +void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size); + +static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) { + return !cpi->multi_arf_allowed && cpi->refresh_golden_frame && + cpi->rc.is_src_frame_alt_ref && + (!cpi->use_svc || // Add spatial svc base layer case here + (is_spatial_svc(cpi) && + cpi->svc.spatial_layer_id == 0 && + cpi->svc.layer_context[0].gold_ref_idx >=0 && + cpi->oxcf.ss_play_alternate[0])); +} #ifdef __cplusplus } // extern "C" diff --git a/libvpx/vp9/encoder/vp9_block.h b/libvpx/vp9/encoder/vp9_block.h index 7729d84b3..bd3b0fdc8 100644 --- 
a/libvpx/vp9/encoder/vp9_block.h +++ b/libvpx/vp9/encoder/vp9_block.h @@ -20,48 +20,11 @@ extern "C" { #endif -// motion search site typedef struct { - MV mv; - int offset; -} search_site; - -// Structure to hold snapshot of coding context during the mode picking process -typedef struct { - MODE_INFO mic; - uint8_t *zcoeff_blk; - int16_t *coeff[MAX_MB_PLANE][3]; - int16_t *qcoeff[MAX_MB_PLANE][3]; - int16_t *dqcoeff[MAX_MB_PLANE][3]; - uint16_t *eobs[MAX_MB_PLANE][3]; - - // dual buffer pointers, 0: in use, 1: best in store - int16_t *coeff_pbuf[MAX_MB_PLANE][3]; - int16_t *qcoeff_pbuf[MAX_MB_PLANE][3]; - int16_t *dqcoeff_pbuf[MAX_MB_PLANE][3]; - uint16_t *eobs_pbuf[MAX_MB_PLANE][3]; - - int is_coded; - int num_4x4_blk; - int skip; - int_mv best_ref_mv[2]; - int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; - int rate; - int distortion; - int best_mode_index; - int rddiv; - int rdmult; - int hybrid_pred_diff; - int comp_pred_diff; - int single_pred_diff; - int64_t tx_rd_diff[TX_MODES]; - int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]; - - // motion vector cache for adaptive motion search control in partition - // search loop - int_mv pred_mv[MAX_REF_FRAMES]; - INTERP_FILTER pred_interp_filter; -} PICK_MODE_CONTEXT; + unsigned int sse; + int sum; + unsigned int var; +} diff; struct macroblock_plane { DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]); @@ -71,11 +34,14 @@ struct macroblock_plane { struct buf_2d src; // Quantizer setings + int16_t *quant_fp; + int16_t *round_fp; int16_t *quant; int16_t *quant_shift; int16_t *zbin; int16_t *round; + int64_t quant_thred[2]; // Zbin Over Quant value int16_t zbin_extra; }; @@ -91,24 +57,17 @@ struct macroblock { MACROBLOCKD e_mbd; int skip_block; - int select_txfm_size; + int select_tx_size; int skip_recode; int skip_optimize; int q_index; - search_site *ss; - int ss_count; - int searches_per_step; - int errorperbit; int sadperbit16; int sadperbit4; int rddiv; int rdmult; - unsigned int mb_energy; - unsigned int 
*mb_activity_ptr; - int *mb_norm_activity_ptr; - signed int act_zbin_adj; + int mb_energy; int mv_best_ref_index[MAX_REF_FRAMES]; unsigned int max_mv_context[MAX_REF_FRAMES]; @@ -130,17 +89,6 @@ struct macroblock { int *nmvsadcost_hp[2]; int **mvsadcost; - int mbmode_cost[INTRA_MODES]; - unsigned inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES]; - int intra_uv_mode_cost[FRAME_TYPES][INTRA_MODES]; - int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES]; - int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS]; - - unsigned char sb_index; // index of 32x32 block inside the 64x64 block - unsigned char mb_index; // index of 16x16 block inside the 32x32 block - unsigned char b_index; // index of 8x8 block inside the 16x16 block - unsigned char ab_index; // index of 4x4 block inside the 8x8 block - // These define limits to motion vector components to prevent them // from extending outside the UMV borders int mv_col_min; @@ -153,84 +101,32 @@ struct macroblock { int encode_breakout; - int in_active_map; - // note that token_costs is the cost when eob node is skipped vp9_coeff_cost token_costs[TX_SIZES]; + int in_static_area; + int optimize; // indicate if it is in the rd search loop or encoding process int use_lp32x32fdct; int skip_encode; + // use fast quantization process + int quant_fp; + + // skip forward transform and quantization + int skip_txfm[MAX_MB_PLANE]; + + int64_t bsse[MAX_MB_PLANE]; + // Used to store sub partition's choices. - int_mv pred_mv[MAX_REF_FRAMES]; - - // TODO(jingning): Need to refactor the structure arrays that buffers the - // coding mode decisions of each partition type. 
- PICK_MODE_CONTEXT ab4x4_context[4][4][4]; - PICK_MODE_CONTEXT sb8x4_context[4][4][4]; - PICK_MODE_CONTEXT sb4x8_context[4][4][4]; - PICK_MODE_CONTEXT sb8x8_context[4][4][4]; - PICK_MODE_CONTEXT sb8x16_context[4][4][2]; - PICK_MODE_CONTEXT sb16x8_context[4][4][2]; - PICK_MODE_CONTEXT mb_context[4][4]; - PICK_MODE_CONTEXT sb32x16_context[4][2]; - PICK_MODE_CONTEXT sb16x32_context[4][2]; - // when 4 MBs share coding parameters: - PICK_MODE_CONTEXT sb32_context[4]; - PICK_MODE_CONTEXT sb32x64_context[2]; - PICK_MODE_CONTEXT sb64x32_context[2]; - PICK_MODE_CONTEXT sb64_context; - int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES]; - - BLOCK_SIZE b_partitioning[4][4][4]; - BLOCK_SIZE mb_partitioning[4][4]; - BLOCK_SIZE sb_partitioning[4]; - BLOCK_SIZE sb64_partitioning; + MV pred_mv[MAX_REF_FRAMES]; void (*fwd_txm4x4)(const int16_t *input, int16_t *output, int stride); + void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob); }; -// TODO(jingning): the variables used here are little complicated. need further -// refactoring on organizing the temporary buffers, when recursive -// partition down to 4x4 block size is enabled. 
-static INLINE PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, - BLOCK_SIZE bsize) { - switch (bsize) { - case BLOCK_64X64: - return &x->sb64_context; - case BLOCK_64X32: - return &x->sb64x32_context[x->sb_index]; - case BLOCK_32X64: - return &x->sb32x64_context[x->sb_index]; - case BLOCK_32X32: - return &x->sb32_context[x->sb_index]; - case BLOCK_32X16: - return &x->sb32x16_context[x->sb_index][x->mb_index]; - case BLOCK_16X32: - return &x->sb16x32_context[x->sb_index][x->mb_index]; - case BLOCK_16X16: - return &x->mb_context[x->sb_index][x->mb_index]; - case BLOCK_16X8: - return &x->sb16x8_context[x->sb_index][x->mb_index][x->b_index]; - case BLOCK_8X16: - return &x->sb8x16_context[x->sb_index][x->mb_index][x->b_index]; - case BLOCK_8X8: - return &x->sb8x8_context[x->sb_index][x->mb_index][x->b_index]; - case BLOCK_8X4: - return &x->sb8x4_context[x->sb_index][x->mb_index][x->b_index]; - case BLOCK_4X8: - return &x->sb4x8_context[x->sb_index][x->mb_index][x->b_index]; - case BLOCK_4X4: - return &x->ab4x4_context[x->sb_index][x->mb_index][x->b_index]; - default: - assert(0); - return NULL; - } -} - #ifdef __cplusplus } // extern "C" #endif diff --git a/libvpx/vp9/encoder/vp9_context_tree.c b/libvpx/vp9/encoder/vp9_context_tree.c new file mode 100644 index 000000000..9b7a93267 --- /dev/null +++ b/libvpx/vp9/encoder/vp9_context_tree.c @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "vp9/encoder/vp9_context_tree.h" +#include "vp9/encoder/vp9_encoder.h" + +static const BLOCK_SIZE square[] = { + BLOCK_8X8, + BLOCK_16X16, + BLOCK_32X32, + BLOCK_64X64, +}; + +static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk, + PICK_MODE_CONTEXT *ctx) { + const int num_blk = (num_4x4_blk < 4 ? 4 : num_4x4_blk); + const int num_pix = num_blk << 4; + int i, k; + ctx->num_4x4_blk = num_blk; + + CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, + vpx_calloc(num_4x4_blk, sizeof(uint8_t))); + for (i = 0; i < MAX_MB_PLANE; ++i) { + for (k = 0; k < 3; ++k) { + CHECK_MEM_ERROR(cm, ctx->coeff[i][k], + vpx_memalign(16, num_pix * sizeof(int16_t))); + CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k], + vpx_memalign(16, num_pix * sizeof(int16_t))); + CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k], + vpx_memalign(16, num_pix * sizeof(int16_t))); + CHECK_MEM_ERROR(cm, ctx->eobs[i][k], + vpx_memalign(16, num_pix * sizeof(uint16_t))); + ctx->coeff_pbuf[i][k] = ctx->coeff[i][k]; + ctx->qcoeff_pbuf[i][k] = ctx->qcoeff[i][k]; + ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k]; + ctx->eobs_pbuf[i][k] = ctx->eobs[i][k]; + } + } +} + +static void free_mode_context(PICK_MODE_CONTEXT *ctx) { + int i, k; + vpx_free(ctx->zcoeff_blk); + ctx->zcoeff_blk = 0; + for (i = 0; i < MAX_MB_PLANE; ++i) { + for (k = 0; k < 3; ++k) { + vpx_free(ctx->coeff[i][k]); + ctx->coeff[i][k] = 0; + vpx_free(ctx->qcoeff[i][k]); + ctx->qcoeff[i][k] = 0; + vpx_free(ctx->dqcoeff[i][k]); + ctx->dqcoeff[i][k] = 0; + vpx_free(ctx->eobs[i][k]); + ctx->eobs[i][k] = 0; + } + } +} + +static void alloc_tree_contexts(VP9_COMMON *cm, PC_TREE *tree, + int num_4x4_blk) { + alloc_mode_context(cm, num_4x4_blk, &tree->none); + alloc_mode_context(cm, num_4x4_blk/2, &tree->horizontal[0]); + alloc_mode_context(cm, num_4x4_blk/2, &tree->vertical[0]); + + /* TODO(Jbb): for 4x8 and 8x4 these allocated values are not used. + * Figure out a better way to do this. 
*/ + alloc_mode_context(cm, num_4x4_blk/2, &tree->horizontal[1]); + alloc_mode_context(cm, num_4x4_blk/2, &tree->vertical[1]); +} + +static void free_tree_contexts(PC_TREE *tree) { + free_mode_context(&tree->none); + free_mode_context(&tree->horizontal[0]); + free_mode_context(&tree->horizontal[1]); + free_mode_context(&tree->vertical[0]); + free_mode_context(&tree->vertical[1]); +} + +// This function sets up a tree of contexts such that at each square +// partition level. There are contexts for none, horizontal, vertical, and +// split. Along with a block_size value and a selected block_size which +// represents the state of our search. +void vp9_setup_pc_tree(VP9_COMMON *cm, VP9_COMP *cpi) { + int i, j; + const int leaf_nodes = 64; + const int tree_nodes = 64 + 16 + 4 + 1; + int pc_tree_index = 0; + PC_TREE *this_pc; + PICK_MODE_CONTEXT *this_leaf; + int square_index = 1; + int nodes; + + vpx_free(cpi->leaf_tree); + CHECK_MEM_ERROR(cm, cpi->leaf_tree, vpx_calloc(leaf_nodes, + sizeof(*cpi->leaf_tree))); + vpx_free(cpi->pc_tree); + CHECK_MEM_ERROR(cm, cpi->pc_tree, vpx_calloc(tree_nodes, + sizeof(*cpi->pc_tree))); + + this_pc = &cpi->pc_tree[0]; + this_leaf = &cpi->leaf_tree[0]; + + // 4x4 blocks smaller than 8x8 but in the same 8x8 block share the same + // context so we only need to allocate 1 for each 8x8 block. + for (i = 0; i < leaf_nodes; ++i) + alloc_mode_context(cm, 1, &cpi->leaf_tree[i]); + + // Sets up all the leaf nodes in the tree. + for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) { + PC_TREE *const tree = &cpi->pc_tree[pc_tree_index]; + tree->block_size = square[0]; + alloc_tree_contexts(cm, tree, 4); + tree->leaf_split[0] = this_leaf++; + for (j = 1; j < 4; j++) + tree->leaf_split[j] = tree->leaf_split[0]; + } + + // Each node has 4 leaf nodes, fill each block_size level of the tree + // from leafs to the root. 
 + for (nodes = 16; nodes > 0; nodes >>= 2) { + for (i = 0; i < nodes; ++i) { + PC_TREE *const tree = &cpi->pc_tree[pc_tree_index]; + alloc_tree_contexts(cm, tree, 4 << (2 * square_index)); + tree->block_size = square[square_index]; + for (j = 0; j < 4; j++) + tree->split[j] = this_pc++; + ++pc_tree_index; + } + ++square_index; + } + cpi->pc_root = &cpi->pc_tree[tree_nodes - 1]; + cpi->pc_root[0].none.best_mode_index = 2; +} + +void vp9_free_pc_tree(VP9_COMP *cpi) { + const int tree_nodes = 64 + 16 + 4 + 1; + int i; + + // Free all 4x4 mode contexts + for (i = 0; i < 64; ++i) + free_mode_context(&cpi->leaf_tree[i]); + + // Free the mode contexts of every node in the tree. + for (i = 0; i < tree_nodes; ++i) + free_tree_contexts(&cpi->pc_tree[i]); + + vpx_free(cpi->pc_tree); + cpi->pc_tree = NULL; + vpx_free(cpi->leaf_tree); + cpi->leaf_tree = NULL; +} diff --git a/libvpx/vp9/encoder/vp9_context_tree.h b/libvpx/vp9/encoder/vp9_context_tree.h new file mode 100644 index 000000000..d60e6c3eb --- /dev/null +++ b/libvpx/vp9/encoder/vp9_context_tree.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP9_ENCODER_VP9_CONTEXT_TREE_H_ +#define VP9_ENCODER_VP9_CONTEXT_TREE_H_ + +#include "vp9/common/vp9_onyxc_int.h" + +struct VP9_COMP; + +// Structure to hold snapshot of coding context during the mode picking process +typedef struct { + MODE_INFO mic; + uint8_t *zcoeff_blk; + int16_t *coeff[MAX_MB_PLANE][3]; + int16_t *qcoeff[MAX_MB_PLANE][3]; + int16_t *dqcoeff[MAX_MB_PLANE][3]; + uint16_t *eobs[MAX_MB_PLANE][3]; + + // dual buffer pointers, 0: in use, 1: best in store + int16_t *coeff_pbuf[MAX_MB_PLANE][3]; + int16_t *qcoeff_pbuf[MAX_MB_PLANE][3]; + int16_t *dqcoeff_pbuf[MAX_MB_PLANE][3]; + uint16_t *eobs_pbuf[MAX_MB_PLANE][3]; + + int is_coded; + int num_4x4_blk; + int skip; + int skip_txfm[MAX_MB_PLANE]; + int best_mode_index; + int hybrid_pred_diff; + int comp_pred_diff; + int single_pred_diff; + int64_t tx_rd_diff[TX_MODES]; + int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]; + +#if CONFIG_VP9_TEMPORAL_DENOISING + unsigned int newmv_sse; + unsigned int zeromv_sse; + PREDICTION_MODE best_sse_inter_mode; + int_mv best_sse_mv; + MV_REFERENCE_FRAME best_reference_frame; + MV_REFERENCE_FRAME best_zeromv_reference_frame; +#endif + + // motion vector cache for adaptive motion search control in partition + // search loop + MV pred_mv[MAX_REF_FRAMES]; + INTERP_FILTER pred_interp_filter; +} PICK_MODE_CONTEXT; + +typedef struct PC_TREE { + int index; + PARTITION_TYPE partitioning; + BLOCK_SIZE block_size; + PICK_MODE_CONTEXT none; + PICK_MODE_CONTEXT horizontal[2]; + PICK_MODE_CONTEXT vertical[2]; + union { + struct PC_TREE *split[4]; + PICK_MODE_CONTEXT *leaf_split[4]; + }; +} PC_TREE; + +void vp9_setup_pc_tree(struct VP9Common *cm, struct VP9_COMP *cpi); +void vp9_free_pc_tree(struct VP9_COMP *cpi); + +#endif /* VP9_ENCODER_VP9_CONTEXT_TREE_H_ */ diff --git a/libvpx/vp9/encoder/vp9_dct.c b/libvpx/vp9/encoder/vp9_dct.c index d5232393f..59222f0a9 100644 --- a/libvpx/vp9/encoder/vp9_dct.c +++ b/libvpx/vp9/encoder/vp9_dct.c @@ -43,6 +43,17 @@ static 
void fdct4(const int16_t *input, int16_t *output) { output[3] = fdct_round_shift(temp2); } +void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride) { + int r, c; + int16_t sum = 0; + for (r = 0; r < 4; ++r) + for (c = 0; c < 4; ++c) + sum += input[r * stride + c]; + + output[0] = sum << 1; + output[1] = 0; +} + void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose @@ -240,6 +251,17 @@ static void fdct8(const int16_t *input, int16_t *output) { output[7] = fdct_round_shift(t3); } +void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride) { + int r, c; + int16_t sum = 0; + for (r = 0; r < 8; ++r) + for (c = 0; c < 8; ++c) + sum += input[r * stride + c]; + + output[0] = sum; + output[1] = 0; +} + void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) { int i, j; int16_t intermediate[64]; @@ -311,6 +333,17 @@ void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) { } } +void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride) { + int r, c; + int16_t sum = 0; + for (r = 0; r < 16; ++r) + for (c = 0; c < 16; ++c) + sum += input[r * stride + c]; + + output[0] = sum >> 1; + output[1] = 0; +} + void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. 
In the first one, we transform the columns and transpose @@ -445,20 +478,20 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) { step3[7] = step1[7] + step2[4]; // step 4 temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; - temp2 = step3[2] * -cospi_24_64 - step3[5] * cospi_8_64; + temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64; step2[1] = fdct_round_shift(temp1); step2[2] = fdct_round_shift(temp2); - temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64; + temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64; temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; step2[5] = fdct_round_shift(temp1); step2[6] = fdct_round_shift(temp2); // step 5 step1[0] = step3[0] + step2[1]; step1[1] = step3[0] - step2[1]; - step1[2] = step3[3] - step2[2]; - step1[3] = step3[3] + step2[2]; - step1[4] = step3[4] + step2[5]; - step1[5] = step3[4] - step2[5]; + step1[2] = step3[3] + step2[2]; + step1[3] = step3[3] - step2[2]; + step1[4] = step3[4] - step2[5]; + step1[5] = step3[4] + step2[5]; step1[6] = step3[7] - step2[6]; step1[7] = step3[7] + step2[6]; // step 6 @@ -755,10 +788,10 @@ static void fdct16(const int16_t in[16], int16_t out[16]) { // step 4 temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; - temp2 = step3[2] * -cospi_24_64 - step3[5] * cospi_8_64; + temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64; step2[1] = fdct_round_shift(temp1); step2[2] = fdct_round_shift(temp2); - temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64; + temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64; temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; step2[5] = fdct_round_shift(temp1); step2[6] = fdct_round_shift(temp2); @@ -766,10 +799,10 @@ static void fdct16(const int16_t in[16], int16_t out[16]) { // step 5 step1[0] = step3[0] + step2[1]; step1[1] = step3[0] - step2[1]; - step1[2] = step3[3] - step2[2]; - step1[3] = step3[3] + step2[2]; - step1[4] = step3[4] + step2[5]; - step1[5] = step3[4] - step2[5]; + step1[2] = 
step3[3] + step2[2]; + step1[3] = step3[3] - step2[2]; + step1[4] = step3[4] - step2[5]; + step1[5] = step3[4] + step2[5]; step1[6] = step3[7] - step2[6]; step1[7] = step3[7] + step2[6]; @@ -1329,6 +1362,17 @@ static void fdct32(const int *input, int *output, int round) { output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); } +void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride) { + int r, c; + int16_t sum = 0; + for (r = 0; r < 32; ++r) + for (c = 0; c < 32; ++c) + sum += input[r * stride + c]; + + output[0] = sum >> 3; + output[1] = 0; +} + void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) { int i, j; int output[32 * 32]; diff --git a/libvpx/vp9/encoder/vp9_denoiser.c b/libvpx/vp9/encoder/vp9_denoiser.c new file mode 100644 index 000000000..90ea9cc25 --- /dev/null +++ b/libvpx/vp9/encoder/vp9_denoiser.c @@ -0,0 +1,453 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <limits.h> +#include "vpx_scale/yv12config.h" +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/encoder/vp9_context_tree.h" +#include "vp9/encoder/vp9_denoiser.h" + +/* The VP9 denoiser is a work-in-progress. It currently is only designed to work + * with speed 6, though it (inexplicably) seems to also work with speed 5 (one + * would need to modify the source code in vp9_pickmode.c and vp9_encoder.c to + * make the calls to the vp9_denoiser_* functions when in speed 5). + * + * The implementation is very similar to that of the VP8 denoiser. 
While + * choosing the motion vectors / reference frames, the denoiser is run, and if + * it did not modify the signal too much, the denoised block is copied to the + * signal. + */ + +#ifdef OUTPUT_YUV_DENOISED +static void make_grayscale(YV12_BUFFER_CONFIG *yuv); +#endif + +static const int widths[] = {4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 32, 64, 64}; +static const int heights[] = {4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 64, 32, 64}; + +static int absdiff_thresh(BLOCK_SIZE bs, int increase_denoising) { + (void)bs; + return 3 + (increase_denoising ? 1 : 0); +} + +static int delta_thresh(BLOCK_SIZE bs, int increase_denoising) { + (void)bs; + (void)increase_denoising; + return 4; +} + +static int noise_motion_thresh(BLOCK_SIZE bs, int increase_denoising) { + (void)bs; + (void)increase_denoising; + return 25 * 25; +} + +static unsigned int sse_thresh(BLOCK_SIZE bs, int increase_denoising) { + return widths[bs] * heights[bs] * (increase_denoising ? 60 : 40); +} + +static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising, + int mv_row, int mv_col) { + if (mv_row * mv_row + mv_col * mv_col > + noise_motion_thresh(bs, increase_denoising)) { + return 0; + } else { + return widths[bs] * heights[bs] * 20; + } +} + +static int total_adj_strong_thresh(BLOCK_SIZE bs, int increase_denoising) { + return widths[bs] * heights[bs] * (increase_denoising ? 3 : 2); +} + +static int total_adj_weak_thresh(BLOCK_SIZE bs, int increase_denoising) { + return widths[bs] * heights[bs] * (increase_denoising ? 3 : 2); +} + +static VP9_DENOISER_DECISION denoiser_filter(const uint8_t *sig, int sig_stride, + const uint8_t *mc_avg, + int mc_avg_stride, + uint8_t *avg, int avg_stride, + int increase_denoising, + BLOCK_SIZE bs) { + int r, c; + const uint8_t *sig_start = sig; + const uint8_t *mc_avg_start = mc_avg; + uint8_t *avg_start = avg; + int diff, adj, absdiff, delta; + int adj_val[] = {3, 4, 6}; + int total_adj = 0; + + // First attempt to apply a strong temporal denoising filter. 
+ for (r = 0; r < heights[bs]; ++r) { + for (c = 0; c < widths[bs]; ++c) { + diff = mc_avg[c] - sig[c]; + absdiff = abs(diff); + + if (absdiff <= absdiff_thresh(bs, increase_denoising)) { + avg[c] = mc_avg[c]; + total_adj += diff; + } else { + switch (absdiff) { + case 4: case 5: case 6: case 7: + adj = adj_val[0]; + break; + case 8: case 9: case 10: case 11: + case 12: case 13: case 14: case 15: + adj = adj_val[1]; + break; + default: + adj = adj_val[2]; + } + if (diff > 0) { + avg[c] = MIN(UINT8_MAX, sig[c] + adj); + total_adj += adj; + } else { + avg[c] = MAX(0, sig[c] - adj); + total_adj -= adj; + } + } + } + sig += sig_stride; + avg += avg_stride; + mc_avg += mc_avg_stride; + } + + // If the strong filter did not modify the signal too much, we're all set. + if (abs(total_adj) <= total_adj_strong_thresh(bs, increase_denoising)) { + return FILTER_BLOCK; + } + + // Otherwise, we try to dampen the filter if the delta is not too high. + delta = ((abs(total_adj) - total_adj_strong_thresh(bs, increase_denoising)) + >> 8) + 1; + if (delta > delta_thresh(bs, increase_denoising)) { + return COPY_BLOCK; + } + + mc_avg = mc_avg_start; + avg = avg_start; + sig = sig_start; + for (r = 0; r < heights[bs]; ++r) { + for (c = 0; c < widths[bs]; ++c) { + diff = mc_avg[c] - sig[c]; + adj = abs(diff); + if (adj > delta) { + adj = delta; + } + if (diff > 0) { + avg[c] = MAX(0, avg[c] - adj); + total_adj += adj; + } else { + avg[c] = MIN(UINT8_MAX, avg[c] + adj); + total_adj -= adj; + } + } + sig += sig_stride; + avg += avg_stride; + mc_avg += mc_avg_stride; + } + + // We can use the filter if it has been sufficiently dampened + if (abs(total_adj) <= total_adj_weak_thresh(bs, increase_denoising)) { + return FILTER_BLOCK; + } + return COPY_BLOCK; +} + +static uint8_t *block_start(uint8_t *framebuf, int stride, + int mi_row, int mi_col) { + return framebuf + (stride * mi_row * 8) + (mi_col * 8); +} + +static void copy_block(uint8_t *dest, int dest_stride, + const uint8_t *src, int 
src_stride, BLOCK_SIZE bs) { + int r; + for (r = 0; r < heights[bs]; ++r) { + vpx_memcpy(dest, src, widths[bs]); + dest += dest_stride; + src += src_stride; + } +} + +static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser, + MACROBLOCK *mb, + BLOCK_SIZE bs, + int increase_denoising, + int mi_row, + int mi_col, + PICK_MODE_CONTEXT *ctx + ) { + int mv_col, mv_row; + int sse_diff = ctx->zeromv_sse - ctx->newmv_sse; + MV_REFERENCE_FRAME frame; + MACROBLOCKD *filter_mbd = &mb->e_mbd; + MB_MODE_INFO *mbmi = &filter_mbd->mi[0]->mbmi; + + MB_MODE_INFO saved_mbmi; + int i, j; + struct buf_2d saved_dst[MAX_MB_PLANE]; + struct buf_2d saved_pre[MAX_MB_PLANE][2]; // 2 pre buffers + + // We will restore these after motion compensation. + saved_mbmi = *mbmi; + for (i = 0; i < MAX_MB_PLANE; ++i) { + for (j = 0; j < 2; ++j) { + saved_pre[i][j] = filter_mbd->plane[i].pre[j]; + } + saved_dst[i] = filter_mbd->plane[i].dst; + } + + mv_col = ctx->best_sse_mv.as_mv.col; + mv_row = ctx->best_sse_mv.as_mv.row; + + frame = ctx->best_reference_frame; + + // If the best reference frame uses inter-prediction and there is enough of a + // difference in sum-squared-error, use it. + if (frame != INTRA_FRAME && + sse_diff > sse_diff_thresh(bs, increase_denoising, mv_row, mv_col)) { + mbmi->ref_frame[0] = ctx->best_reference_frame; + mbmi->mode = ctx->best_sse_inter_mode; + mbmi->mv[0] = ctx->best_sse_mv; + } else { + // Otherwise, use the zero reference frame. + frame = ctx->best_zeromv_reference_frame; + + mbmi->ref_frame[0] = ctx->best_zeromv_reference_frame; + mbmi->mode = ZEROMV; + mbmi->mv[0].as_int = 0; + + ctx->best_sse_inter_mode = ZEROMV; + ctx->best_sse_mv.as_int = 0; + ctx->newmv_sse = ctx->zeromv_sse; + } + + // Set the pointers in the MACROBLOCKD to point to the buffers in the denoiser + // struct. 
+ for (j = 0; j < 2; ++j) { + filter_mbd->plane[0].pre[j].buf = + block_start(denoiser->running_avg_y[frame].y_buffer, + denoiser->running_avg_y[frame].y_stride, + mi_row, mi_col); + filter_mbd->plane[0].pre[j].stride = + denoiser->running_avg_y[frame].y_stride; + filter_mbd->plane[1].pre[j].buf = + block_start(denoiser->running_avg_y[frame].u_buffer, + denoiser->running_avg_y[frame].uv_stride, + mi_row, mi_col); + filter_mbd->plane[1].pre[j].stride = + denoiser->running_avg_y[frame].uv_stride; + filter_mbd->plane[2].pre[j].buf = + block_start(denoiser->running_avg_y[frame].v_buffer, + denoiser->running_avg_y[frame].uv_stride, + mi_row, mi_col); + filter_mbd->plane[2].pre[j].stride = + denoiser->running_avg_y[frame].uv_stride; + } + filter_mbd->plane[0].dst.buf = + block_start(denoiser->mc_running_avg_y.y_buffer, + denoiser->mc_running_avg_y.y_stride, + mi_row, mi_col); + filter_mbd->plane[0].dst.stride = denoiser->mc_running_avg_y.y_stride; + filter_mbd->plane[1].dst.buf = + block_start(denoiser->mc_running_avg_y.u_buffer, + denoiser->mc_running_avg_y.uv_stride, + mi_row, mi_col); + filter_mbd->plane[1].dst.stride = denoiser->mc_running_avg_y.uv_stride; + filter_mbd->plane[2].dst.buf = + block_start(denoiser->mc_running_avg_y.v_buffer, + denoiser->mc_running_avg_y.uv_stride, + mi_row, mi_col); + filter_mbd->plane[2].dst.stride = denoiser->mc_running_avg_y.uv_stride; + + vp9_build_inter_predictors_sby(filter_mbd, mv_row, mv_col, bs); + + // Restore everything to its original state + *mbmi = saved_mbmi; + for (i = 0; i < MAX_MB_PLANE; ++i) { + for (j = 0; j < 2; ++j) { + filter_mbd->plane[i].pre[j] = saved_pre[i][j]; + } + filter_mbd->plane[i].dst = saved_dst[i]; + } + + mv_row = ctx->best_sse_mv.as_mv.row; + mv_col = ctx->best_sse_mv.as_mv.col; + + if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) { + return COPY_BLOCK; + } + if (mv_row * mv_row + mv_col * mv_col > + 8 * noise_motion_thresh(bs, increase_denoising)) { + return COPY_BLOCK; + } + return 
FILTER_BLOCK; +} + +void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb, + int mi_row, int mi_col, BLOCK_SIZE bs, + PICK_MODE_CONTEXT *ctx) { + VP9_DENOISER_DECISION decision = FILTER_BLOCK; + YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME]; + YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y; + uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col); + uint8_t *mc_avg_start = block_start(mc_avg.y_buffer, mc_avg.y_stride, + mi_row, mi_col); + struct buf_2d src = mb->plane[0].src; + + decision = perform_motion_compensation(denoiser, mb, bs, + denoiser->increase_denoising, + mi_row, mi_col, ctx); + + if (decision == FILTER_BLOCK) { + decision = denoiser_filter(src.buf, src.stride, + mc_avg_start, mc_avg.y_stride, + avg_start, avg.y_stride, + 0, bs); + } + + if (decision == FILTER_BLOCK) { + copy_block(src.buf, src.stride, avg_start, avg.y_stride, bs); + } else { // COPY_BLOCK + copy_block(avg_start, avg.y_stride, src.buf, src.stride, bs); + } +} + +static void copy_frame(YV12_BUFFER_CONFIG dest, const YV12_BUFFER_CONFIG src) { + int r; + const uint8_t *srcbuf = src.y_buffer; + uint8_t *destbuf = dest.y_buffer; + assert(dest.y_width == src.y_width); + assert(dest.y_height == src.y_height); + + for (r = 0; r < dest.y_height; ++r) { + vpx_memcpy(destbuf, srcbuf, dest.y_width); + destbuf += dest.y_stride; + srcbuf += src.y_stride; + } +} + +void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser, + YV12_BUFFER_CONFIG src, + FRAME_TYPE frame_type, + int refresh_alt_ref_frame, + int refresh_golden_frame, + int refresh_last_frame) { + if (frame_type == KEY_FRAME) { + int i; + // Start at 1 so as not to overwrite the INTRA_FRAME + for (i = 1; i < MAX_REF_FRAMES; ++i) { + copy_frame(denoiser->running_avg_y[i], src); + } + } else { /* For non key frames */ + if (refresh_alt_ref_frame) { + copy_frame(denoiser->running_avg_y[ALTREF_FRAME], + denoiser->running_avg_y[INTRA_FRAME]); + } + if (refresh_golden_frame) { + 
copy_frame(denoiser->running_avg_y[GOLDEN_FRAME], + denoiser->running_avg_y[INTRA_FRAME]); + } + if (refresh_last_frame) { + copy_frame(denoiser->running_avg_y[LAST_FRAME], + denoiser->running_avg_y[INTRA_FRAME]); + } + } +} + +void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx) { + ctx->zeromv_sse = UINT_MAX; + ctx->newmv_sse = UINT_MAX; +} + +void vp9_denoiser_update_frame_stats(VP9_DENOISER *denoiser, MB_MODE_INFO *mbmi, + unsigned int sse, PREDICTION_MODE mode, + PICK_MODE_CONTEXT *ctx) { + // TODO(tkopp): Use both MVs if possible + if (mbmi->mv[0].as_int == 0 && sse < ctx->zeromv_sse) { + ctx->zeromv_sse = sse; + ctx->best_zeromv_reference_frame = mbmi->ref_frame[0]; + } + + if (mode == NEWMV) { + ctx->newmv_sse = sse; + ctx->best_sse_inter_mode = mode; + ctx->best_sse_mv = mbmi->mv[0]; + ctx->best_reference_frame = mbmi->ref_frame[0]; + } +} + +int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, + int ssx, int ssy, int border) { + int i, fail; + assert(denoiser != NULL); + + for (i = 0; i < MAX_REF_FRAMES; ++i) { + fail = vp9_alloc_frame_buffer(&denoiser->running_avg_y[i], width, height, + ssx, ssy, border); + if (fail) { + vp9_denoiser_free(denoiser); + return 1; + } +#ifdef OUTPUT_YUV_DENOISED + make_grayscale(&denoiser->running_avg_y[i]); +#endif + } + + fail = vp9_alloc_frame_buffer(&denoiser->mc_running_avg_y, width, height, + ssx, ssy, border); + if (fail) { + vp9_denoiser_free(denoiser); + return 1; + } +#ifdef OUTPUT_YUV_DENOISED + make_grayscale(&denoiser->running_avg_y[i]); +#endif + denoiser->increase_denoising = 0; + + return 0; +} + +void vp9_denoiser_free(VP9_DENOISER *denoiser) { + int i; + if (denoiser == NULL) { + return; + } + for (i = 0; i < MAX_REF_FRAMES; ++i) { + if (&denoiser->running_avg_y[i] != NULL) { + vp9_free_frame_buffer(&denoiser->running_avg_y[i]); + } + } + if (&denoiser->mc_running_avg_y != NULL) { + vp9_free_frame_buffer(&denoiser->mc_running_avg_y); + } +} + +#ifdef OUTPUT_YUV_DENOISED +static void 
make_grayscale(YV12_BUFFER_CONFIG *yuv) { + int r, c; + uint8_t *u = yuv->u_buffer; + uint8_t *v = yuv->v_buffer; + + // The '/2's are there because we have a 440 buffer, but we want to output + // 420. + for (r = 0; r < yuv->uv_height / 2; ++r) { + for (c = 0; c < yuv->uv_width / 2; ++c) { + u[c] = UINT8_MAX / 2; + v[c] = UINT8_MAX / 2; + } + u += yuv->uv_stride + yuv->uv_width / 2; + v += yuv->uv_stride + yuv->uv_width / 2; + } +} +#endif diff --git a/libvpx/vp9/encoder/vp9_denoiser.h b/libvpx/vp9/encoder/vp9_denoiser.h new file mode 100644 index 000000000..d93846ff9 --- /dev/null +++ b/libvpx/vp9/encoder/vp9_denoiser.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP9_ENCODER_DENOISER_H_ +#define VP9_ENCODER_DENOISER_H_ + +#include "vp9/encoder/vp9_block.h" +#include "vpx_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum vp9_denoiser_decision { + COPY_BLOCK, + FILTER_BLOCK +} VP9_DENOISER_DECISION; + +typedef struct vp9_denoiser { + YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES]; + YV12_BUFFER_CONFIG mc_running_avg_y; + int increase_denoising; +} VP9_DENOISER; + +void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser, + YV12_BUFFER_CONFIG src, + FRAME_TYPE frame_type, + int refresh_alt_ref_frame, + int refresh_golden_frame, + int refresh_last_frame); + +void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb, + int mi_row, int mi_col, BLOCK_SIZE bs, + PICK_MODE_CONTEXT *ctx); + +void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx); + +void vp9_denoiser_update_frame_stats(VP9_DENOISER *denoiser, MB_MODE_INFO *mbmi, + unsigned int sse, PREDICTION_MODE mode, + PICK_MODE_CONTEXT *ctx); + +int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, + int ssx, int ssy, int border); + +void vp9_denoiser_free(VP9_DENOISER *denoiser); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_ENCODER_DENOISER_H_ diff --git a/libvpx/vp9/encoder/vp9_encodeframe.c b/libvpx/vp9/encoder/vp9_encodeframe.c index 61a5022ec..4e7b8e4a2 100644 --- a/libvpx/vp9/encoder/vp9_encodeframe.c +++ b/libvpx/vp9/encoder/vp9_encodeframe.c @@ -38,6 +38,7 @@ #include "vp9/encoder/vp9_encodemv.h" #include "vp9/encoder/vp9_extend.h" #include "vp9/encoder/vp9_pickmode.h" +#include "vp9/encoder/vp9_rd.h" #include "vp9/encoder/vp9_rdopt.h" #include "vp9/encoder/vp9_segmentation.h" #include "vp9/encoder/vp9_tokenize.h" @@ -48,41 +49,9 @@ #define SPLIT_MV_ZBIN_BOOST 0 #define INTRA_ZBIN_BOOST 0 -static INLINE uint8_t *get_sb_index(MACROBLOCK *x, BLOCK_SIZE subsize) { - switch (subsize) { - case BLOCK_64X64: - case BLOCK_64X32: - case BLOCK_32X64: - case BLOCK_32X32: - return 
&x->sb_index; - case BLOCK_32X16: - case BLOCK_16X32: - case BLOCK_16X16: - return &x->mb_index; - case BLOCK_16X8: - case BLOCK_8X16: - case BLOCK_8X8: - return &x->b_index; - case BLOCK_8X4: - case BLOCK_4X8: - case BLOCK_4X4: - return &x->ab_index; - default: - assert(0); - return NULL; - } -} - static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, - int mi_row, int mi_col, BLOCK_SIZE bsize); - -static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x); - -// activity_avg must be positive, or flat regions could get a zero weight -// (infinite lambda), which confounds analysis. -// This also avoids the need for divide by zero checks in -// vp9_activity_masking(). -#define ACTIVITY_AVG_MIN 64 + int mi_row, int mi_col, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx); // Motion vector component magnitude threshold for defining fast motion. #define FAST_MOTION_MV_THRESH 24 @@ -103,34 +72,31 @@ static const uint8_t VP9_VAR_OFFS[64] = { }; static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi, - MACROBLOCK *x, + const struct buf_2d *ref, BLOCK_SIZE bs) { - unsigned int var, sse; - var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride, - VP9_VAR_OFFS, 0, &sse); + unsigned int sse; + const unsigned int var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, + VP9_VAR_OFFS, 0, &sse); return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); } static unsigned int get_sby_perpixel_diff_variance(VP9_COMP *cpi, - MACROBLOCK *x, - int mi_row, - int mi_col, + const struct buf_2d *ref, + int mi_row, int mi_col, BLOCK_SIZE bs) { - const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); - int offset = (mi_row * MI_SIZE) * yv12->y_stride + (mi_col * MI_SIZE); - unsigned int var, sse; - var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, - x->plane[0].src.stride, - yv12->y_buffer + offset, - yv12->y_stride, - &sse); + const YV12_BUFFER_CONFIG *last = get_ref_frame_buffer(cpi, LAST_FRAME); + const uint8_t* last_y = &last->y_buffer[mi_row 
* MI_SIZE * last->y_stride + + mi_col * MI_SIZE]; + unsigned int sse; + const unsigned int var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, + last_y, last->y_stride, &sse); return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); } static BLOCK_SIZE get_rd_var_based_fixed_partition(VP9_COMP *cpi, int mi_row, int mi_col) { - unsigned int var = get_sby_perpixel_diff_variance(cpi, &cpi->mb, + unsigned int var = get_sby_perpixel_diff_variance(cpi, &cpi->mb.plane[0].src, mi_row, mi_col, BLOCK_64X64); if (var < 8) @@ -146,7 +112,7 @@ static BLOCK_SIZE get_rd_var_based_fixed_partition(VP9_COMP *cpi, static BLOCK_SIZE get_nonrd_var_based_fixed_partition(VP9_COMP *cpi, int mi_row, int mi_col) { - unsigned int var = get_sby_perpixel_diff_variance(cpi, &cpi->mb, + unsigned int var = get_sby_perpixel_diff_variance(cpi, &cpi->mb.plane[0].src, mi_row, mi_col, BLOCK_64X64); if (var < 4) @@ -168,42 +134,6 @@ static INLINE void set_modeinfo_offsets(VP9_COMMON *const cm, xd->mi[0] = cm->mi + idx_str; } -static int is_block_in_mb_map(const VP9_COMP *cpi, int mi_row, int mi_col, - BLOCK_SIZE bsize) { - const VP9_COMMON *const cm = &cpi->common; - const int mb_rows = cm->mb_rows; - const int mb_cols = cm->mb_cols; - const int mb_row = mi_row >> 1; - const int mb_col = mi_col >> 1; - const int mb_width = num_8x8_blocks_wide_lookup[bsize] >> 1; - const int mb_height = num_8x8_blocks_high_lookup[bsize] >> 1; - int r, c; - if (bsize <= BLOCK_16X16) { - return cpi->active_map[mb_row * mb_cols + mb_col]; - } - for (r = 0; r < mb_height; ++r) { - for (c = 0; c < mb_width; ++c) { - int row = mb_row + r; - int col = mb_col + c; - if (row >= mb_rows || col >= mb_cols) - continue; - if (cpi->active_map[row * mb_cols + col]) - return 1; - } - } - return 0; -} - -static int check_active_map(const VP9_COMP *cpi, const MACROBLOCK *x, - int mi_row, int mi_col, - BLOCK_SIZE bsize) { - if (cpi->active_map_enabled && !x->e_mbd.lossless) { - return is_block_in_mb_map(cpi, mi_row, mi_col, bsize); - } else 
{ - return 1; - } -} - static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile, int mi_row, int mi_col, BLOCK_SIZE bsize) { MACROBLOCK *const x = &cpi->mb; @@ -212,23 +142,16 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile, MB_MODE_INFO *mbmi; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mb_row = mi_row >> 1; - const int mb_col = mi_col >> 1; - const int idx_map = mb_row * cm->mb_cols + mb_col; const struct segmentation *const seg = &cm->seg; set_skip_context(xd, mi_row, mi_col); - // Activity map pointer - x->mb_activity_ptr = &cpi->mb_activity_map[idx_map]; - x->in_active_map = check_active_map(cpi, x, mi_row, mi_col, bsize); - set_modeinfo_offsets(cm, xd, mi_row, mi_col); mbmi = &xd->mi[0]->mbmi; // Set up destination pointers. - vp9_setup_dst_planes(xd, get_frame_new_buffer(cm), mi_row, mi_col); + vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col); // Set up limit values for MV components. // Mv beyond the range do not produce new/different prediction block. @@ -246,8 +169,8 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile, vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); // R/D setup. - x->rddiv = cpi->RDDIV; - x->rdmult = cpi->RDMULT; + x->rddiv = cpi->rd.RDDIV; + x->rdmult = cpi->rd.RDMULT; // Setup segment ID. 
if (seg->enabled) { @@ -265,11 +188,9 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile, } } -static void duplicate_mode_info_in_sb(VP9_COMMON * const cm, - MACROBLOCKD *const xd, - int mi_row, - int mi_col, - BLOCK_SIZE bsize) { +static void duplicate_mode_info_in_sb(VP9_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { const int block_width = num_8x8_blocks_wide_lookup[bsize]; const int block_height = num_8x8_blocks_high_lookup[bsize]; int i, j; @@ -281,7 +202,6 @@ static void duplicate_mode_info_in_sb(VP9_COMMON * const cm, } static void set_block_size(VP9_COMP * const cpi, - const TileInfo *const tile, int mi_row, int mi_col, BLOCK_SIZE bsize) { if (cpi->common.mi_cols > mi_col && cpi->common.mi_rows > mi_row) { @@ -338,6 +258,8 @@ typedef enum { static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) { int i; + node->part_variances = NULL; + vpx_memset(node->split, 0, sizeof(node->split)); switch (bsize) { case BLOCK_64X64: { v64x64 *vt = (v64x64 *) data; @@ -369,6 +291,7 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) { } default: { assert(0); + break; } } } @@ -404,11 +327,9 @@ static void fill_variance_tree(void *data, BLOCK_SIZE bsize) { static int set_vt_partitioning(VP9_COMP *cpi, void *data, - const TileInfo *const tile, BLOCK_SIZE bsize, int mi_row, - int mi_col, - int mi_size) { + int mi_col) { VP9_COMMON * const cm = &cpi->common; variance_node vt; const int block_width = num_8x8_blocks_wide_lookup[bsize]; @@ -425,7 +346,7 @@ static int set_vt_partitioning(VP9_COMP *cpi, if (mi_col + block_width / 2 < cm->mi_cols && mi_row + block_height / 2 < cm->mi_rows && vt.part_variances->none.variance < threshold) { - set_block_size(cpi, tile, mi_row, mi_col, bsize); + set_block_size(cpi, mi_row, mi_col, bsize); return 1; } @@ -434,8 +355,8 @@ static int set_vt_partitioning(VP9_COMP *cpi, vt.part_variances->vert[0].variance < threshold && 
vt.part_variances->vert[1].variance < threshold) { BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT); - set_block_size(cpi, tile, mi_row, mi_col, subsize); - set_block_size(cpi, tile, mi_row, mi_col + block_width / 2, subsize); + set_block_size(cpi, mi_row, mi_col, subsize); + set_block_size(cpi, mi_row, mi_col + block_width / 2, subsize); return 1; } @@ -444,8 +365,8 @@ static int set_vt_partitioning(VP9_COMP *cpi, vt.part_variances->horz[0].variance < threshold && vt.part_variances->horz[1].variance < threshold) { BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ); - set_block_size(cpi, tile, mi_row, mi_col, subsize); - set_block_size(cpi, tile, mi_row + block_height / 2, mi_col, subsize); + set_block_size(cpi, mi_row, mi_col, subsize); + set_block_size(cpi, mi_row + block_height / 2, mi_col, subsize); return 1; } return 0; @@ -514,8 +435,8 @@ static void choose_partitioning(VP9_COMP *cpi, unsigned int sse = 0; int sum = 0; if (x_idx < pixels_wide && y_idx < pixels_high) - vp9_get_sse_sum_8x8(s + y_idx * sp + x_idx, sp, - d + y_idx * dp + x_idx, dp, &sse, &sum); + vp9_get8x8var(s + y_idx * sp + x_idx, sp, + d + y_idx * dp + x_idx, dp, &sse, &sum); fill_variance(sse, sum, 64, &vst->split[k].part_variances.none); } } @@ -532,13 +453,13 @@ static void choose_partitioning(VP9_COMP *cpi, // Now go through the entire structure, splitting every block size until // we get to one that's got a variance lower than our threshold, or we // hit 8x8. 
- if (!set_vt_partitioning(cpi, &vt, tile, BLOCK_64X64, - mi_row, mi_col, 8)) { + if (!set_vt_partitioning(cpi, &vt, BLOCK_64X64, + mi_row, mi_col)) { for (i = 0; i < 4; ++i) { const int x32_idx = ((i & 1) << 2); const int y32_idx = ((i >> 1) << 2); - if (!set_vt_partitioning(cpi, &vt.split[i], tile, BLOCK_32X32, - (mi_row + y32_idx), (mi_col + x32_idx), 4)) { + if (!set_vt_partitioning(cpi, &vt.split[i], BLOCK_32X32, + (mi_row + y32_idx), (mi_col + x32_idx))) { for (j = 0; j < 4; ++j) { const int x16_idx = ((j & 1) << 1); const int y16_idx = ((j >> 1) << 1); @@ -548,7 +469,7 @@ static void choose_partitioning(VP9_COMP *cpi, #ifdef DISABLE_8X8_VAR_BASED_PARTITION if (mi_row + y32_idx + y16_idx + 1 < cm->mi_rows && mi_row + x32_idx + x16_idx + 1 < cm->mi_cols) { - set_block_size(cpi, tile, + set_block_size(cpi, (mi_row + y32_idx + y16_idx), (mi_col + x32_idx + x16_idx), BLOCK_16X16); @@ -556,7 +477,7 @@ static void choose_partitioning(VP9_COMP *cpi, for (k = 0; k < 4; ++k) { const int x8_idx = (k & 1); const int y8_idx = (k >> 1); - set_block_size(cpi, tile, + set_block_size(cpi, (mi_row + y32_idx + y16_idx + y8_idx), (mi_col + x32_idx + x16_idx + x8_idx), BLOCK_8X8); @@ -570,7 +491,7 @@ static void choose_partitioning(VP9_COMP *cpi, for (k = 0; k < 4; ++k) { const int x8_idx = (k & 1); const int y8_idx = (k >> 1); - set_block_size(cpi, tile, + set_block_size(cpi, (mi_row + y32_idx + y16_idx + y8_idx), (mi_col + x32_idx + x16_idx + x8_idx), BLOCK_8X8); @@ -583,245 +504,12 @@ static void choose_partitioning(VP9_COMP *cpi, } } -// Original activity measure from Tim T's code. -static unsigned int tt_activity_measure(MACROBLOCK *x) { - unsigned int sse; - // TODO: This could also be done over smaller areas (8x8), but that would - // require extensive changes elsewhere, as lambda is assumed to be fixed - // over an entire MB in most of the code. 
- // Another option is to compute four 8x8 variances, and pick a single - // lambda using a non-linear combination (e.g., the smallest, or second - // smallest, etc.). - const unsigned int act = vp9_variance16x16(x->plane[0].src.buf, - x->plane[0].src.stride, - VP9_VAR_OFFS, 0, &sse) << 4; - // If the region is flat, lower the activity some more. - return act < (8 << 12) ? MIN(act, 5 << 12) : act; -} - -// Stub for alternative experimental activity measures. -static unsigned int alt_activity_measure(MACROBLOCK *x, int use_dc_pred) { - return vp9_encode_intra(x, use_dc_pred); -} - -// Measure the activity of the current macroblock -// What we measure here is TBD so abstracted to this function -#define ALT_ACT_MEASURE 1 -static unsigned int mb_activity_measure(MACROBLOCK *x, int mb_row, int mb_col) { - unsigned int mb_activity; - - if (ALT_ACT_MEASURE) { - const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); - - // Or use and alternative. - mb_activity = alt_activity_measure(x, use_dc_pred); - } else { - // Original activity measure from Tim T's code. 
- mb_activity = tt_activity_measure(x); - } - - return MAX(mb_activity, ACTIVITY_AVG_MIN); -} - -// Calculate an "average" mb activity value for the frame -#define ACT_MEDIAN 0 -static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) { -#if ACT_MEDIAN - // Find median: Simple n^2 algorithm for experimentation - { - unsigned int median; - unsigned int i, j; - unsigned int *sortlist; - unsigned int tmp; - - // Create a list to sort to - CHECK_MEM_ERROR(&cpi->common, sortlist, vpx_calloc(sizeof(unsigned int), - cpi->common.MBs)); - - // Copy map to sort list - vpx_memcpy(sortlist, cpi->mb_activity_map, - sizeof(unsigned int) * cpi->common.MBs); - - // Ripple each value down to its correct position - for (i = 1; i < cpi->common.MBs; i ++) { - for (j = i; j > 0; j --) { - if (sortlist[j] < sortlist[j - 1]) { - // Swap values - tmp = sortlist[j - 1]; - sortlist[j - 1] = sortlist[j]; - sortlist[j] = tmp; - } else { - break; - } - } - } - - // Even number MBs so estimate median as mean of two either side. 
- median = (1 + sortlist[cpi->common.MBs >> 1] + - sortlist[(cpi->common.MBs >> 1) + 1]) >> 1; - - cpi->activity_avg = median; - - vpx_free(sortlist); - } -#else - // Simple mean for now - cpi->activity_avg = (unsigned int) (activity_sum / cpi->common.MBs); -#endif // ACT_MEDIAN - - if (cpi->activity_avg < ACTIVITY_AVG_MIN) - cpi->activity_avg = ACTIVITY_AVG_MIN; - - // Experimental code: return fixed value normalized for several clips - if (ALT_ACT_MEASURE) - cpi->activity_avg = 100000; -} - -#define USE_ACT_INDEX 0 -#define OUTPUT_NORM_ACT_STATS 0 - -#if USE_ACT_INDEX -// Calculate an activity index for each mb -static void calc_activity_index(VP9_COMP *cpi, MACROBLOCK *x) { - VP9_COMMON *const cm = &cpi->common; - int mb_row, mb_col; - - int64_t act; - int64_t a; - int64_t b; - -#if OUTPUT_NORM_ACT_STATS - FILE *f = fopen("norm_act.stt", "a"); - fprintf(f, "\n%12d\n", cpi->activity_avg); -#endif - - // Reset pointers to start of activity map - x->mb_activity_ptr = cpi->mb_activity_map; - - // Calculate normalized mb activity number. - for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { - // for each macroblock col in image - for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { - // Read activity from the map - act = *(x->mb_activity_ptr); - - // Calculate a normalized activity number - a = act + 4 * cpi->activity_avg; - b = 4 * act + cpi->activity_avg; - - if (b >= a) - *(x->activity_ptr) = (int)((b + (a >> 1)) / a) - 1; - else - *(x->activity_ptr) = 1 - (int)((a + (b >> 1)) / b); - -#if OUTPUT_NORM_ACT_STATS - fprintf(f, " %6d", *(x->mb_activity_ptr)); -#endif - // Increment activity map pointers - x->mb_activity_ptr++; - } - -#if OUTPUT_NORM_ACT_STATS - fprintf(f, "\n"); -#endif - } - -#if OUTPUT_NORM_ACT_STATS - fclose(f); -#endif -} -#endif // USE_ACT_INDEX - -// Loop through all MBs. 
Note activity of each, average activity and -// calculate a normalized activity for each -static void build_activity_map(VP9_COMP *cpi) { - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *xd = &x->e_mbd; - VP9_COMMON *const cm = &cpi->common; - -#if ALT_ACT_MEASURE - YV12_BUFFER_CONFIG *new_yv12 = get_frame_new_buffer(cm); - int recon_yoffset; - int recon_y_stride = new_yv12->y_stride; -#endif - - int mb_row, mb_col; - unsigned int mb_activity; - int64_t activity_sum = 0; - - x->mb_activity_ptr = cpi->mb_activity_map; - - // for each macroblock row in image - for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { -#if ALT_ACT_MEASURE - // reset above block coeffs - xd->up_available = (mb_row != 0); - recon_yoffset = (mb_row * recon_y_stride * 16); -#endif - // for each macroblock col in image - for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { -#if ALT_ACT_MEASURE - xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset; - xd->left_available = (mb_col != 0); - recon_yoffset += 16; -#endif - - // measure activity - mb_activity = mb_activity_measure(x, mb_row, mb_col); - - // Keep frame sum - activity_sum += mb_activity; - - // Store MB level activity details. 
- *x->mb_activity_ptr = mb_activity; - - // Increment activity map pointer - x->mb_activity_ptr++; - - // adjust to the next column of source macroblocks - x->plane[0].src.buf += 16; - } - - // adjust to the next row of mbs - x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols; - } - - // Calculate an "average" MB activity - calc_av_activity(cpi, activity_sum); - -#if USE_ACT_INDEX - // Calculate an activity index number of each mb - calc_activity_index(cpi, x); -#endif -} - -// Macroblock activity masking -static void activity_masking(VP9_COMP *cpi, MACROBLOCK *x) { -#if USE_ACT_INDEX - x->rdmult += *(x->mb_activity_ptr) * (x->rdmult >> 2); - x->errorperbit = x->rdmult * 100 / (110 * x->rddiv); - x->errorperbit += (x->errorperbit == 0); -#else - const int64_t act = *(x->mb_activity_ptr); - - // Apply the masking to the RD multiplier. - const int64_t a = act + (2 * cpi->activity_avg); - const int64_t b = (2 * act) + cpi->activity_avg; - - x->rdmult = (unsigned int) (((int64_t) x->rdmult * b + (a >> 1)) / a); - x->errorperbit = x->rdmult * 100 / (110 * x->rddiv); - x->errorperbit += (x->errorperbit == 0); -#endif - - // Activity based Zbin adjustment - adjust_act_zbin(cpi, x); -} - static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, BLOCK_SIZE bsize, int output_enabled) { int i, x_idx, y; VP9_COMMON *const cm = &cpi->common; + RD_OPT *const rd_opt = &cpi->rd; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; struct macroblock_plane *const p = x->plane; @@ -907,7 +595,7 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, if (!vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { for (i = 0; i < TX_MODES; i++) - cpi->rd_tx_select_diff[i] += ctx->tx_rd_diff[i]; + rd_opt->tx_select_diff[i] += ctx->tx_rd_diff[i]; } #if CONFIG_INTERNAL_STATS @@ -940,21 +628,19 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, } } - cpi->rd_comp_pred_diff[SINGLE_REFERENCE] += 
ctx->single_pred_diff; - cpi->rd_comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff; - cpi->rd_comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff; + rd_opt->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff; + rd_opt->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff; + rd_opt->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff; for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) - cpi->rd_filter_diff[i] += ctx->best_filter_diff[i]; + rd_opt->filter_diff[i] += ctx->best_filter_diff[i]; } } void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col) { - uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, - src->alpha_buffer}; - const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, - src->alpha_stride}; + uint8_t *const buffers[3] = {src->y_buffer, src->u_buffer, src->v_buffer }; + const int strides[3] = {src->y_stride, src->uv_stride, src->uv_stride }; int i; // Set current frame pointer. 
@@ -966,11 +652,42 @@ void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src, x->e_mbd.plane[i].subsampling_y); } +static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode, int *rate, + int64_t *dist, BLOCK_SIZE bsize) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + INTERP_FILTER filter_ref; + + if (xd->up_available) + filter_ref = xd->mi[-xd->mi_stride]->mbmi.interp_filter; + else if (xd->left_available) + filter_ref = xd->mi[-1]->mbmi.interp_filter; + else + filter_ref = EIGHTTAP; + + mbmi->sb_type = bsize; + mbmi->mode = ZEROMV; + mbmi->tx_size = MIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[tx_mode]); + mbmi->skip = 1; + mbmi->uv_mode = DC_PRED; + mbmi->ref_frame[0] = LAST_FRAME; + mbmi->ref_frame[1] = NONE; + mbmi->mv[0].as_int = 0; + mbmi->interp_filter = filter_ref; + + xd->mi[0]->bmi[0].as_mv[0].as_int = 0; + x->skip = 1; + + *rate = 0; + *dist = 0; +} + static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, int mi_row, int mi_col, int *totalrate, int64_t *totaldist, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, - int64_t best_rd) { + int64_t best_rd, int block) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -987,10 +704,13 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, // Use the lower precision, but faster, 32x32 fdct for mode selection. x->use_lp32x32fdct = 1; + // TODO(JBB): Most other places in the code instead of calling the function + // and then checking if its not the first 8x8 we put the check in the + // calling function. Do that here. if (bsize < BLOCK_8X8) { // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 // there is nothing to be done. 
- if (x->ab_index != 0) { + if (block != 0) { *totalrate = 0; *totaldist = 0; return; @@ -1013,12 +733,14 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, // Set to zero to make sure we do not use the previous encoded frame stats mbmi->skip = 0; - x->source_variance = get_sby_perpixel_variance(cpi, x, bsize); + x->source_variance = get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); + + // Save rdmult before it might be changed, so it can be restored later. + orig_rdmult = x->rdmult; if (aq_mode == VARIANCE_AQ) { const int energy = bsize <= BLOCK_16X16 ? x->mb_energy : vp9_block_energy(cpi, x, bsize); - if (cm->frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame || (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { @@ -1031,14 +753,6 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, rdmult_ratio = vp9_vaq_rdmult_ratio(energy); vp9_init_plane_quantizers(cpi, x); - } - - // Save rdmult before it might be changed, so it can be restored later. 
- orig_rdmult = x->rdmult; - if (cpi->oxcf.tuning == VP8_TUNE_SSIM) - activity_masking(cpi, x); - - if (aq_mode == VARIANCE_AQ) { vp9_clear_system_state(); x->rdmult = (int)round(x->rdmult * rdmult_ratio); } else if (aq_mode == COMPLEXITY_AQ) { @@ -1062,28 +776,28 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, vp9_rd_pick_intra_mode_sb(cpi, x, totalrate, totaldist, bsize, ctx, best_rd); } else { - if (bsize >= BLOCK_8X8) - vp9_rd_pick_inter_mode_sb(cpi, x, tile, mi_row, mi_col, - totalrate, totaldist, bsize, ctx, best_rd); - else + if (bsize >= BLOCK_8X8) { + if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) + vp9_rd_pick_inter_mode_sb_seg_skip(cpi, x, totalrate, totaldist, bsize, + ctx, best_rd); + else + vp9_rd_pick_inter_mode_sb(cpi, x, tile, mi_row, mi_col, + totalrate, totaldist, bsize, ctx, best_rd); + } else { vp9_rd_pick_inter_mode_sub8x8(cpi, x, tile, mi_row, mi_col, totalrate, totaldist, bsize, ctx, best_rd); + } } - if (aq_mode == VARIANCE_AQ) { - x->rdmult = orig_rdmult; - if (*totalrate != INT_MAX) { - vp9_clear_system_state(); - *totalrate = (int)round(*totalrate * rdmult_ratio); - } - } else if (aq_mode == COMPLEXITY_AQ || aq_mode == CYCLIC_REFRESH_AQ) { - x->rdmult = orig_rdmult; + x->rdmult = orig_rdmult; + + if (aq_mode == VARIANCE_AQ && *totalrate != INT_MAX) { + vp9_clear_system_state(); + *totalrate = (int)round(*totalrate * rdmult_ratio); } } -static void update_stats(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; - const MACROBLOCK *const x = &cpi->mb; +static void update_stats(VP9_COMMON *cm, const MACROBLOCK *x) { const MACROBLOCKD *const xd = &x->e_mbd; const MODE_INFO *const mi = xd->mi[0]; const MB_MODE_INFO *const mbmi = &mi->mbmi; @@ -1122,22 +836,6 @@ static void update_stats(VP9_COMP *cpi) { } } -static BLOCK_SIZE *get_sb_partitioning(MACROBLOCK *x, BLOCK_SIZE bsize) { - switch (bsize) { - case BLOCK_64X64: - return &x->sb64_partitioning; - case BLOCK_32X32: - return 
&x->sb_partitioning[x->sb_index]; - case BLOCK_16X16: - return &x->mb_partitioning[x->sb_index][x->mb_index]; - case BLOCK_8X8: - return &x->b_partitioning[x->sb_index][x->mb_index][x->b_index]; - default: - assert(0); - return NULL; - } -} - static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col, ENTROPY_CONTEXT a[16 * MAX_MB_PLANE], ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], @@ -1168,6 +866,7 @@ static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col, vpx_memcpy(xd->left_seg_context + (mi_row & MI_MASK), sl, sizeof(xd->left_seg_context[0]) * mi_height); } + static void save_context(VP9_COMP *cpi, int mi_row, int mi_col, ENTROPY_CONTEXT a[16 * MAX_MB_PLANE], ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], @@ -1203,22 +902,14 @@ static void save_context(VP9_COMP *cpi, int mi_row, int mi_col, static void encode_b(VP9_COMP *cpi, const TileInfo *const tile, TOKENEXTRA **tp, int mi_row, int mi_col, - int output_enabled, BLOCK_SIZE bsize) { - MACROBLOCK *const x = &cpi->mb; - - if (bsize < BLOCK_8X8) { - // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 - // there is nothing to be done. 
- if (x->ab_index > 0) - return; - } + int output_enabled, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx) { set_offsets(cpi, tile, mi_row, mi_col, bsize); - update_state(cpi, get_block_context(x, bsize), mi_row, mi_col, bsize, - output_enabled); - encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize); + update_state(cpi, ctx, mi_row, mi_col, bsize, output_enabled); + encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize, ctx); if (output_enabled) { - update_stats(cpi); + update_stats(&cpi->common, &cpi->mb); (*tp)->token = EOSB_TOKEN; (*tp)++; @@ -1227,7 +918,8 @@ static void encode_b(VP9_COMP *cpi, const TileInfo *const tile, static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile, TOKENEXTRA **tp, int mi_row, int mi_col, - int output_enabled, BLOCK_SIZE bsize) { + int output_enabled, BLOCK_SIZE bsize, + PC_TREE *pc_tree) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -1235,64 +927,62 @@ static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile, const int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4; int ctx; PARTITION_TYPE partition; - BLOCK_SIZE subsize; + BLOCK_SIZE subsize = bsize; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; if (bsize >= BLOCK_8X8) { ctx = partition_plane_context(xd, mi_row, mi_col, bsize); - subsize = *get_sb_partitioning(x, bsize); + subsize = get_subsize(bsize, pc_tree->partitioning); } else { ctx = 0; subsize = BLOCK_4X4; } partition = partition_lookup[bsl][subsize]; + if (output_enabled && bsize != BLOCK_4X4) + cm->counts.partition[ctx][partition]++; switch (partition) { case PARTITION_NONE: - if (output_enabled && bsize >= BLOCK_8X8) - cm->counts.partition[ctx][PARTITION_NONE]++; - encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize); + encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize, + &pc_tree->none); break; case PARTITION_VERT: - if (output_enabled) - 
cm->counts.partition[ctx][PARTITION_VERT]++; - *get_sb_index(x, subsize) = 0; - encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize); - if (mi_col + hbs < cm->mi_cols) { - *get_sb_index(x, subsize) = 1; - encode_b(cpi, tile, tp, mi_row, mi_col + hbs, output_enabled, subsize); + encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize, + &pc_tree->vertical[0]); + if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) { + encode_b(cpi, tile, tp, mi_row, mi_col + hbs, output_enabled, subsize, + &pc_tree->vertical[1]); } break; case PARTITION_HORZ: - if (output_enabled) - cm->counts.partition[ctx][PARTITION_HORZ]++; - *get_sb_index(x, subsize) = 0; - encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize); - if (mi_row + hbs < cm->mi_rows) { - *get_sb_index(x, subsize) = 1; - encode_b(cpi, tile, tp, mi_row + hbs, mi_col, output_enabled, subsize); + encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize, + &pc_tree->horizontal[0]); + if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) { + encode_b(cpi, tile, tp, mi_row + hbs, mi_col, output_enabled, subsize, + &pc_tree->horizontal[1]); } break; case PARTITION_SPLIT: - subsize = get_subsize(bsize, PARTITION_SPLIT); - if (output_enabled) - cm->counts.partition[ctx][PARTITION_SPLIT]++; - - *get_sb_index(x, subsize) = 0; - encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize); - *get_sb_index(x, subsize) = 1; - encode_sb(cpi, tile, tp, mi_row, mi_col + hbs, output_enabled, subsize); - *get_sb_index(x, subsize) = 2; - encode_sb(cpi, tile, tp, mi_row + hbs, mi_col, output_enabled, subsize); - *get_sb_index(x, subsize) = 3; - encode_sb(cpi, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled, - subsize); + if (bsize == BLOCK_8X8) { + encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize, + pc_tree->leaf_split[0]); + } else { + encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize, + pc_tree->split[0]); + encode_sb(cpi, tile, tp, mi_row, mi_col + hbs, 
output_enabled, subsize, + pc_tree->split[1]); + encode_sb(cpi, tile, tp, mi_row + hbs, mi_col, output_enabled, subsize, + pc_tree->split[2]); + encode_sb(cpi, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled, + subsize, pc_tree->split[3]); + } break; default: assert("Invalid partition type."); + break; } if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) @@ -1319,6 +1009,22 @@ static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, return bsize; } +static void set_partial_b64x64_partition(MODE_INFO *mi, int mis, + int bh_in, int bw_in, int row8x8_remaining, int col8x8_remaining, + BLOCK_SIZE bsize, MODE_INFO **mi_8x8) { + int bh = bh_in; + int r, c; + for (r = 0; r < MI_BLOCK_SIZE; r += bh) { + int bw = bw_in; + for (c = 0; c < MI_BLOCK_SIZE; c += bw) { + const int index = r * mis + c; + mi_8x8[index] = mi + index; + mi_8x8[index]->mbmi.sb_type = find_partition_size(bsize, + row8x8_remaining - r, col8x8_remaining - c, &bh, &bw); + } + } +} + // This function attempts to set all mode info entries in a given SB64 // to the same block partition size. // However, at the bottom and right borders of the image the requested size @@ -1329,8 +1035,8 @@ static void set_fixed_partitioning(VP9_COMP *cpi, const TileInfo *const tile, BLOCK_SIZE bsize) { VP9_COMMON *const cm = &cpi->common; const int mis = cm->mi_stride; - int row8x8_remaining = tile->mi_row_end - mi_row; - int col8x8_remaining = tile->mi_col_end - mi_col; + const int row8x8_remaining = tile->mi_row_end - mi_row; + const int col8x8_remaining = tile->mi_col_end - mi_col; int block_row, block_col; MODE_INFO *mi_upper_left = cm->mi + mi_row * mis + mi_col; int bh = num_8x8_blocks_high_lookup[bsize]; @@ -1350,15 +1056,25 @@ static void set_fixed_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } } else { // Else this is a partial SB64. 
- for (block_row = 0; block_row < MI_BLOCK_SIZE; block_row += bh) { - for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) { - int index = block_row * mis + block_col; - // Find a partition size that fits - bsize = find_partition_size(bsize, - (row8x8_remaining - block_row), - (col8x8_remaining - block_col), &bh, &bw); - mi_8x8[index] = mi_upper_left + index; - mi_8x8[index]->mbmi.sb_type = bsize; + set_partial_b64x64_partition(mi_upper_left, mis, bh, bw, row8x8_remaining, + col8x8_remaining, bsize, mi_8x8); + } +} + +static void copy_partitioning(VP9_COMMON *cm, MODE_INFO **mi_8x8, + MODE_INFO **prev_mi_8x8) { + const int mis = cm->mi_stride; + int block_row, block_col; + + for (block_row = 0; block_row < 8; ++block_row) { + for (block_col = 0; block_col < 8; ++block_col) { + MODE_INFO *const prev_mi = prev_mi_8x8[block_row * mis + block_col]; + const BLOCK_SIZE sb_type = prev_mi ? prev_mi->mbmi.sb_type : 0; + + if (prev_mi) { + const ptrdiff_t offset = prev_mi - cm->prev_mi; + mi_8x8[block_row * mis + block_col] = cm->mi + offset; + mi_8x8[block_row * mis + block_col]->mbmi.sb_type = sb_type; } } } @@ -1413,36 +1129,7 @@ static void constrain_copy_partitioning(VP9_COMP *const cpi, } } else { // Else this is a partial SB64, copy previous partition. - for (block_row = 0; block_row < 8; ++block_row) { - for (block_col = 0; block_col < 8; ++block_col) { - MODE_INFO *const prev_mi = prev_mi_8x8[block_row * mis + block_col]; - const BLOCK_SIZE sb_type = prev_mi ? 
prev_mi->mbmi.sb_type : 0; - if (prev_mi) { - const ptrdiff_t offset = prev_mi - cm->prev_mi; - mi_8x8[block_row * mis + block_col] = cm->mi + offset; - mi_8x8[block_row * mis + block_col]->mbmi.sb_type = sb_type; - } - } - } - } -} - -static void copy_partitioning(VP9_COMMON *cm, MODE_INFO **mi_8x8, - MODE_INFO **prev_mi_8x8) { - const int mis = cm->mi_stride; - int block_row, block_col; - - for (block_row = 0; block_row < 8; ++block_row) { - for (block_col = 0; block_col < 8; ++block_col) { - MODE_INFO *const prev_mi = prev_mi_8x8[block_row * mis + block_col]; - const BLOCK_SIZE sb_type = prev_mi ? prev_mi->mbmi.sb_type : 0; - - if (prev_mi) { - const ptrdiff_t offset = prev_mi - cm->prev_mi; - mi_8x8[block_row * mis + block_col] = cm->mi + offset; - mi_8x8[block_row * mis + block_col]->mbmi.sb_type = sb_type; - } - } + copy_partitioning(cm, mi_8x8, prev_mi_8x8); } } @@ -1465,47 +1152,39 @@ static void set_source_var_based_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, int mi_row, int mi_col) { VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *x = &cpi->mb; + MACROBLOCK *const x = &cpi->mb; const int mis = cm->mi_stride; - int row8x8_remaining = tile->mi_row_end - mi_row; - int col8x8_remaining = tile->mi_col_end - mi_col; - int r, c; + const int row8x8_remaining = tile->mi_row_end - mi_row; + const int col8x8_remaining = tile->mi_col_end - mi_col; MODE_INFO *mi_upper_left = cm->mi + mi_row * mis + mi_col; + vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); + assert((row8x8_remaining > 0) && (col8x8_remaining > 0)); // In-image SB64 if ((col8x8_remaining >= MI_BLOCK_SIZE) && (row8x8_remaining >= MI_BLOCK_SIZE)) { - const int src_stride = x->plane[0].src.stride; - const int pre_stride = cpi->Last_Source->y_stride; - const uint8_t *src = x->plane[0].src.buf; - const int pre_offset = (mi_row * MI_SIZE) * pre_stride + - (mi_col * MI_SIZE); - const uint8_t *pre_src = cpi->Last_Source->y_buffer + pre_offset; - const int thr_32x32 = cpi->sf.source_var_thresh; - const 
int thr_64x64 = thr_32x32 << 1; int i, j; int index; diff d32[4]; - int use16x16 = 0; + const int offset = (mi_row >> 1) * cm->mb_cols + (mi_col >> 1); + int is_larger_better = 0; + int use32x32 = 0; + unsigned int thr = cpi->source_var_thresh; + + vpx_memset(d32, 0, 4 * sizeof(diff)); for (i = 0; i < 4; i++) { - diff d16[4]; + diff *d16[4]; for (j = 0; j < 4; j++) { int b_mi_row = coord_lookup[i * 4 + j].row; int b_mi_col = coord_lookup[i * 4 + j].col; - int b_offset = b_mi_row * MI_SIZE * src_stride + - b_mi_col * MI_SIZE; + int boffset = b_mi_row / 2 * cm->mb_cols + + b_mi_col / 2; - vp9_get_sse_sum_16x16(src + b_offset, - src_stride, - pre_src + b_offset, - pre_stride, &d16[j].sse, &d16[j].sum); - - d16[j].var = d16[j].sse - - (((uint32_t)d16[j].sum * d16[j].sum) >> 8); + d16[j] = cpi->source_diff_var + offset + boffset; index = b_mi_row * mis + b_mi_col; mi_8x8[index] = mi_upper_left + index; @@ -1515,14 +1194,16 @@ static void set_source_var_based_partition(VP9_COMP *cpi, // size to further improve quality. 
} - if (d16[0].var < thr_32x32 && d16[1].var < thr_32x32 && - d16[2].var < thr_32x32 && d16[3].var < thr_32x32) { - d32[i].sse = d16[0].sse; - d32[i].sum = d16[0].sum; + is_larger_better = (d16[0]->var < thr) && (d16[1]->var < thr) && + (d16[2]->var < thr) && (d16[3]->var < thr); + + // Use 32x32 partition + if (is_larger_better) { + use32x32 += 1; - for (j = 1; j < 4; j++) { - d32[i].sse += d16[j].sse; - d32[i].sum += d16[j].sum; + for (j = 0; j < 4; j++) { + d32[i].sse += d16[j]->sse; + d32[i].sum += d16[j]->sum; } d32[i].var = d32[i].sse - (((int64_t)d32[i].sum * d32[i].sum) >> 10); @@ -1530,42 +1211,68 @@ static void set_source_var_based_partition(VP9_COMP *cpi, index = coord_lookup[i*4].row * mis + coord_lookup[i*4].col; mi_8x8[index] = mi_upper_left + index; mi_8x8[index]->mbmi.sb_type = BLOCK_32X32; - - if (!((cm->current_video_frame - 1) % - cpi->sf.search_type_check_frequency)) - cpi->use_large_partition_rate += 1; - } else { - use16x16 = 1; } } - if (!use16x16) { - if (d32[0].var < thr_64x64 && d32[1].var < thr_64x64 && - d32[2].var < thr_64x64 && d32[3].var < thr_64x64) { + if (use32x32 == 4) { + thr <<= 1; + is_larger_better = (d32[0].var < thr) && (d32[1].var < thr) && + (d32[2].var < thr) && (d32[3].var < thr); + + // Use 64x64 partition + if (is_larger_better) { mi_8x8[0] = mi_upper_left; mi_8x8[0]->mbmi.sb_type = BLOCK_64X64; } } } else { // partial in-image SB64 - BLOCK_SIZE bsize = BLOCK_16X16; - int bh = num_8x8_blocks_high_lookup[bsize]; - int bw = num_8x8_blocks_wide_lookup[bsize]; - - for (r = 0; r < MI_BLOCK_SIZE; r += bh) { - for (c = 0; c < MI_BLOCK_SIZE; c += bw) { - int index = r * mis + c; - // Find a partition size that fits - bsize = find_partition_size(bsize, - (row8x8_remaining - r), - (col8x8_remaining - c), &bh, &bw); - mi_8x8[index] = mi_upper_left + index; - mi_8x8[index]->mbmi.sb_type = bsize; - } - } + int bh = num_8x8_blocks_high_lookup[BLOCK_16X16]; + int bw = num_8x8_blocks_wide_lookup[BLOCK_16X16]; + 
set_partial_b64x64_partition(mi_upper_left, mis, bh, bw, + row8x8_remaining, col8x8_remaining, BLOCK_16X16, mi_8x8); } } -static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO **prev_mi_8x8) { +static int is_background(VP9_COMP *cpi, const TileInfo *const tile, + int mi_row, int mi_col) { + MACROBLOCK *x = &cpi->mb; + uint8_t *src, *pre; + int src_stride, pre_stride; + + const int row8x8_remaining = tile->mi_row_end - mi_row; + const int col8x8_remaining = tile->mi_col_end - mi_col; + + int this_sad = 0; + int threshold = 0; + + // This assumes the input source frames are of the same dimension. + src_stride = cpi->Source->y_stride; + src = cpi->Source->y_buffer + (mi_row * MI_SIZE) * src_stride + + (mi_col * MI_SIZE); + pre_stride = cpi->Last_Source->y_stride; + pre = cpi->Last_Source->y_buffer + (mi_row * MI_SIZE) * pre_stride + + (mi_col * MI_SIZE); + + if (row8x8_remaining >= MI_BLOCK_SIZE && + col8x8_remaining >= MI_BLOCK_SIZE) { + this_sad = cpi->fn_ptr[BLOCK_64X64].sdf(src, src_stride, + pre, pre_stride); + threshold = (1 << 12); + } else { + int r, c; + for (r = 0; r < row8x8_remaining; r += 2) + for (c = 0; c < col8x8_remaining; c += 2) + this_sad += cpi->fn_ptr[BLOCK_16X16].sdf(src, src_stride, + pre, pre_stride); + threshold = (row8x8_remaining * col8x8_remaining) << 6; + } + + x->in_static_area = (this_sad < 2 * threshold); + return x->in_static_area; +} + +static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO **prev_mi_8x8, + const int motion_thresh) { const int mis = cm->mi_stride; int block_row, block_col; @@ -1574,8 +1281,8 @@ static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO **prev_mi_8x8) { for (block_col = 0; block_col < 8; ++block_col) { const MODE_INFO *prev_mi = prev_mi_8x8[block_row * mis + block_col]; if (prev_mi) { - if (abs(prev_mi->mbmi.mv[0].as_mv.row) >= 8 || - abs(prev_mi->mbmi.mv[0].as_mv.col) >= 8) + if (abs(prev_mi->mbmi.mv[0].as_mv.row) > motion_thresh || + abs(prev_mi->mbmi.mv[0].as_mv.col) > motion_thresh) return 1; } 
} @@ -1612,25 +1319,25 @@ static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, } x->skip = ctx->skip; + x->skip_txfm[0] = mbmi->segment_id ? 0 : ctx->skip_txfm[0]; } static void encode_b_rt(VP9_COMP *cpi, const TileInfo *const tile, TOKENEXTRA **tp, int mi_row, int mi_col, - int output_enabled, BLOCK_SIZE bsize) { - MACROBLOCK *const x = &cpi->mb; + int output_enabled, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx) { + set_offsets(cpi, tile, mi_row, mi_col, bsize); + update_state_rt(cpi, ctx, mi_row, mi_col, bsize); - if (bsize < BLOCK_8X8) { - // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 - // there is nothing to be done. - if (x->ab_index > 0) - return; +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && output_enabled) { + vp9_denoiser_denoise(&cpi->denoiser, &cpi->mb, mi_row, mi_col, + MAX(BLOCK_8X8, bsize), ctx); } +#endif - set_offsets(cpi, tile, mi_row, mi_col, bsize); - update_state_rt(cpi, get_block_context(x, bsize), mi_row, mi_col, bsize); - - encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize); - update_stats(cpi); + encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize, ctx); + update_stats(&cpi->common, &cpi->mb); (*tp)->token = EOSB_TOKEN; (*tp)++; @@ -1638,7 +1345,8 @@ static void encode_b_rt(VP9_COMP *cpi, const TileInfo *const tile, static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile, TOKENEXTRA **tp, int mi_row, int mi_col, - int output_enabled, BLOCK_SIZE bsize) { + int output_enabled, BLOCK_SIZE bsize, + PC_TREE *pc_tree) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -1652,7 +1360,6 @@ static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile, return; if (bsize >= BLOCK_8X8) { - MACROBLOCKD *const xd = &cpi->mb.e_mbd; const int idx_str = xd->mi_stride * mi_row + mi_col; MODE_INFO ** mi_8x8 = cm->mi_grid_visible + idx_str; ctx = partition_plane_context(xd, mi_row, mi_col, 
bsize); @@ -1663,54 +1370,44 @@ static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile, } partition = partition_lookup[bsl][subsize]; + if (output_enabled && bsize != BLOCK_4X4) + cm->counts.partition[ctx][partition]++; switch (partition) { case PARTITION_NONE: - if (output_enabled && bsize >= BLOCK_8X8) - cm->counts.partition[ctx][PARTITION_NONE]++; - encode_b_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize); + encode_b_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize, + &pc_tree->none); break; case PARTITION_VERT: - if (output_enabled) - cm->counts.partition[ctx][PARTITION_VERT]++; - *get_sb_index(x, subsize) = 0; - encode_b_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize); - if (mi_col + hbs < cm->mi_cols) { - *get_sb_index(x, subsize) = 1; + encode_b_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize, + &pc_tree->vertical[0]); + if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) { encode_b_rt(cpi, tile, tp, mi_row, mi_col + hbs, output_enabled, - subsize); + subsize, &pc_tree->vertical[1]); } break; case PARTITION_HORZ: - if (output_enabled) - cm->counts.partition[ctx][PARTITION_HORZ]++; - *get_sb_index(x, subsize) = 0; - encode_b_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize); - if (mi_row + hbs < cm->mi_rows) { - *get_sb_index(x, subsize) = 1; + encode_b_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize, + &pc_tree->horizontal[0]); + if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) { encode_b_rt(cpi, tile, tp, mi_row + hbs, mi_col, output_enabled, - subsize); + subsize, &pc_tree->horizontal[1]); } break; case PARTITION_SPLIT: subsize = get_subsize(bsize, PARTITION_SPLIT); - if (output_enabled) - cm->counts.partition[ctx][PARTITION_SPLIT]++; - - *get_sb_index(x, subsize) = 0; - encode_sb_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize); - *get_sb_index(x, subsize) = 1; + encode_sb_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize, + pc_tree->split[0]); encode_sb_rt(cpi, 
tile, tp, mi_row, mi_col + hbs, output_enabled, - subsize); - *get_sb_index(x, subsize) = 2; + subsize, pc_tree->split[1]); encode_sb_rt(cpi, tile, tp, mi_row + hbs, mi_col, output_enabled, - subsize); - *get_sb_index(x, subsize) = 3; + subsize, pc_tree->split[2]); encode_sb_rt(cpi, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled, - subsize); + subsize, pc_tree->split[3]); break; default: assert("Invalid partition type."); + break; } if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) @@ -1722,7 +1419,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rate, int64_t *dist, - int do_recon) { + int do_recon, PC_TREE *pc_tree) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -1748,6 +1445,7 @@ static void rd_use_partition(VP9_COMP *cpi, int splits_below = 0; BLOCK_SIZE bs_type = mi_8x8[0]->mbmi.sb_type; int do_partition_search = 1; + PICK_MODE_CONTEXT *ctx = &pc_tree->none; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; @@ -1758,36 +1456,14 @@ static void rd_use_partition(VP9_COMP *cpi, partition = partition_lookup[bsl][bs_type]; subsize = get_subsize(bsize, partition); - if (bsize < BLOCK_8X8) { - // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 - // there is nothing to be done. 
- if (x->ab_index != 0) { - *rate = 0; - *dist = 0; - return; - } - } else { - *(get_sb_partitioning(x, bsize)) = subsize; - } + pc_tree->partitioning = partition; save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - if (bsize == BLOCK_16X16) { + if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode) { set_offsets(cpi, tile, mi_row, mi_col, bsize); x->mb_energy = vp9_block_energy(cpi, x, bsize); - } else { - x->in_active_map = check_active_map(cpi, x, mi_row, mi_col, bsize); } - if (!x->in_active_map) { - do_partition_search = 0; - if (mi_row + (mi_step >> 1) < cm->mi_rows && - mi_col + (mi_step >> 1) < cm->mi_cols) { - *(get_sb_partitioning(x, bsize)) = bsize; - bs_type = mi_8x8[0]->mbmi.sb_type = bsize; - subsize = bsize; - partition = PARTITION_NONE; - } - } if (do_partition_search && cpi->sf.partition_search_type == SEARCH_PARTITION && cpi->sf.adjust_partitioning_from_last_frame) { @@ -1809,44 +1485,41 @@ static void rd_use_partition(VP9_COMP *cpi, if (partition != PARTITION_NONE && !splits_below && mi_row + (mi_step >> 1) < cm->mi_rows && mi_col + (mi_step >> 1) < cm->mi_cols) { - *(get_sb_partitioning(x, bsize)) = bsize; + pc_tree->partitioning = PARTITION_NONE; rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &none_rate, &none_dist, bsize, - get_block_context(x, bsize), INT64_MAX); + ctx, INT64_MAX, 0); pl = partition_plane_context(xd, mi_row, mi_col, bsize); if (none_rate < INT_MAX) { - none_rate += x->partition_cost[pl][PARTITION_NONE]; + none_rate += cpi->partition_cost[pl][PARTITION_NONE]; none_rd = RDCOST(x->rdmult, x->rddiv, none_rate, none_dist); } restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); mi_8x8[0]->mbmi.sb_type = bs_type; - *(get_sb_partitioning(x, bsize)) = subsize; + pc_tree->partitioning = partition; } } switch (partition) { case PARTITION_NONE: rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, - &last_part_dist, bsize, - get_block_context(x, bsize), INT64_MAX); + &last_part_dist, bsize, ctx, INT64_MAX, 0); break; case 
PARTITION_HORZ: - *get_sb_index(x, subsize) = 0; rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, - &last_part_dist, subsize, - get_block_context(x, subsize), INT64_MAX); + &last_part_dist, subsize, &pc_tree->horizontal[0], + INT64_MAX, 0); if (last_part_rate != INT_MAX && bsize >= BLOCK_8X8 && mi_row + (mi_step >> 1) < cm->mi_rows) { int rt = 0; int64_t dt = 0; - update_state(cpi, get_block_context(x, subsize), mi_row, mi_col, - subsize, 0); - encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); - *get_sb_index(x, subsize) = 1; + PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0]; + update_state(cpi, ctx, mi_row, mi_col, subsize, 0); + encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize, ctx); rd_pick_sb_modes(cpi, tile, mi_row + (mi_step >> 1), mi_col, &rt, &dt, - subsize, get_block_context(x, subsize), INT64_MAX); + subsize, &pc_tree->horizontal[1], INT64_MAX, 1); if (rt == INT_MAX || dt == INT64_MAX) { last_part_rate = INT_MAX; last_part_dist = INT64_MAX; @@ -1858,20 +1531,19 @@ static void rd_use_partition(VP9_COMP *cpi, } break; case PARTITION_VERT: - *get_sb_index(x, subsize) = 0; rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, - &last_part_dist, subsize, - get_block_context(x, subsize), INT64_MAX); + &last_part_dist, subsize, &pc_tree->vertical[0], + INT64_MAX, 0); if (last_part_rate != INT_MAX && bsize >= BLOCK_8X8 && mi_col + (mi_step >> 1) < cm->mi_cols) { int rt = 0; int64_t dt = 0; - update_state(cpi, get_block_context(x, subsize), mi_row, mi_col, - subsize, 0); - encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); - *get_sb_index(x, subsize) = 1; + PICK_MODE_CONTEXT *ctx = &pc_tree->vertical[0]; + update_state(cpi, ctx, mi_row, mi_col, subsize, 0); + encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize, ctx); rd_pick_sb_modes(cpi, tile, mi_row, mi_col + (mi_step >> 1), &rt, &dt, - subsize, get_block_context(x, subsize), INT64_MAX); + subsize, &pc_tree->vertical[bsize > BLOCK_8X8], + INT64_MAX, 1); if (rt == INT_MAX || 
dt == INT64_MAX) { last_part_rate = INT_MAX; last_part_dist = INT64_MAX; @@ -1882,7 +1554,12 @@ static void rd_use_partition(VP9_COMP *cpi, } break; case PARTITION_SPLIT: - // Split partition. + if (bsize == BLOCK_8X8) { + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, + &last_part_dist, subsize, pc_tree->leaf_split[0], + INT64_MAX, 0); + break; + } last_part_rate = 0; last_part_dist = 0; for (i = 0; i < 4; i++) { @@ -1895,11 +1572,9 @@ static void rd_use_partition(VP9_COMP *cpi, if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) continue; - *get_sb_index(x, subsize) = i; - rd_use_partition(cpi, tile, mi_8x8 + jj * bss * mis + ii * bss, tp, mi_row + y_idx, mi_col + x_idx, subsize, &rt, &dt, - i != 3); + i != 3, pc_tree->split[i]); if (rt == INT_MAX || dt == INT64_MAX) { last_part_rate = INT_MAX; last_part_dist = INT64_MAX; @@ -1911,11 +1586,12 @@ static void rd_use_partition(VP9_COMP *cpi, break; default: assert(0); + break; } pl = partition_plane_context(xd, mi_row, mi_col, bsize); if (last_part_rate < INT_MAX) { - last_part_rate += x->partition_cost[pl][partition]; + last_part_rate += cpi->partition_cost[pl][partition]; last_part_rd = RDCOST(x->rdmult, x->rddiv, last_part_rate, last_part_dist); } @@ -1931,6 +1607,7 @@ static void rd_use_partition(VP9_COMP *cpi, chosen_rate = 0; chosen_dist = 0; restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + pc_tree->partitioning = PARTITION_SPLIT; // Split partition. 
for (i = 0; i < 4; i++) { @@ -1944,15 +1621,11 @@ static void rd_use_partition(VP9_COMP *cpi, if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) continue; - *get_sb_index(x, split_subsize) = i; - *get_sb_partitioning(x, bsize) = split_subsize; - *get_sb_partitioning(x, split_subsize) = split_subsize; - save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - + pc_tree->split[i]->partitioning = PARTITION_NONE; rd_pick_sb_modes(cpi, tile, mi_row + y_idx, mi_col + x_idx, &rt, &dt, - split_subsize, get_block_context(x, split_subsize), - INT64_MAX); + split_subsize, &pc_tree->split[i]->none, + INT64_MAX, i); restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); @@ -1967,32 +1640,32 @@ static void rd_use_partition(VP9_COMP *cpi, if (i != 3) encode_sb(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, 0, - split_subsize); + split_subsize, pc_tree->split[i]); pl = partition_plane_context(xd, mi_row + y_idx, mi_col + x_idx, split_subsize); - chosen_rate += x->partition_cost[pl][PARTITION_NONE]; + chosen_rate += cpi->partition_cost[pl][PARTITION_NONE]; } pl = partition_plane_context(xd, mi_row, mi_col, bsize); if (chosen_rate < INT_MAX) { - chosen_rate += x->partition_cost[pl][PARTITION_SPLIT]; + chosen_rate += cpi->partition_cost[pl][PARTITION_SPLIT]; chosen_rd = RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist); } } - // If last_part is better set the partitioning to that... + // If last_part is better set the partitioning to that. if (last_part_rd < chosen_rd) { mi_8x8[0]->mbmi.sb_type = bsize; if (bsize >= BLOCK_8X8) - *(get_sb_partitioning(x, bsize)) = subsize; + pc_tree->partitioning = partition; chosen_rate = last_part_rate; chosen_dist = last_part_dist; chosen_rd = last_part_rd; } - // If none was better set the partitioning to that... + // If none was better set the partitioning to that. 
if (none_rd < chosen_rd) { if (bsize >= BLOCK_8X8) - *(get_sb_partitioning(x, bsize)) = bsize; + pc_tree->partitioning = PARTITION_NONE; chosen_rate = none_rate; chosen_dist = none_dist; } @@ -2018,8 +1691,8 @@ static void rd_use_partition(VP9_COMP *cpi, if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh, chosen_rate, chosen_dist); - - encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize); + encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize, + pc_tree); } *rate = chosen_rate; @@ -2049,10 +1722,9 @@ static const BLOCK_SIZE max_partition_size[BLOCK_SIZES] = { // // The min and max are assumed to have been initialized prior to calling this // function so repeat calls can accumulate a min and max of more than one sb64. -static void get_sb_partition_size_range(VP9_COMP *cpi, MODE_INFO ** mi_8x8, - BLOCK_SIZE * min_block_size, - BLOCK_SIZE * max_block_size ) { - MACROBLOCKD *const xd = &cpi->mb.e_mbd; +static void get_sb_partition_size_range(MACROBLOCKD *xd, MODE_INFO **mi_8x8, + BLOCK_SIZE *min_block_size, + BLOCK_SIZE *max_block_size ) { int sb_width_in_blocks = MI_BLOCK_SIZE; int sb_height_in_blocks = MI_BLOCK_SIZE; int i, j; @@ -2087,15 +1759,11 @@ static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile, BLOCK_SIZE *max_block_size) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->mb.e_mbd; - MODE_INFO **mi_8x8 = xd->mi; - const int left_in_image = xd->left_available && mi_8x8[-1]; - const int above_in_image = xd->up_available && - mi_8x8[-xd->mi_stride]; - MODE_INFO **above_sb64_mi_8x8; - MODE_INFO **left_sb64_mi_8x8; - - int row8x8_remaining = tile->mi_row_end - mi_row; - int col8x8_remaining = tile->mi_col_end - mi_col; + MODE_INFO **mi = xd->mi; + const int left_in_image = xd->left_available && mi[-1]; + const int above_in_image = xd->up_available && mi[-xd->mi_stride]; + const int row8x8_remaining = tile->mi_row_end - mi_row; + const int 
col8x8_remaining = tile->mi_col_end - mi_col; int bh, bw; BLOCK_SIZE min_size = BLOCK_4X4; BLOCK_SIZE max_size = BLOCK_64X64; @@ -2111,19 +1779,17 @@ static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile, if (cm->frame_type != KEY_FRAME) { MODE_INFO **const prev_mi = &cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col]; - get_sb_partition_size_range(cpi, prev_mi, &min_size, &max_size); + get_sb_partition_size_range(xd, prev_mi, &min_size, &max_size); } // Find the min and max partition sizes used in the left SB64 if (left_in_image) { - left_sb64_mi_8x8 = &mi_8x8[-MI_BLOCK_SIZE]; - get_sb_partition_size_range(cpi, left_sb64_mi_8x8, - &min_size, &max_size); + MODE_INFO **left_sb64_mi = &mi[-MI_BLOCK_SIZE]; + get_sb_partition_size_range(xd, left_sb64_mi, &min_size, &max_size); } // Find the min and max partition sizes used in the above SB64. if (above_in_image) { - above_sb64_mi_8x8 = &mi_8x8[-xd->mi_stride * MI_BLOCK_SIZE]; - get_sb_partition_size_range(cpi, above_sb64_mi_8x8, - &min_size, &max_size); + MODE_INFO **above_sb64_mi = &mi[-xd->mi_stride * MI_BLOCK_SIZE]; + get_sb_partition_size_range(xd, above_sb64_mi, &min_size, &max_size); } // adjust observed min and max if (cpi->sf.auto_min_max_partition_size == RELAXED_NEIGHBORING_MIN_MAX) { @@ -2149,6 +1815,121 @@ static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile, *max_block_size = max_size; } +static void auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile, + int mi_row, int mi_col, + BLOCK_SIZE *min_block_size, + BLOCK_SIZE *max_block_size) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->mb.e_mbd; + MODE_INFO **mi_8x8 = xd->mi; + const int left_in_image = xd->left_available && mi_8x8[-1]; + const int above_in_image = xd->up_available && + mi_8x8[-xd->mi_stride]; + int row8x8_remaining = tile->mi_row_end - mi_row; + int col8x8_remaining = tile->mi_col_end - mi_col; + int bh, bw; + BLOCK_SIZE min_size = BLOCK_32X32; + 
BLOCK_SIZE max_size = BLOCK_8X8; + int bsl = mi_width_log2(BLOCK_64X64); + const int search_range_ctrl = (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_video_frame)) & 0x1; + // Trap case where we do not have a prediction. + if (search_range_ctrl && + (left_in_image || above_in_image || cm->frame_type != KEY_FRAME)) { + int block; + MODE_INFO **mi; + BLOCK_SIZE sb_type; + + // Find the min and max partition sizes used in the left SB64. + if (left_in_image) { + MODE_INFO *cur_mi; + mi = &mi_8x8[-1]; + for (block = 0; block < MI_BLOCK_SIZE; ++block) { + cur_mi = mi[block * xd->mi_stride]; + sb_type = cur_mi ? cur_mi->mbmi.sb_type : 0; + min_size = MIN(min_size, sb_type); + max_size = MAX(max_size, sb_type); + } + } + // Find the min and max partition sizes used in the above SB64. + if (above_in_image) { + mi = &mi_8x8[-xd->mi_stride * MI_BLOCK_SIZE]; + for (block = 0; block < MI_BLOCK_SIZE; ++block) { + sb_type = mi[block] ? mi[block]->mbmi.sb_type : 0; + min_size = MIN(min_size, sb_type); + max_size = MAX(max_size, sb_type); + } + } + + min_size = min_partition_size[min_size]; + max_size = find_partition_size(max_size, row8x8_remaining, col8x8_remaining, + &bh, &bw); + min_size = MIN(min_size, max_size); + min_size = MAX(min_size, BLOCK_8X8); + max_size = MIN(max_size, BLOCK_32X32); + } else { + min_size = BLOCK_8X8; + max_size = BLOCK_32X32; + } + + *min_block_size = min_size; + *max_block_size = max_size; +} + +// TODO(jingning) refactor functions setting partition search range +static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, BLOCK_SIZE bsize, + BLOCK_SIZE *min_bs, BLOCK_SIZE *max_bs) { + int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int mi_height = num_8x8_blocks_high_lookup[bsize]; + int idx, idy; + + MODE_INFO *mi; + MODE_INFO **prev_mi = + &cm->prev_mi_grid_visible[mi_row * cm->mi_stride + mi_col]; + BLOCK_SIZE bs, min_size, max_size; + + min_size = BLOCK_64X64; + max_size = BLOCK_4X4; + + if 
(prev_mi) { + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + mi = prev_mi[idy * cm->mi_stride + idx]; + bs = mi ? mi->mbmi.sb_type : bsize; + min_size = MIN(min_size, bs); + max_size = MAX(max_size, bs); + } + } + } + + if (xd->left_available) { + for (idy = 0; idy < mi_height; ++idy) { + mi = xd->mi[idy * cm->mi_stride - 1]; + bs = mi ? mi->mbmi.sb_type : bsize; + min_size = MIN(min_size, bs); + max_size = MAX(max_size, bs); + } + } + + if (xd->up_available) { + for (idx = 0; idx < mi_width; ++idx) { + mi = xd->mi[idx - cm->mi_stride]; + bs = mi ? mi->mbmi.sb_type : bsize; + min_size = MIN(min_size, bs); + max_size = MAX(max_size, bs); + } + } + + if (min_size == max_size) { + min_size = min_partition_size[min_size]; + max_size = max_partition_size[max_size]; + } + + *min_bs = min_size; + *max_bs = max_size; +} + static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { vpx_memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv)); } @@ -2157,13 +1938,59 @@ static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { vpx_memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv)); } +#if CONFIG_FP_MB_STATS +const int num_16x16_blocks_wide_lookup[BLOCK_SIZES] = + {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 4, 4}; +const int num_16x16_blocks_high_lookup[BLOCK_SIZES] = + {1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4}; +const int qindex_skip_threshold_lookup[BLOCK_SIZES] = + {0, 10, 10, 30, 40, 40, 60, 80, 80, 90, 100, 100, 120}; +const int qindex_split_threshold_lookup[BLOCK_SIZES] = + {0, 3, 3, 7, 15, 15, 30, 40, 40, 60, 80, 80, 120}; +const int complexity_16x16_blocks_threshold[BLOCK_SIZES] = + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 6}; + +typedef enum { + MV_ZERO = 0, + MV_LEFT = 1, + MV_UP = 2, + MV_RIGHT = 3, + MV_DOWN = 4, + MV_INVALID +} MOTION_DIRECTION; + +static INLINE MOTION_DIRECTION get_motion_direction_fp(uint8_t fp_byte) { + if (fp_byte & FPMB_MOTION_ZERO_MASK) { + return MV_ZERO; + } else if (fp_byte & 
FPMB_MOTION_LEFT_MASK) { + return MV_LEFT; + } else if (fp_byte & FPMB_MOTION_RIGHT_MASK) { + return MV_RIGHT; + } else if (fp_byte & FPMB_MOTION_UP_MASK) { + return MV_UP; + } else { + return MV_DOWN; + } +} + +static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv, + MOTION_DIRECTION that_mv) { + if (this_mv == that_mv) { + return 0; + } else { + return abs(this_mv - that_mv) == 2 ? 2 : 1; + } +} +#endif + // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are // unlikely to be selected depending on previous rate-distortion optimization // results, for encoding speed-up. static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rate, - int64_t *dist, int do_recon, int64_t best_rd) { + int64_t *dist, int64_t best_rd, + PC_TREE *pc_tree) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -2171,7 +1998,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; PARTITION_CONTEXT sl[8], sa[8]; TOKENEXTRA *tp_orig = *tp; - PICK_MODE_CONTEXT *ctx = get_block_context(x, bsize); + PICK_MODE_CONTEXT *ctx = &pc_tree->none; int i, pl; BLOCK_SIZE subsize; int this_rate, sum_rate = 0, best_rate = INT_MAX; @@ -2179,12 +2006,21 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, int64_t sum_rd = 0; int do_split = bsize >= BLOCK_8X8; int do_rect = 1; + // Override skipping rectangular partition operations for edge blocks const int force_horz_split = (mi_row + mi_step >= cm->mi_rows); const int force_vert_split = (mi_col + mi_step >= cm->mi_cols); const int xss = x->e_mbd.plane[1].subsampling_x; const int yss = x->e_mbd.plane[1].subsampling_y; + BLOCK_SIZE min_size = cpi->sf.min_partition_size; + BLOCK_SIZE max_size = cpi->sf.max_partition_size; + +#if CONFIG_FP_MB_STATS + unsigned int src_diff_var = UINT_MAX; 
+ int none_complexity = 0; +#endif + int partition_none_allowed = !force_horz_split && !force_vert_split; int partition_horz_allowed = !force_vert_split && yss <= xss && bsize >= BLOCK_8X8; @@ -2192,37 +2028,31 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, bsize >= BLOCK_8X8; (void) *tp_orig; - if (bsize < BLOCK_8X8) { - // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 - // there is nothing to be done. - if (x->ab_index != 0) { - *rate = 0; - *dist = 0; - return; - } - } assert(num_8x8_blocks_wide_lookup[bsize] == num_8x8_blocks_high_lookup[bsize]); - if (bsize == BLOCK_16X16) { - set_offsets(cpi, tile, mi_row, mi_col, bsize); + set_offsets(cpi, tile, mi_row, mi_col, bsize); + + if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode) x->mb_energy = vp9_block_energy(cpi, x, bsize); - } else { - x->in_active_map = check_active_map(cpi, x, mi_row, mi_col, bsize); + + if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) { + int cb_partition_search_ctrl = ((pc_tree->index == 0 || pc_tree->index == 3) + + get_chessboard_index(cm->current_video_frame)) & 0x1; + + if (cb_partition_search_ctrl && bsize > min_size && bsize < max_size) + set_partition_range(cm, xd, mi_row, mi_col, bsize, &min_size, &max_size); } // Determine partition types in search according to the speed features. // The threshold set here has to be of square block size. 
if (cpi->sf.auto_min_max_partition_size) { - partition_none_allowed &= (bsize <= cpi->sf.max_partition_size && - bsize >= cpi->sf.min_partition_size); - partition_horz_allowed &= ((bsize <= cpi->sf.max_partition_size && - bsize > cpi->sf.min_partition_size) || + partition_none_allowed &= (bsize <= max_size && bsize >= min_size); + partition_horz_allowed &= ((bsize <= max_size && bsize > min_size) || force_horz_split); - partition_vert_allowed &= ((bsize <= cpi->sf.max_partition_size && - bsize > cpi->sf.min_partition_size) || + partition_vert_allowed &= ((bsize <= max_size && bsize > min_size) || force_vert_split); - do_split &= bsize > cpi->sf.min_partition_size; + do_split &= bsize > min_size; } if (cpi->sf.use_square_partition_only) { partition_horz_allowed &= force_horz_split; @@ -2234,7 +2064,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, if (cpi->sf.disable_split_var_thresh && partition_none_allowed) { unsigned int source_variancey; vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); - source_variancey = get_sby_perpixel_variance(cpi, x, bsize); + source_variancey = get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); if (source_variancey < cpi->sf.disable_split_var_thresh) { do_split = 0; if (source_variancey < cpi->sf.disable_split_var_thresh / 2) @@ -2242,18 +2072,76 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, } } - if (!x->in_active_map && (partition_horz_allowed || partition_vert_allowed)) - do_split = 0; +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + set_offsets(cpi, tile, mi_row, mi_col, bsize); + src_diff_var = get_sby_perpixel_diff_variance(cpi, &cpi->mb.plane[0].src, + mi_row, mi_col, bsize); + } +#endif + +#if CONFIG_FP_MB_STATS + // Decide whether we shall split directly and skip searching NONE by using + // the first pass block statistics + if (cpi->use_fp_mb_stats && bsize >= BLOCK_32X32 && do_split && + partition_none_allowed && src_diff_var > 4 && + cm->base_qindex < 
qindex_split_threshold_lookup[bsize]) { + int mb_row = mi_row >> 1; + int mb_col = mi_col >> 1; + int mb_row_end = + MIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows); + int mb_col_end = + MIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols); + int r, c; + + // compute a complexity measure, basically measure inconsistency of motion + // vectors obtained from the first pass in the current block + for (r = mb_row; r < mb_row_end ; r++) { + for (c = mb_col; c < mb_col_end; c++) { + const int mb_index = r * cm->mb_cols + c; + + MOTION_DIRECTION this_mv; + MOTION_DIRECTION right_mv; + MOTION_DIRECTION bottom_mv; + + this_mv = + get_motion_direction_fp(cpi->twopass.this_frame_mb_stats[mb_index]); + + // to its right + if (c != mb_col_end - 1) { + right_mv = get_motion_direction_fp( + cpi->twopass.this_frame_mb_stats[mb_index + 1]); + none_complexity += get_motion_inconsistency(this_mv, right_mv); + } + + // to its bottom + if (r != mb_row_end - 1) { + bottom_mv = get_motion_direction_fp( + cpi->twopass.this_frame_mb_stats[mb_index + cm->mb_cols]); + none_complexity += get_motion_inconsistency(this_mv, bottom_mv); + } + + // do not count its left and top neighbors to avoid double counting + } + } + + if (none_complexity > complexity_16x16_blocks_threshold[bsize]) { + partition_none_allowed = 0; + } + } +#endif + // PARTITION_NONE if (partition_none_allowed) { rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize, - ctx, best_rd); + ctx, best_rd, 0); if (this_rate != INT_MAX) { if (bsize >= BLOCK_8X8) { pl = partition_plane_context(xd, mi_row, mi_col, bsize); - this_rate += x->partition_cost[pl][PARTITION_NONE]; + this_rate += cpi->partition_cost[pl][PARTITION_NONE]; } sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist); + if (sum_rd < best_rd) { int64_t stop_thresh = 4096; int64_t stop_thresh_rd; @@ -2262,11 +2150,11 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, best_dist = this_dist; best_rd 
= sum_rd; if (bsize >= BLOCK_8X8) - *(get_sb_partitioning(x, bsize)) = bsize; + pc_tree->partitioning = PARTITION_NONE; // Adjust threshold according to partition size. - stop_thresh >>= 8 - (b_width_log2_lookup[bsize] + - b_height_log2_lookup[bsize]); + stop_thresh >>= 8 - (b_width_log2(bsize) + + b_height_log2(bsize)); stop_thresh_rd = RDCOST(x->rdmult, x->rddiv, 0, stop_thresh); // If obtained distortion is very small, choose current partition @@ -2275,12 +2163,54 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, do_split = 0; do_rect = 0; } + +#if CONFIG_FP_MB_STATS + // Check if every 16x16 first pass block statistics has zero + // motion and the corresponding first pass residue is small enough. + // If that is the case, check the difference variance between the + // current frame and the last frame. If the variance is small enough, + // stop further splitting in RD optimization + if (cpi->use_fp_mb_stats && do_split != 0 && + cm->base_qindex > qindex_skip_threshold_lookup[bsize]) { + int mb_row = mi_row >> 1; + int mb_col = mi_col >> 1; + int mb_row_end = + MIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows); + int mb_col_end = + MIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols); + int r, c; + + int skip = 1; + for (r = mb_row; r < mb_row_end; r++) { + for (c = mb_col; c < mb_col_end; c++) { + const int mb_index = r * cm->mb_cols + c; + if (!(cpi->twopass.this_frame_mb_stats[mb_index] & + FPMB_MOTION_ZERO_MASK) || + !(cpi->twopass.this_frame_mb_stats[mb_index] & + FPMB_ERROR_SMALL_MASK)) { + skip = 0; + break; + } + } + if (skip == 0) { + break; + } + } + if (skip) { + if (src_diff_var == UINT_MAX) { + set_offsets(cpi, tile, mi_row, mi_col, bsize); + src_diff_var = get_sby_perpixel_diff_variance( + cpi, &cpi->mb.plane[0].src, mi_row, mi_col, bsize); + } + if (src_diff_var < 8) { + do_split = 0; + do_rect = 0; + } + } + } +#endif } } - if (!x->in_active_map) { - do_split = 0; - do_rect = 0; - } 
restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); } @@ -2294,40 +2224,53 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, // the starting point of motion search in the following partition type check. if (do_split) { subsize = get_subsize(bsize, PARTITION_SPLIT); - for (i = 0; i < 4 && sum_rd < best_rd; ++i) { + if (bsize == BLOCK_8X8) { + i = 4; + if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed) + pc_tree->leaf_split[0]->pred_interp_filter = + ctx->mic.mbmi.interp_filter; + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize, + pc_tree->leaf_split[0], best_rd, 0); + if (sum_rate == INT_MAX) + sum_rd = INT64_MAX; + else + sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); + } else { + for (i = 0; i < 4 && sum_rd < best_rd; ++i) { const int x_idx = (i & 1) * mi_step; const int y_idx = (i >> 1) * mi_step; - if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols) - continue; + if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols) + continue; - *get_sb_index(x, subsize) = i; - if (cpi->sf.adaptive_motion_search) - load_pred_mv(x, ctx); - if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && - partition_none_allowed) - get_block_context(x, subsize)->pred_interp_filter = - ctx->mic.mbmi.interp_filter; - rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, subsize, - &this_rate, &this_dist, i != 3, best_rd - sum_rd); + if (cpi->sf.adaptive_motion_search) + load_pred_mv(x, ctx); - if (this_rate == INT_MAX) { - sum_rd = INT64_MAX; - } else { - sum_rate += this_rate; - sum_dist += this_dist; - sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); + pc_tree->split[i]->index = i; + rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, + subsize, &this_rate, &this_dist, + best_rd - sum_rd, pc_tree->split[i]); + + if (this_rate == INT_MAX) { + sum_rd = INT64_MAX; + } else { + sum_rate += this_rate; + sum_dist += this_dist; + sum_rd = 
RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); + } } } + if (sum_rd < best_rd && i == 4) { pl = partition_plane_context(xd, mi_row, mi_col, bsize); - sum_rate += x->partition_cost[pl][PARTITION_SPLIT]; + sum_rate += cpi->partition_cost[pl][PARTITION_SPLIT]; sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); + if (sum_rd < best_rd) { best_rate = sum_rate; best_dist = sum_dist; best_rd = sum_rd; - *(get_sb_partitioning(x, bsize)) = subsize; + pc_tree->partitioning = PARTITION_SPLIT; } } else { // skip rectangular partition test when larger block size @@ -2341,32 +2284,30 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, // PARTITION_HORZ if (partition_horz_allowed && do_rect) { subsize = get_subsize(bsize, PARTITION_HORZ); - *get_sb_index(x, subsize) = 0; if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) - get_block_context(x, subsize)->pred_interp_filter = + pc_tree->horizontal[0].pred_interp_filter = ctx->mic.mbmi.interp_filter; rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize, - get_block_context(x, subsize), best_rd); + &pc_tree->horizontal[0], best_rd, 0); sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); if (sum_rd < best_rd && mi_row + mi_step < cm->mi_rows) { - update_state(cpi, get_block_context(x, subsize), mi_row, mi_col, - subsize, 0); - encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); + PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0]; + update_state(cpi, ctx, mi_row, mi_col, subsize, 0); + encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize, ctx); - *get_sb_index(x, subsize) = 1; if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) - get_block_context(x, subsize)->pred_interp_filter = + pc_tree->horizontal[1].pred_interp_filter = ctx->mic.mbmi.interp_filter; rd_pick_sb_modes(cpi, tile, 
mi_row + mi_step, mi_col, &this_rate, - &this_dist, subsize, get_block_context(x, subsize), - best_rd - sum_rd); + &this_dist, subsize, &pc_tree->horizontal[1], + best_rd - sum_rd, 1); if (this_rate == INT_MAX) { sum_rd = INT64_MAX; } else { @@ -2377,47 +2318,45 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, } if (sum_rd < best_rd) { pl = partition_plane_context(xd, mi_row, mi_col, bsize); - sum_rate += x->partition_cost[pl][PARTITION_HORZ]; + sum_rate += cpi->partition_cost[pl][PARTITION_HORZ]; sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); if (sum_rd < best_rd) { best_rd = sum_rd; best_rate = sum_rate; best_dist = sum_dist; - *(get_sb_partitioning(x, bsize)) = subsize; + pc_tree->partitioning = PARTITION_HORZ; } } restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); } - // PARTITION_VERT if (partition_vert_allowed && do_rect) { subsize = get_subsize(bsize, PARTITION_VERT); - *get_sb_index(x, subsize) = 0; if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) - get_block_context(x, subsize)->pred_interp_filter = + pc_tree->vertical[0].pred_interp_filter = ctx->mic.mbmi.interp_filter; rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize, - get_block_context(x, subsize), best_rd); + &pc_tree->vertical[0], best_rd, 0); sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); if (sum_rd < best_rd && mi_col + mi_step < cm->mi_cols) { - update_state(cpi, get_block_context(x, subsize), mi_row, mi_col, - subsize, 0); - encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); + update_state(cpi, &pc_tree->vertical[0], mi_row, mi_col, subsize, 0); + encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize, + &pc_tree->vertical[0]); - *get_sb_index(x, subsize) = 1; if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) - 
get_block_context(x, subsize)->pred_interp_filter = + pc_tree->vertical[1].pred_interp_filter = ctx->mic.mbmi.interp_filter; rd_pick_sb_modes(cpi, tile, mi_row, mi_col + mi_step, &this_rate, - &this_dist, subsize, get_block_context(x, subsize), - best_rd - sum_rd); + &this_dist, subsize, + &pc_tree->vertical[1], best_rd - sum_rd, + 1); if (this_rate == INT_MAX) { sum_rd = INT64_MAX; } else { @@ -2428,13 +2367,13 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, } if (sum_rd < best_rd) { pl = partition_plane_context(xd, mi_row, mi_col, bsize); - sum_rate += x->partition_cost[pl][PARTITION_VERT]; + sum_rate += cpi->partition_cost[pl][PARTITION_VERT]; sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); if (sum_rd < best_rd) { best_rate = sum_rate; best_dist = sum_dist; best_rd = sum_rd; - *(get_sb_partitioning(x, bsize)) = subsize; + pc_tree->partitioning = PARTITION_VERT; } } restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); @@ -2448,23 +2387,22 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, *rate = best_rate; *dist = best_dist; - if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon) { + if (best_rate < INT_MAX && best_dist < INT64_MAX && pc_tree->index != 3) { int output_enabled = (bsize == BLOCK_64X64); // Check the projected output rate for this SB against it's target // and and if necessary apply a Q delta using segmentation to get // closer to the target. 
- if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) { + if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) vp9_select_in_frame_q_segment(cpi, mi_row, mi_col, output_enabled, best_rate); - } - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh, best_rate, best_dist); - encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize); + encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize, pc_tree); } + if (bsize == BLOCK_64X64) { assert(tp_orig < *tp); assert(best_rate < INT_MAX); @@ -2491,22 +2429,22 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, int dummy_rate; int64_t dummy_dist; - BLOCK_SIZE i; - MACROBLOCK *x = &cpi->mb; + int i; if (sf->adaptive_pred_interp_filter) { - for (i = BLOCK_4X4; i < BLOCK_8X8; ++i) { - const int num_4x4_w = num_4x4_blocks_wide_lookup[i]; - const int num_4x4_h = num_4x4_blocks_high_lookup[i]; - const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h); - for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) - for (x->mb_index = 0; x->mb_index < 4; ++x->mb_index) - for (x->b_index = 0; x->b_index < 16 / num_4x4_blk; ++x->b_index) - get_block_context(x, i)->pred_interp_filter = SWITCHABLE; + for (i = 0; i < 64; ++i) + cpi->leaf_tree[i].pred_interp_filter = SWITCHABLE; + + for (i = 0; i < 64; ++i) { + cpi->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE; + cpi->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE; + cpi->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE; + cpi->pc_tree[i].horizontal[1].pred_interp_filter = SWITCHABLE; } } vp9_zero(cpi->mb.pred_mv); + cpi->pc_root->index = 0; if ((sf->partition_search_type == SEARCH_PARTITION && sf->use_lastframe_partitioning) || @@ -2514,36 +2452,44 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, sf->partition_search_type == VAR_BASED_PARTITION || sf->partition_search_type == VAR_BASED_FIXED_PARTITION) { const int idx_str = 
cm->mi_stride * mi_row + mi_col; - MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str; - MODE_INFO **prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str; + MODE_INFO **mi = cm->mi_grid_visible + idx_str; + MODE_INFO **prev_mi = cm->prev_mi_grid_visible + idx_str; cpi->mb.source_variance = UINT_MAX; if (sf->partition_search_type == FIXED_PARTITION) { set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64); - set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col, + set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col, sf->always_this_block_size); - rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rate, &dummy_dist, 1); - } else if (sf->partition_search_type == VAR_BASED_FIXED_PARTITION) { + rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64, + &dummy_rate, &dummy_dist, 1, cpi->pc_root); + } else if (cpi->skippable_frame || + sf->partition_search_type == VAR_BASED_FIXED_PARTITION) { BLOCK_SIZE bsize; set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64); bsize = get_rd_var_based_fixed_partition(cpi, mi_row, mi_col); - set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col, bsize); - rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rate, &dummy_dist, 1); + set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col, bsize); + rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64, + &dummy_rate, &dummy_dist, 1, cpi->pc_root); } else if (sf->partition_search_type == VAR_BASED_PARTITION) { choose_partitioning(cpi, tile, mi_row, mi_col); - rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rate, &dummy_dist, 1); + rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64, + &dummy_rate, &dummy_dist, 1, cpi->pc_root); } else { - if ((cm->current_video_frame + GF_GROUP * gf_grp = &cpi->twopass.gf_group; + int last_was_mid_sequence_overlay = 0; + if ((cpi->oxcf.pass == 2) && (gf_grp->index)) { + if (gf_grp->update_type[gf_grp->index - 1] == OVERLAY_UPDATE) + 
last_was_mid_sequence_overlay = 1; + } + if ((cpi->rc.frames_since_key % sf->last_partitioning_redo_frequency) == 0 + || last_was_mid_sequence_overlay || cm->prev_mi == 0 || cm->show_frame == 0 || cm->frame_type == KEY_FRAME || cpi->rc.is_src_frame_alt_ref || ((sf->use_lastframe_partitioning == LAST_FRAME_PARTITION_LOW_MOTION) && - sb_has_motion(cm, prev_mi_8x8))) { + sb_has_motion(cm, prev_mi, sf->lf_motion_threshold))) { // If required set upper and lower partition size limits if (sf->auto_min_max_partition_size) { set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64); @@ -2552,16 +2498,17 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, &sf->max_partition_size); } rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rate, &dummy_dist, 1, INT64_MAX); + &dummy_rate, &dummy_dist, INT64_MAX, + cpi->pc_root); } else { if (sf->constrain_copy_partition && - sb_has_motion(cm, prev_mi_8x8)) - constrain_copy_partitioning(cpi, tile, mi_8x8, prev_mi_8x8, + sb_has_motion(cm, prev_mi, sf->lf_motion_threshold)) + constrain_copy_partitioning(cpi, tile, mi, prev_mi, mi_row, mi_col, BLOCK_16X16); else - copy_partitioning(cm, mi_8x8, prev_mi_8x8); - rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rate, &dummy_dist, 1); + copy_partitioning(cm, mi, prev_mi); + rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64, + &dummy_rate, &dummy_dist, 1, cpi->pc_root); } } } else { @@ -2573,7 +2520,7 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, &sf->max_partition_size); } rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rate, &dummy_dist, 1, INT64_MAX); + &dummy_rate, &dummy_dist, INT64_MAX, cpi->pc_root); } } } @@ -2584,21 +2531,11 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) { MACROBLOCKD *const xd = &x->e_mbd; const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); - x->act_zbin_adj = 0; - // Copy data over into macro block data 
structures. vp9_setup_src_planes(x, cpi->Source, 0, 0); - // TODO(jkoleszar): are these initializations required? - vp9_setup_pre_planes(xd, 0, get_ref_frame_buffer(cpi, LAST_FRAME), 0, 0, - NULL); - vp9_setup_dst_planes(xd, get_frame_new_buffer(cm), 0, 0); - vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); - xd->mi[0]->mbmi.mode = DC_PRED; - xd->mi[0]->mbmi.uv_mode = DC_PRED; - // Note: this memset assumes above_context[0], [1] and [2] // are allocated as part of the same buffer. vpx_memset(xd->above_context[0], 0, @@ -2608,22 +2545,6 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) { sizeof(*xd->above_seg_context) * aligned_mi_cols); } -static void switch_lossless_mode(VP9_COMP *cpi, int lossless) { - if (lossless) { - // printf("Switching to lossless\n"); - cpi->mb.fwd_txm4x4 = vp9_fwht4x4; - cpi->mb.e_mbd.itxm_add = vp9_iwht4x4_add; - cpi->mb.optimize = 0; - cpi->common.lf.filter_level = 0; - cpi->zbin_mode_boost_enabled = 0; - cpi->common.tx_mode = ONLY_4X4; - } else { - // printf("Not lossless\n"); - cpi->mb.fwd_txm4x4 = vp9_fdct4x4; - cpi->mb.e_mbd.itxm_add = vp9_idct4x4_add; - } -} - static int check_dual_ref_flags(VP9_COMP *cpi) { const int ref_flags = cpi->ref_frame_flags; @@ -2635,15 +2556,15 @@ static int check_dual_ref_flags(VP9_COMP *cpi) { } } -static void reset_skip_txfm_size(VP9_COMMON *cm, TX_SIZE txfm_max) { +static void reset_skip_tx_size(VP9_COMMON *cm, TX_SIZE max_tx_size) { int mi_row, mi_col; const int mis = cm->mi_stride; MODE_INFO **mi_ptr = cm->mi_grid_visible; for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row, mi_ptr += mis) { for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { - if (mi_ptr[mi_col]->mbmi.tx_size > txfm_max) - mi_ptr[mi_col]->mbmi.tx_size = txfm_max; + if (mi_ptr[mi_col]->mbmi.tx_size > max_tx_size) + mi_ptr[mi_col]->mbmi.tx_size = max_tx_size; } } } @@ -2654,92 +2575,56 @@ static MV_REFERENCE_FRAME get_frame_type(const VP9_COMP *cpi) { else if (cpi->rc.is_src_frame_alt_ref && 
cpi->refresh_golden_frame) return ALTREF_FRAME; else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) - return LAST_FRAME; - else return GOLDEN_FRAME; + else + return LAST_FRAME; } static TX_MODE select_tx_mode(const VP9_COMP *cpi) { - if (cpi->oxcf.lossless) { + if (cpi->mb.e_mbd.lossless) return ONLY_4X4; - } else if (cpi->common.current_video_frame == 0) { + if (cpi->common.frame_type == KEY_FRAME) return TX_MODE_SELECT; - } else { - if (cpi->sf.tx_size_search_method == USE_LARGESTALL) { - return ALLOW_32X32; - } else if (cpi->sf.tx_size_search_method == USE_FULL_RD) { - const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi); - return cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32] > - cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ? - ALLOW_32X32 : TX_MODE_SELECT; - } else { - unsigned int total = 0; - int i; - for (i = 0; i < TX_SIZES; ++i) - total += cpi->tx_stepdown_count[i]; - - if (total) { - const double fraction = (double)cpi->tx_stepdown_count[0] / total; - return fraction > 0.90 ? 
ALLOW_32X32 : TX_MODE_SELECT; - } else { - return cpi->common.tx_mode; - } - } - } -} - -// Start RTC Exploration -typedef enum { - BOTH_ZERO = 0, - ZERO_PLUS_PREDICTED = 1, - BOTH_PREDICTED = 2, - NEW_PLUS_NON_INTRA = 3, - BOTH_NEW = 4, - INTRA_PLUS_NON_INTRA = 5, - BOTH_INTRA = 6, - INVALID_CASE = 9 -} motion_vector_context; - -static void set_mode_info(MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, - MB_PREDICTION_MODE mode) { - mbmi->mode = mode; - mbmi->uv_mode = mode; - mbmi->mv[0].as_int = 0; - mbmi->mv[1].as_int = 0; - mbmi->ref_frame[0] = INTRA_FRAME; - mbmi->ref_frame[1] = NONE; - mbmi->tx_size = max_txsize_lookup[bsize]; - mbmi->skip = 0; - mbmi->sb_type = bsize; - mbmi->segment_id = 0; + if (cpi->sf.tx_size_search_method == USE_LARGESTALL) + return ALLOW_32X32; + else if (cpi->sf.tx_size_search_method == USE_FULL_RD|| + cpi->sf.tx_size_search_method == USE_TX_8X8) + return TX_MODE_SELECT; + else + return cpi->common.tx_mode; } static void nonrd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, int mi_row, int mi_col, int *rate, int64_t *dist, - BLOCK_SIZE bsize) { + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi; set_offsets(cpi, tile, mi_row, mi_col, bsize); - xd->mi[0]->mbmi.sb_type = bsize; + mbmi = &xd->mi[0]->mbmi; + mbmi->sb_type = bsize; + + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) + if (mbmi->segment_id && x->in_static_area) + x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh); + + if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) + set_mode_info_seg_skip(x, cm->tx_mode, rate, dist, bsize); + else + vp9_pick_inter_mode(cpi, x, tile, mi_row, mi_col, rate, dist, bsize, ctx); - if (!frame_is_intra_only(cm)) { - vp9_pick_inter_mode(cpi, x, tile, mi_row, mi_col, - rate, dist, bsize); - } else { - MB_PREDICTION_MODE intramode = DC_PRED; - set_mode_info(&xd->mi[0]->mbmi, bsize, 
intramode); - } duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize); } static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x, int mi_row, int mi_col, - BLOCK_SIZE bsize, BLOCK_SIZE subsize) { + BLOCK_SIZE bsize, BLOCK_SIZE subsize, + PC_TREE *pc_tree) { MACROBLOCKD *xd = &x->e_mbd; int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4; - PARTITION_TYPE partition = partition_lookup[bsl][subsize]; + PARTITION_TYPE partition = pc_tree->partitioning; assert(bsize >= BLOCK_8X8); @@ -2749,48 +2634,42 @@ static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x, switch (partition) { case PARTITION_NONE: set_modeinfo_offsets(cm, xd, mi_row, mi_col); - *(xd->mi[0]) = get_block_context(x, subsize)->mic; + *(xd->mi[0]) = pc_tree->none.mic; duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize); break; case PARTITION_VERT: - *get_sb_index(x, subsize) = 0; set_modeinfo_offsets(cm, xd, mi_row, mi_col); - *(xd->mi[0]) = get_block_context(x, subsize)->mic; + *(xd->mi[0]) = pc_tree->vertical[0].mic; duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize); if (mi_col + hbs < cm->mi_cols) { - *get_sb_index(x, subsize) = 1; set_modeinfo_offsets(cm, xd, mi_row, mi_col + hbs); - *(xd->mi[0]) = get_block_context(x, subsize)->mic; + *(xd->mi[0]) = pc_tree->vertical[1].mic; duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col + hbs, bsize); } break; case PARTITION_HORZ: - *get_sb_index(x, subsize) = 0; set_modeinfo_offsets(cm, xd, mi_row, mi_col); - *(xd->mi[0]) = get_block_context(x, subsize)->mic; + *(xd->mi[0]) = pc_tree->horizontal[0].mic; duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize); if (mi_row + hbs < cm->mi_rows) { - *get_sb_index(x, subsize) = 1; set_modeinfo_offsets(cm, xd, mi_row + hbs, mi_col); - *(xd->mi[0]) = get_block_context(x, subsize)->mic; + *(xd->mi[0]) = pc_tree->horizontal[1].mic; duplicate_mode_info_in_sb(cm, xd, mi_row + hbs, mi_col, bsize); } break; - case PARTITION_SPLIT: - *get_sb_index(x, subsize) = 0; + case PARTITION_SPLIT: { + BLOCK_SIZE 
subsubsize = get_subsize(subsize, PARTITION_SPLIT); fill_mode_info_sb(cm, x, mi_row, mi_col, subsize, - *(get_sb_partitioning(x, subsize))); - *get_sb_index(x, subsize) = 1; + subsubsize, pc_tree->split[0]); fill_mode_info_sb(cm, x, mi_row, mi_col + hbs, subsize, - *(get_sb_partitioning(x, subsize))); - *get_sb_index(x, subsize) = 2; + subsubsize, pc_tree->split[1]); fill_mode_info_sb(cm, x, mi_row + hbs, mi_col, subsize, - *(get_sb_partitioning(x, subsize))); - *get_sb_index(x, subsize) = 3; + subsubsize, pc_tree->split[2]); fill_mode_info_sb(cm, x, mi_row + hbs, mi_col + hbs, subsize, - *(get_sb_partitioning(x, subsize))); + subsubsize, pc_tree->split[3]); break; + } default: break; } @@ -2799,15 +2678,18 @@ static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x, static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rate, - int64_t *dist, int do_recon, int64_t best_rd) { + int64_t *dist, int do_recon, int64_t best_rd, + PC_TREE *pc_tree) { + const SPEED_FEATURES *const sf = &cpi->sf; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; const int ms = num_8x8_blocks_wide_lookup[bsize] / 2; TOKENEXTRA *tp_orig = *tp; - PICK_MODE_CONTEXT *ctx = get_block_context(x, bsize); + PICK_MODE_CONTEXT *ctx = &pc_tree->none; int i; - BLOCK_SIZE subsize; + BLOCK_SIZE subsize = bsize; int this_rate, sum_rate = 0, best_rate = INT_MAX; int64_t this_dist, sum_dist = 0, best_dist = INT64_MAX; int64_t sum_rd = 0; @@ -2826,51 +2708,38 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, bsize >= BLOCK_8X8; (void) *tp_orig; - if (bsize < BLOCK_8X8) { - // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 - // there is nothing to be done. 
- if (x->ab_index != 0) { - *rate = 0; - *dist = 0; - return; - } - } - assert(num_8x8_blocks_wide_lookup[bsize] == num_8x8_blocks_high_lookup[bsize]); - x->in_active_map = check_active_map(cpi, x, mi_row, mi_col, bsize); - // Determine partition types in search according to the speed features. // The threshold set here has to be of square block size. - if (cpi->sf.auto_min_max_partition_size) { - partition_none_allowed &= (bsize <= cpi->sf.max_partition_size && - bsize >= cpi->sf.min_partition_size); - partition_horz_allowed &= ((bsize <= cpi->sf.max_partition_size && - bsize > cpi->sf.min_partition_size) || + if (sf->auto_min_max_partition_size) { + partition_none_allowed &= (bsize <= sf->max_partition_size && + bsize >= sf->min_partition_size); + partition_horz_allowed &= ((bsize <= sf->max_partition_size && + bsize > sf->min_partition_size) || force_horz_split); - partition_vert_allowed &= ((bsize <= cpi->sf.max_partition_size && - bsize > cpi->sf.min_partition_size) || + partition_vert_allowed &= ((bsize <= sf->max_partition_size && + bsize > sf->min_partition_size) || force_vert_split); - do_split &= bsize > cpi->sf.min_partition_size; + do_split &= bsize > sf->min_partition_size; } - if (cpi->sf.use_square_partition_only) { + if (sf->use_square_partition_only) { partition_horz_allowed &= force_horz_split; partition_vert_allowed &= force_vert_split; } - if (!x->in_active_map && (partition_horz_allowed || partition_vert_allowed)) - do_split = 0; - // PARTITION_NONE if (partition_none_allowed) { nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, - &this_rate, &this_dist, bsize); + &this_rate, &this_dist, bsize, ctx); ctx->mic.mbmi = xd->mi[0]->mbmi; + ctx->skip_txfm[0] = x->skip_txfm[0]; + ctx->skip = x->skip; if (this_rate != INT_MAX) { int pl = partition_plane_context(xd, mi_row, mi_col, bsize); - this_rate += x->partition_cost[pl][PARTITION_NONE]; + this_rate += cpi->partition_cost[pl][PARTITION_NONE]; sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist); 
if (sum_rd < best_rd) { int64_t stop_thresh = 4096; @@ -2880,11 +2749,11 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, best_dist = this_dist; best_rd = sum_rd; if (bsize >= BLOCK_8X8) - *(get_sb_partitioning(x, bsize)) = bsize; + pc_tree->partitioning = PARTITION_NONE; // Adjust threshold according to partition size. - stop_thresh >>= 8 - (b_width_log2_lookup[bsize] + - b_height_log2_lookup[bsize]); + stop_thresh >>= 8 - (b_width_log2(bsize) + + b_height_log2(bsize)); stop_thresh_rd = RDCOST(x->rdmult, x->rddiv, 0, stop_thresh); // If obtained distortion is very small, choose current partition @@ -2895,10 +2764,6 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, } } } - if (!x->in_active_map) { - do_split = 0; - do_rect = 0; - } } // store estimated motion vector @@ -2908,7 +2773,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, sum_rd = 0; if (do_split) { int pl = partition_plane_context(xd, mi_row, mi_col, bsize); - sum_rate += x->partition_cost[pl][PARTITION_SPLIT]; + sum_rate += cpi->partition_cost[pl][PARTITION_SPLIT]; subsize = get_subsize(bsize, PARTITION_SPLIT); for (i = 0; i < 4 && sum_rd < best_rd; ++i) { const int x_idx = (i & 1) * ms; @@ -2916,13 +2781,10 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols) continue; - - *get_sb_index(x, subsize) = i; load_pred_mv(x, ctx); - nonrd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, subsize, &this_rate, &this_dist, 0, - best_rd - sum_rd); + best_rd - sum_rd, pc_tree->split[i]); if (this_rate == INT_MAX) { sum_rd = INT64_MAX; @@ -2937,11 +2799,11 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, best_rate = sum_rate; best_dist = sum_dist; best_rd = sum_rd; - *(get_sb_partitioning(x, bsize)) = subsize; + pc_tree->partitioning = PARTITION_SPLIT; } else { // skip rectangular partition 
test when larger block size // gives better rd cost - if (cpi->sf.less_rectangular_check) + if (sf->less_rectangular_check) do_rect &= !partition_none_allowed; } } @@ -2949,32 +2811,34 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, // PARTITION_HORZ if (partition_horz_allowed && do_rect) { subsize = get_subsize(bsize, PARTITION_HORZ); - *get_sb_index(x, subsize) = 0; - if (cpi->sf.adaptive_motion_search) + if (sf->adaptive_motion_search) load_pred_mv(x, ctx); nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, - &this_rate, &this_dist, subsize); + &this_rate, &this_dist, subsize, + &pc_tree->horizontal[0]); - get_block_context(x, subsize)->mic.mbmi = xd->mi[0]->mbmi; + pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi; + pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0]; + pc_tree->horizontal[0].skip = x->skip; sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); if (sum_rd < best_rd && mi_row + ms < cm->mi_rows) { - *get_sb_index(x, subsize) = 1; - load_pred_mv(x, ctx); - nonrd_pick_sb_modes(cpi, tile, mi_row + ms, mi_col, - &this_rate, &this_dist, subsize); + &this_rate, &this_dist, subsize, + &pc_tree->horizontal[1]); - get_block_context(x, subsize)->mic.mbmi = xd->mi[0]->mbmi; + pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi; + pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0]; + pc_tree->horizontal[1].skip = x->skip; if (this_rate == INT_MAX) { sum_rd = INT64_MAX; } else { int pl = partition_plane_context(xd, mi_row, mi_col, bsize); - this_rate += x->partition_cost[pl][PARTITION_HORZ]; + this_rate += cpi->partition_cost[pl][PARTITION_HORZ]; sum_rate += this_rate; sum_dist += this_dist; sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); @@ -2984,7 +2848,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, best_rd = sum_rd; best_rate = sum_rate; best_dist = sum_dist; - *(get_sb_partitioning(x, bsize)) = subsize; + pc_tree->partitioning = PARTITION_HORZ; } } @@ -2992,29 +2856,29 @@ 
static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, if (partition_vert_allowed && do_rect) { subsize = get_subsize(bsize, PARTITION_VERT); - *get_sb_index(x, subsize) = 0; - if (cpi->sf.adaptive_motion_search) + if (sf->adaptive_motion_search) load_pred_mv(x, ctx); nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, - &this_rate, &this_dist, subsize); - get_block_context(x, subsize)->mic.mbmi = xd->mi[0]->mbmi; + &this_rate, &this_dist, subsize, + &pc_tree->vertical[0]); + pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi; + pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0]; + pc_tree->vertical[0].skip = x->skip; sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) { - *get_sb_index(x, subsize) = 1; - load_pred_mv(x, ctx); - nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + ms, - &this_rate, &this_dist, subsize); - - get_block_context(x, subsize)->mic.mbmi = xd->mi[0]->mbmi; - + &this_rate, &this_dist, subsize, + &pc_tree->vertical[1]); + pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi; + pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0]; + pc_tree->vertical[1].skip = x->skip; if (this_rate == INT_MAX) { sum_rd = INT64_MAX; } else { int pl = partition_plane_context(xd, mi_row, mi_col, bsize); - this_rate += x->partition_cost[pl][PARTITION_VERT]; + this_rate += cpi->partition_cost[pl][PARTITION_VERT]; sum_rate += this_rate; sum_dist += this_dist; sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); @@ -3024,9 +2888,13 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, best_rate = sum_rate; best_dist = sum_dist; best_rd = sum_rd; - *(get_sb_partitioning(x, bsize)) = subsize; + pc_tree->partitioning = PARTITION_VERT; } } + // TODO(JBB): The following line is here just to avoid a static warning + // that occurs because at this point we never again reuse best_rd + // despite setting it here. The code should be refactored to avoid this. 
+ (void) best_rd; *rate = best_rate; *dist = best_dist; @@ -3035,8 +2903,9 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, return; // update mode info array - fill_mode_info_sb(cm, x, mi_row, mi_col, bsize, - *(get_sb_partitioning(x, bsize))); + subsize = get_subsize(bsize, pc_tree->partitioning); + fill_mode_info_sb(cm, x, mi_row, mi_col, bsize, subsize, + pc_tree); if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon) { int output_enabled = (bsize == BLOCK_64X64); @@ -3044,16 +2913,16 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, // Check the projected output rate for this SB against it's target // and and if necessary apply a Q delta using segmentation to get // closer to the target. - if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) { + if ((oxcf->aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) { vp9_select_in_frame_q_segment(cpi, mi_row, mi_col, output_enabled, best_rate); } - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + if (oxcf->aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh, best_rate, best_dist); - encode_sb_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize); + encode_sb_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize, pc_tree); } if (bsize == BLOCK_64X64) { @@ -3067,11 +2936,12 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, static void nonrd_use_partition(VP9_COMP *cpi, const TileInfo *const tile, - MODE_INFO **mi_8x8, + MODE_INFO **mi, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, int output_enabled, - int *totrate, int64_t *totdist) { + int *totrate, int64_t *totdist, + PC_TREE *pc_tree) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -3085,23 +2955,29 @@ static void nonrd_use_partition(VP9_COMP *cpi, if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - subsize = (bsize >= BLOCK_8X8) ? 
mi_8x8[0]->mbmi.sb_type : BLOCK_4X4; + subsize = (bsize >= BLOCK_8X8) ? mi[0]->mbmi.sb_type : BLOCK_4X4; partition = partition_lookup[bsl][subsize]; switch (partition) { case PARTITION_NONE: - nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, subsize); - get_block_context(x, subsize)->mic.mbmi = xd->mi[0]->mbmi; + nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, + subsize, &pc_tree->none); + pc_tree->none.mic.mbmi = xd->mi[0]->mbmi; + pc_tree->none.skip_txfm[0] = x->skip_txfm[0]; + pc_tree->none.skip = x->skip; break; case PARTITION_VERT: - *get_sb_index(x, subsize) = 0; - nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, subsize); - get_block_context(x, subsize)->mic.mbmi = xd->mi[0]->mbmi; + nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, + subsize, &pc_tree->vertical[0]); + pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi; + pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0]; + pc_tree->vertical[0].skip = x->skip; if (mi_col + hbs < cm->mi_cols) { - *get_sb_index(x, subsize) = 1; nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + hbs, - &rate, &dist, subsize); - get_block_context(x, subsize)->mic.mbmi = xd->mi[0]->mbmi; + &rate, &dist, subsize, &pc_tree->vertical[1]); + pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi; + pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0]; + pc_tree->vertical[1].skip = x->skip; if (rate != INT_MAX && dist != INT64_MAX && *totrate != INT_MAX && *totdist != INT64_MAX) { *totrate += rate; @@ -3110,14 +2986,17 @@ static void nonrd_use_partition(VP9_COMP *cpi, } break; case PARTITION_HORZ: - *get_sb_index(x, subsize) = 0; - nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, subsize); - get_block_context(x, subsize)->mic.mbmi = xd->mi[0]->mbmi; + nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, + subsize, &pc_tree->horizontal[0]); + pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi; + pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0]; + 
pc_tree->horizontal[0].skip = x->skip; if (mi_row + hbs < cm->mi_rows) { - *get_sb_index(x, subsize) = 1; nonrd_pick_sb_modes(cpi, tile, mi_row + hbs, mi_col, - &rate, &dist, subsize); - get_block_context(x, subsize)->mic.mbmi = mi_8x8[0]->mbmi; + &rate, &dist, subsize, &pc_tree->horizontal[0]); + pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi; + pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0]; + pc_tree->horizontal[1].skip = x->skip; if (rate != INT_MAX && dist != INT64_MAX && *totrate != INT_MAX && *totdist != INT64_MAX) { *totrate += rate; @@ -3127,31 +3006,28 @@ static void nonrd_use_partition(VP9_COMP *cpi, break; case PARTITION_SPLIT: subsize = get_subsize(bsize, PARTITION_SPLIT); - *get_sb_index(x, subsize) = 0; - nonrd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, - subsize, output_enabled, totrate, totdist); - *get_sb_index(x, subsize) = 1; - nonrd_use_partition(cpi, tile, mi_8x8 + hbs, tp, + nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, + subsize, output_enabled, totrate, totdist, + pc_tree->split[0]); + nonrd_use_partition(cpi, tile, mi + hbs, tp, mi_row, mi_col + hbs, subsize, output_enabled, - &rate, &dist); + &rate, &dist, pc_tree->split[1]); if (rate != INT_MAX && dist != INT64_MAX && *totrate != INT_MAX && *totdist != INT64_MAX) { *totrate += rate; *totdist += dist; } - *get_sb_index(x, subsize) = 2; - nonrd_use_partition(cpi, tile, mi_8x8 + hbs * mis, tp, + nonrd_use_partition(cpi, tile, mi + hbs * mis, tp, mi_row + hbs, mi_col, subsize, output_enabled, - &rate, &dist); + &rate, &dist, pc_tree->split[2]); if (rate != INT_MAX && dist != INT64_MAX && *totrate != INT_MAX && *totdist != INT64_MAX) { *totrate += rate; *totdist += dist; } - *get_sb_index(x, subsize) = 3; - nonrd_use_partition(cpi, tile, mi_8x8 + hbs * mis + hbs, tp, + nonrd_use_partition(cpi, tile, mi + hbs * mis + hbs, tp, mi_row + hbs, mi_col + hbs, subsize, output_enabled, - &rate, &dist); + &rate, &dist, pc_tree->split[3]); if (rate != INT_MAX && dist != 
INT64_MAX && *totrate != INT_MAX && *totdist != INT64_MAX) { *totrate += rate; @@ -3160,20 +3036,23 @@ static void nonrd_use_partition(VP9_COMP *cpi, break; default: assert("Invalid partition type."); + break; } if (bsize == BLOCK_64X64 && output_enabled) { if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh, *totrate, *totdist); - encode_sb_rt(cpi, tile, tp, mi_row, mi_col, 1, bsize); + encode_sb_rt(cpi, tile, tp, mi_row, mi_col, 1, bsize, pc_tree); } } static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, int mi_row, TOKENEXTRA **tp) { - VP9_COMMON *cm = &cpi->common; - MACROBLOCKD *xd = &cpi->mb.e_mbd; + SPEED_FEATURES *const sf = &cpi->sf; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; + MACROBLOCKD *const xd = &x->e_mbd; int mi_col; // Initialize the left context for the new SB row @@ -3186,54 +3065,206 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, int dummy_rate = 0; int64_t dummy_dist = 0; const int idx_str = cm->mi_stride * mi_row + mi_col; - MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str; - MODE_INFO **prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str; + MODE_INFO **mi = cm->mi_grid_visible + idx_str; + MODE_INFO **prev_mi = cm->prev_mi_grid_visible + idx_str; BLOCK_SIZE bsize; - cpi->mb.source_variance = UINT_MAX; - vp9_zero(cpi->mb.pred_mv); + x->in_static_area = 0; + x->source_variance = UINT_MAX; + vp9_zero(x->pred_mv); // Set the partition type of the 64X64 block - switch (cpi->sf.partition_search_type) { + switch (sf->partition_search_type) { case VAR_BASED_PARTITION: choose_partitioning(cpi, tile, mi_row, mi_col); - nonrd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, - 1, &dummy_rate, &dummy_dist); + nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64, + 1, &dummy_rate, &dummy_dist, cpi->pc_root); break; case SOURCE_VAR_BASED_PARTITION: - set_offsets(cpi, tile, mi_row, mi_col, 
BLOCK_64X64); - set_source_var_based_partition(cpi, tile, mi_8x8, mi_row, mi_col); - nonrd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, - 1, &dummy_rate, &dummy_dist); + set_source_var_based_partition(cpi, tile, mi, mi_row, mi_col); + nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64, + 1, &dummy_rate, &dummy_dist, cpi->pc_root); break; case VAR_BASED_FIXED_PARTITION: case FIXED_PARTITION: - bsize = cpi->sf.partition_search_type == FIXED_PARTITION ? - cpi->sf.always_this_block_size : + bsize = sf->partition_search_type == FIXED_PARTITION ? + sf->always_this_block_size : get_nonrd_var_based_fixed_partition(cpi, mi_row, mi_col); - set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col, bsize); - nonrd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, - 1, &dummy_rate, &dummy_dist); + set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col, bsize); + nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64, + 1, &dummy_rate, &dummy_dist, cpi->pc_root); break; case REFERENCE_PARTITION: - if (cpi->sf.partition_check || sb_has_motion(cm, prev_mi_8x8)) { + if (sf->partition_check || + !is_background(cpi, tile, mi_row, mi_col)) { + set_modeinfo_offsets(cm, xd, mi_row, mi_col); + auto_partition_range(cpi, tile, mi_row, mi_col, + &sf->min_partition_size, + &sf->max_partition_size); nonrd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rate, &dummy_dist, 1, INT64_MAX); + &dummy_rate, &dummy_dist, 1, INT64_MAX, + cpi->pc_root); } else { - copy_partitioning(cm, mi_8x8, prev_mi_8x8); - nonrd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, - BLOCK_64X64, 1, &dummy_rate, &dummy_dist); + copy_partitioning(cm, mi, prev_mi); + nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, + BLOCK_64X64, 1, &dummy_rate, &dummy_dist, + cpi->pc_root); } break; default: assert(0); + break; } } } // end RTC play code +static int set_var_thresh_from_histogram(VP9_COMP *cpi) { + const SPEED_FEATURES *const sf = 
&cpi->sf; + const VP9_COMMON *const cm = &cpi->common; + + const uint8_t *src = cpi->Source->y_buffer; + const uint8_t *last_src = cpi->Last_Source->y_buffer; + const int src_stride = cpi->Source->y_stride; + const int last_stride = cpi->Last_Source->y_stride; + + // Pick cutoff threshold + const int cutoff = (MIN(cm->width, cm->height) >= 720) ? + (cm->MBs * VAR_HIST_LARGE_CUT_OFF / 100) : + (cm->MBs * VAR_HIST_SMALL_CUT_OFF / 100); + DECLARE_ALIGNED_ARRAY(16, int, hist, VAR_HIST_BINS); + diff *var16 = cpi->source_diff_var; + + int sum = 0; + int i, j; + + vpx_memset(hist, 0, VAR_HIST_BINS * sizeof(hist[0])); + + for (i = 0; i < cm->mb_rows; i++) { + for (j = 0; j < cm->mb_cols; j++) { + vp9_get16x16var(src, src_stride, last_src, last_stride, + &var16->sse, &var16->sum); + + var16->var = var16->sse - + (((uint32_t)var16->sum * var16->sum) >> 8); + + if (var16->var >= VAR_HIST_MAX_BG_VAR) + hist[VAR_HIST_BINS - 1]++; + else + hist[var16->var / VAR_HIST_FACTOR]++; + + src += 16; + last_src += 16; + var16++; + } + + src = src - cm->mb_cols * 16 + 16 * src_stride; + last_src = last_src - cm->mb_cols * 16 + 16 * last_stride; + } + + cpi->source_var_thresh = 0; + + if (hist[VAR_HIST_BINS - 1] < cutoff) { + for (i = 0; i < VAR_HIST_BINS - 1; i++) { + sum += hist[i]; + + if (sum > cutoff) { + cpi->source_var_thresh = (i + 1) * VAR_HIST_FACTOR; + return 0; + } + } + } + + return sf->search_type_check_frequency; +} + +static void source_var_based_partition_search_method(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + SPEED_FEATURES *const sf = &cpi->sf; + + if (cm->frame_type == KEY_FRAME) { + // For key frame, use SEARCH_PARTITION. 
+ sf->partition_search_type = SEARCH_PARTITION; + } else if (cm->intra_only) { + sf->partition_search_type = FIXED_PARTITION; + } else { + if (cm->last_width != cm->width || cm->last_height != cm->height) { + if (cpi->source_diff_var) + vpx_free(cpi->source_diff_var); + + CHECK_MEM_ERROR(cm, cpi->source_diff_var, + vpx_calloc(cm->MBs, sizeof(diff))); + } + + if (!cpi->frames_till_next_var_check) + cpi->frames_till_next_var_check = set_var_thresh_from_histogram(cpi); + + if (cpi->frames_till_next_var_check > 0) { + sf->partition_search_type = FIXED_PARTITION; + cpi->frames_till_next_var_check--; + } + } +} + +static int get_skip_encode_frame(const VP9_COMMON *cm) { + unsigned int intra_count = 0, inter_count = 0; + int j; + + for (j = 0; j < INTRA_INTER_CONTEXTS; ++j) { + intra_count += cm->counts.intra_inter[j][0]; + inter_count += cm->counts.intra_inter[j][1]; + } + + return (intra_count << 2) < inter_count && + cm->frame_type != KEY_FRAME && + cm->show_frame; +} + +static void encode_tiles(VP9_COMP *cpi) { + const VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + int tile_col, tile_row; + TOKENEXTRA *tok = cpi->tok; + + for (tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileInfo tile; + TOKENEXTRA *old_tok = tok; + int mi_row; + + vp9_tile_init(&tile, cm, tile_row, tile_col); + for (mi_row = tile.mi_row_start; mi_row < tile.mi_row_end; + mi_row += MI_BLOCK_SIZE) { + if (cpi->sf.use_nonrd_pick_mode && !frame_is_intra_only(cm)) + encode_nonrd_sb_row(cpi, &tile, mi_row, &tok); + else + encode_rd_sb_row(cpi, &tile, mi_row, &tok); + } + cpi->tok_count[tile_row][tile_col] = (unsigned int)(tok - old_tok); + assert(tok - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols)); + } + } +} + +#if CONFIG_FP_MB_STATS +static int input_fpmb_stats(FIRSTPASS_MB_STATS *firstpass_mb_stats, + VP9_COMMON *cm, uint8_t 
**this_frame_mb_stats) { + uint8_t *mb_stats_in = firstpass_mb_stats->mb_stats_start + + cm->current_video_frame * cm->MBs * sizeof(uint8_t); + + if (mb_stats_in > firstpass_mb_stats->mb_stats_end) + return EOF; + + *this_frame_mb_stats = mb_stats_in; + + return 1; +} +#endif + static void encode_frame_internal(VP9_COMP *cpi) { SPEED_FEATURES *const sf = &cpi->sf; + RD_OPT *const rd_opt = &cpi->rd; MACROBLOCK *const x = &cpi->mb; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; @@ -3244,37 +3275,43 @@ static void encode_frame_internal(VP9_COMP *cpi) { vp9_zero(cm->counts); vp9_zero(cpi->coef_counts); vp9_zero(cpi->tx_stepdown_count); - vp9_zero(cpi->rd_comp_pred_diff); - vp9_zero(cpi->rd_filter_diff); - vp9_zero(cpi->rd_tx_select_diff); - vp9_zero(cpi->rd_tx_select_threshes); + vp9_zero(rd_opt->comp_pred_diff); + vp9_zero(rd_opt->filter_diff); + vp9_zero(rd_opt->tx_select_diff); + vp9_zero(rd_opt->tx_select_threshes); + + xd->lossless = cm->base_qindex == 0 && + cm->y_dc_delta_q == 0 && + cm->uv_dc_delta_q == 0 && + cm->uv_ac_delta_q == 0; cm->tx_mode = select_tx_mode(cpi); - cpi->mb.e_mbd.lossless = cm->base_qindex == 0 && - cm->y_dc_delta_q == 0 && - cm->uv_dc_delta_q == 0 && - cm->uv_ac_delta_q == 0; - switch_lossless_mode(cpi, cpi->mb.e_mbd.lossless); + x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4; + x->itxm_add = xd->lossless ? 
vp9_iwht4x4_add : vp9_idct4x4_add; + + if (xd->lossless) { + x->optimize = 0; + cm->lf.filter_level = 0; + cpi->zbin_mode_boost_enabled = 0; + } vp9_frame_init_quantizer(cpi); vp9_initialize_rd_consts(cpi); vp9_initialize_me_consts(cpi, cm->base_qindex); init_encode_frame_mb_context(cpi); + set_prev_mi(cm); - if (cpi->oxcf.tuning == VP8_TUNE_SSIM) - build_activity_map(cpi); - - cm->prev_mi = get_prev_mi(cm); - + x->quant_fp = cpi->sf.use_quant_fp; + vp9_zero(x->skip_txfm); if (sf->use_nonrd_pick_mode) { // Initialize internal buffer pointers for rtc coding, where non-RD // mode decision is used and hence no buffer pointer swap needed. int i; struct macroblock_plane *const p = x->plane; struct macroblockd_plane *const pd = xd->plane; - PICK_MODE_CONTEXT *ctx = &cpi->mb.sb64_context; + PICK_MODE_CONTEXT *ctx = &cpi->pc_root->none; for (i = 0; i < MAX_MB_PLANE; ++i) { p[i].coeff = ctx->coeff_pbuf[i][0]; @@ -3284,79 +3321,28 @@ static void encode_frame_internal(VP9_COMP *cpi) { } vp9_zero(x->zcoeff_blk); - if (cpi->sf.partition_search_type == SOURCE_VAR_BASED_PARTITION && - cm->current_video_frame > 0) { - int check_freq = cpi->sf.search_type_check_frequency; - - if ((cm->current_video_frame - 1) % check_freq == 0) { - cpi->use_large_partition_rate = 0; - } - - if ((cm->current_video_frame - 1) % check_freq == 1) { - const int mbs_in_b32x32 = 1 << ((b_width_log2_lookup[BLOCK_32X32] - - b_width_log2_lookup[BLOCK_16X16]) + - (b_height_log2_lookup[BLOCK_32X32] - - b_height_log2_lookup[BLOCK_16X16])); - cpi->use_large_partition_rate = cpi->use_large_partition_rate * 100 * - mbs_in_b32x32 / cm->MBs; - } - - if ((cm->current_video_frame - 1) % check_freq >= 1) { - if (cpi->use_large_partition_rate < 15) - cpi->sf.partition_search_type = FIXED_PARTITION; - } - } + if (sf->partition_search_type == SOURCE_VAR_BASED_PARTITION) + source_var_based_partition_search_method(cpi); } { struct vpx_usec_timer emr_timer; vpx_usec_timer_start(&emr_timer); - { - // Take tiles into account 
and give start/end MB - int tile_col, tile_row; - TOKENEXTRA *tp = cpi->tok; - const int tile_cols = 1 << cm->log2_tile_cols; - const int tile_rows = 1 << cm->log2_tile_rows; - - for (tile_row = 0; tile_row < tile_rows; tile_row++) { - for (tile_col = 0; tile_col < tile_cols; tile_col++) { - TileInfo tile; - TOKENEXTRA *tp_old = tp; - int mi_row; - - // For each row of SBs in the frame - vp9_tile_init(&tile, cm, tile_row, tile_col); - for (mi_row = tile.mi_row_start; - mi_row < tile.mi_row_end; mi_row += MI_BLOCK_SIZE) { - if (sf->use_nonrd_pick_mode && cm->frame_type != KEY_FRAME) - encode_nonrd_sb_row(cpi, &tile, mi_row, &tp); - else - encode_rd_sb_row(cpi, &tile, mi_row, &tp); - } - cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old); - assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols)); - } - } - } +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + input_fpmb_stats(&cpi->twopass.firstpass_mb_stats, cm, + &cpi->twopass.this_frame_mb_stats); + } +#endif + + encode_tiles(cpi); vpx_usec_timer_mark(&emr_timer); cpi->time_encode_sb_row += vpx_usec_timer_elapsed(&emr_timer); } - if (sf->skip_encode_sb) { - int j; - unsigned int intra_count = 0, inter_count = 0; - for (j = 0; j < INTRA_INTER_CONTEXTS; ++j) { - intra_count += cm->counts.intra_inter[j][0]; - inter_count += cm->counts.intra_inter[j][1]; - } - sf->skip_encode_frame = (intra_count << 2) < inter_count && - cm->frame_type != KEY_FRAME && - cm->show_frame; - } else { - sf->skip_encode_frame = 0; - } + sf->skip_encode_frame = sf->skip_encode_sb ? 
get_skip_encode_frame(cm) : 0; #if 0 // Keep record of the total distortion this time around for future use @@ -3364,8 +3350,26 @@ static void encode_frame_internal(VP9_COMP *cpi) { #endif } +static INTERP_FILTER get_interp_filter( + const int64_t threshes[SWITCHABLE_FILTER_CONTEXTS], int is_alt_ref) { + if (!is_alt_ref && + threshes[EIGHTTAP_SMOOTH] > threshes[EIGHTTAP] && + threshes[EIGHTTAP_SMOOTH] > threshes[EIGHTTAP_SHARP] && + threshes[EIGHTTAP_SMOOTH] > threshes[SWITCHABLE - 1]) { + return EIGHTTAP_SMOOTH; + } else if (threshes[EIGHTTAP_SHARP] > threshes[EIGHTTAP] && + threshes[EIGHTTAP_SHARP] > threshes[SWITCHABLE - 1]) { + return EIGHTTAP_SHARP; + } else if (threshes[EIGHTTAP] > threshes[SWITCHABLE - 1]) { + return EIGHTTAP; + } else { + return SWITCHABLE; + } +} + void vp9_encode_frame(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; + RD_OPT *const rd_opt = &cpi->rd; // In the longer term the encoder should be generalized to match the // decoder such that we allow compound where one of the 3 buffers has a @@ -3398,59 +3402,41 @@ void vp9_encode_frame(VP9_COMP *cpi) { // that for subsequent frames. // It does the same analysis for transform size selection also. 
const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi); - const int64_t *mode_thresh = cpi->rd_prediction_type_threshes[frame_type]; - const int64_t *filter_thresh = cpi->rd_filter_threshes[frame_type]; + int64_t *const mode_thrs = rd_opt->prediction_type_threshes[frame_type]; + int64_t *const filter_thrs = rd_opt->filter_threshes[frame_type]; + int *const tx_thrs = rd_opt->tx_select_threshes[frame_type]; + const int is_alt_ref = frame_type == ALTREF_FRAME; /* prediction (compound, single or hybrid) mode selection */ - if (frame_type == ALTREF_FRAME || !cm->allow_comp_inter_inter) + if (is_alt_ref || !cm->allow_comp_inter_inter) cm->reference_mode = SINGLE_REFERENCE; - else if (mode_thresh[COMPOUND_REFERENCE] > mode_thresh[SINGLE_REFERENCE] && - mode_thresh[COMPOUND_REFERENCE] > - mode_thresh[REFERENCE_MODE_SELECT] && + else if (mode_thrs[COMPOUND_REFERENCE] > mode_thrs[SINGLE_REFERENCE] && + mode_thrs[COMPOUND_REFERENCE] > + mode_thrs[REFERENCE_MODE_SELECT] && check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100) cm->reference_mode = COMPOUND_REFERENCE; - else if (mode_thresh[SINGLE_REFERENCE] > mode_thresh[REFERENCE_MODE_SELECT]) + else if (mode_thrs[SINGLE_REFERENCE] > mode_thrs[REFERENCE_MODE_SELECT]) cm->reference_mode = SINGLE_REFERENCE; else cm->reference_mode = REFERENCE_MODE_SELECT; - if (cm->interp_filter == SWITCHABLE) { - if (frame_type != ALTREF_FRAME && - filter_thresh[EIGHTTAP_SMOOTH] > filter_thresh[EIGHTTAP] && - filter_thresh[EIGHTTAP_SMOOTH] > filter_thresh[EIGHTTAP_SHARP] && - filter_thresh[EIGHTTAP_SMOOTH] > filter_thresh[SWITCHABLE - 1]) { - cm->interp_filter = EIGHTTAP_SMOOTH; - } else if (filter_thresh[EIGHTTAP_SHARP] > filter_thresh[EIGHTTAP] && - filter_thresh[EIGHTTAP_SHARP] > filter_thresh[SWITCHABLE - 1]) { - cm->interp_filter = EIGHTTAP_SHARP; - } else if (filter_thresh[EIGHTTAP] > filter_thresh[SWITCHABLE - 1]) { - cm->interp_filter = EIGHTTAP; - } - } + if (cm->interp_filter == SWITCHABLE) + cm->interp_filter = 
get_interp_filter(filter_thrs, is_alt_ref); encode_frame_internal(cpi); - for (i = 0; i < REFERENCE_MODES; ++i) { - const int diff = (int) (cpi->rd_comp_pred_diff[i] / cm->MBs); - cpi->rd_prediction_type_threshes[frame_type][i] += diff; - cpi->rd_prediction_type_threshes[frame_type][i] >>= 1; - } + for (i = 0; i < REFERENCE_MODES; ++i) + mode_thrs[i] = (mode_thrs[i] + rd_opt->comp_pred_diff[i] / cm->MBs) / 2; - for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { - const int64_t diff = cpi->rd_filter_diff[i] / cm->MBs; - cpi->rd_filter_threshes[frame_type][i] = - (cpi->rd_filter_threshes[frame_type][i] + diff) / 2; - } + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) + filter_thrs[i] = (filter_thrs[i] + rd_opt->filter_diff[i] / cm->MBs) / 2; for (i = 0; i < TX_MODES; ++i) { - int64_t pd = cpi->rd_tx_select_diff[i]; - int diff; + int64_t pd = rd_opt->tx_select_diff[i]; if (i == TX_MODE_SELECT) pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv, 2048 * (TX_SIZES - 1), 0); - diff = (int) (pd / cm->MBs); - cpi->rd_tx_select_threshes[frame_type][i] += diff; - cpi->rd_tx_select_threshes[frame_type][i] /= 2; + tx_thrs[i] = (tx_thrs[i] + (int)(pd / cm->MBs)) / 2; } if (cm->reference_mode == REFERENCE_MODE_SELECT) { @@ -3494,28 +3480,27 @@ void vp9_encode_frame(VP9_COMP *cpi) { if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 && count32x32 == 0) { cm->tx_mode = ALLOW_8X8; - reset_skip_txfm_size(cm, TX_8X8); + reset_skip_tx_size(cm, TX_8X8); } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 && count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) { cm->tx_mode = ONLY_4X4; - reset_skip_txfm_size(cm, TX_4X4); + reset_skip_tx_size(cm, TX_4X4); } else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) { cm->tx_mode = ALLOW_32X32; } else if (count32x32 == 0 && count8x8_lp == 0 && count4x4 == 0) { cm->tx_mode = ALLOW_16X16; - reset_skip_txfm_size(cm, TX_16X16); + reset_skip_tx_size(cm, TX_16X16); } } } else { cm->reference_mode = SINGLE_REFERENCE; - 
cm->interp_filter = SWITCHABLE; encode_frame_internal(cpi); } } static void sum_intra_stats(FRAME_COUNTS *counts, const MODE_INFO *mi) { - const MB_PREDICTION_MODE y_mode = mi->mbmi.mode; - const MB_PREDICTION_MODE uv_mode = mi->mbmi.uv_mode; + const PREDICTION_MODE y_mode = mi->mbmi.mode; + const PREDICTION_MODE uv_mode = mi->mbmi.uv_mode; const BLOCK_SIZE bsize = mi->mbmi.sb_type; if (bsize < BLOCK_8X8) { @@ -3532,24 +3517,6 @@ static void sum_intra_stats(FRAME_COUNTS *counts, const MODE_INFO *mi) { ++counts->uv_mode[y_mode][uv_mode]; } -// Experimental stub function to create a per MB zbin adjustment based on -// some previously calculated measure of MB activity. -static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x) { -#if USE_ACT_INDEX - x->act_zbin_adj = *(x->mb_activity_ptr); -#else - // Apply the masking to the RD multiplier. - const int64_t act = *(x->mb_activity_ptr); - const int64_t a = act + 4 * cpi->activity_avg; - const int64_t b = 4 * act + cpi->activity_avg; - - if (act > cpi->activity_avg) - x->act_zbin_adj = (int) (((int64_t) b + (a >> 1)) / a) - 1; - else - x->act_zbin_adj = 1 - (int) (((int64_t) a + (b >> 1)) / b); -#endif -} - static int get_zbin_mode_boost(const MB_MODE_INFO *mbmi, int enabled) { if (enabled) { if (is_inter_block(mbmi)) { @@ -3569,24 +3536,28 @@ static int get_zbin_mode_boost(const MB_MODE_INFO *mbmi, int enabled) { } static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, - int mi_row, int mi_col, BLOCK_SIZE bsize) { + int mi_row, int mi_col, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; MODE_INFO **mi_8x8 = xd->mi; MODE_INFO *mi = mi_8x8[0]; MB_MODE_INFO *mbmi = &mi->mbmi; - PICK_MODE_CONTEXT *ctx = get_block_context(x, bsize); - unsigned int segment_id = mbmi->segment_id; + const int seg_skip = vp9_segfeature_active(&cm->seg, mbmi->segment_id, + SEG_LVL_SKIP); const int mis = cm->mi_stride; 
const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; - x->skip_recode = !x->select_txfm_size && mbmi->sb_type >= BLOCK_8X8 && + x->skip_recode = !x->select_tx_size && mbmi->sb_type >= BLOCK_8X8 && cpi->oxcf.aq_mode != COMPLEXITY_AQ && cpi->oxcf.aq_mode != CYCLIC_REFRESH_AQ && cpi->sf.allow_skip_recode; + if (!x->skip_recode && !cpi->sf.use_nonrd_pick_mode) + vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm)); + x->skip_optimize = ctx->is_coded; ctx->is_coded = 1; x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct; @@ -3596,25 +3567,13 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, if (x->skip_encode) return; - if (cm->frame_type == KEY_FRAME) { - if (cpi->oxcf.tuning == VP8_TUNE_SSIM) { - adjust_act_zbin(cpi, x); - vp9_update_zbin_extra(cpi, x); - } - } else { - set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); - if (cpi->oxcf.tuning == VP8_TUNE_SSIM) { - // Adjust the zbin based on this MB rate. - adjust_act_zbin(cpi, x); - } - - // Experimental code. Special case for gf and arf zeromv modes. - // Increase zbin size to suppress noise - cpi->zbin_mode_boost = get_zbin_mode_boost(mbmi, - cpi->zbin_mode_boost_enabled); - vp9_update_zbin_extra(cpi, x); - } + // Experimental code. Special case for gf and arf zeromv modes. 
+ // Increase zbin size to suppress noise + cpi->zbin_mode_boost = get_zbin_mode_boost(mbmi, + cpi->zbin_mode_boost_enabled); + vp9_update_zbin_extra(cpi, x); if (!is_inter_block(mbmi)) { int plane; @@ -3633,7 +3592,10 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, vp9_setup_pre_planes(xd, ref, cfg, mi_row, mi_col, &xd->block_refs[ref]->sf); } - vp9_build_inter_predictors_sb(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8)); + if (!cpi->sf.reuse_inter_pred_sby || seg_skip) + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8)); + + vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8)); if (!x->skip) { mbmi->skip = 1; @@ -3641,7 +3603,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8)); } else { mbmi->skip = 1; - if (output_enabled) + if (output_enabled && !seg_skip) cm->counts.skip[vp9_get_skip_context(xd)][1]++; reset_skip_context(xd, MAX(bsize, BLOCK_8X8)); } @@ -3650,9 +3612,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, if (output_enabled) { if (cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8 && - !(is_inter_block(mbmi) && - (mbmi->skip || - vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)))) { + !(is_inter_block(mbmi) && (mbmi->skip || seg_skip))) { ++get_tx_counts(max_txsize_lookup[bsize], vp9_get_tx_size_context(xd), &cm->counts.tx)[mbmi->tx_size]; } else { diff --git a/libvpx/vp9/encoder/vp9_encodeframe.h b/libvpx/vp9/encoder/vp9_encodeframe.h index 131e93201..fd1c9aa64 100644 --- a/libvpx/vp9/encoder/vp9_encodeframe.h +++ b/libvpx/vp9/encoder/vp9_encodeframe.h @@ -20,11 +20,12 @@ struct macroblock; struct yv12_buffer_config; struct VP9_COMP; -typedef struct { - unsigned int sse; - int sum; - unsigned int var; -} diff; +// Constants used in SOURCE_VAR_BASED_PARTITION +#define VAR_HIST_MAX_BG_VAR 1000 +#define VAR_HIST_FACTOR 10 
+#define VAR_HIST_BINS (VAR_HIST_MAX_BG_VAR / VAR_HIST_FACTOR + 1) +#define VAR_HIST_LARGE_CUT_OFF 75 +#define VAR_HIST_SMALL_CUT_OFF 45 void vp9_setup_src_planes(struct macroblock *x, const struct yv12_buffer_config *src, diff --git a/libvpx/vp9/encoder/vp9_encodemb.c b/libvpx/vp9/encoder/vp9_encodemb.c index 5e98e4e3f..8a737e18e 100644 --- a/libvpx/vp9/encoder/vp9_encodemb.c +++ b/libvpx/vp9/encoder/vp9_encodemb.c @@ -21,7 +21,7 @@ #include "vp9/encoder/vp9_encodemb.h" #include "vp9/encoder/vp9_quantize.h" -#include "vp9/encoder/vp9_rdopt.h" +#include "vp9/encoder/vp9_rd.h" #include "vp9/encoder/vp9_tokenize.h" struct optimize_ctx { @@ -32,7 +32,7 @@ struct optimize_ctx { struct encode_b_args { MACROBLOCK *x; struct optimize_ctx *ctx; - unsigned char *skip; + int8_t *skip; }; void vp9_subtract_block_c(int rows, int cols, @@ -63,24 +63,17 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { } #define RDTRUNC(RM, DM, R, D) ((128 + (R) * (RM)) & 0xFF) -typedef struct vp9_token_state vp9_token_state; -struct vp9_token_state { +typedef struct vp9_token_state { int rate; int error; int next; signed char token; short qc; -}; +} vp9_token_state; // TODO(jimbankoski): experiment to find optimal RD numbers. 
-#define Y1_RD_MULT 4 -#define UV_RD_MULT 2 - -static const int plane_rd_mult[4] = { - Y1_RD_MULT, - UV_RD_MULT, -}; +static const int plane_rd_mult[PLANE_TYPES] = { 4, 2 }; #define UPDATE_RD_COST()\ {\ @@ -105,60 +98,56 @@ static int trellis_get_coeff_context(const int16_t *scan, return pt; } -static void optimize_b(int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, MACROBLOCK *mb, - ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) { +static int optimize_b(MACROBLOCK *mb, int plane, int block, + TX_SIZE tx_size, int ctx) { MACROBLOCKD *const xd = &mb->e_mbd; - struct macroblock_plane *p = &mb->plane[plane]; - struct macroblockd_plane *pd = &xd->plane[plane]; + struct macroblock_plane *const p = &mb->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; const int ref = is_inter_block(&xd->mi[0]->mbmi); vp9_token_state tokens[1025][2]; unsigned best_index[1025][2]; - const int16_t *coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block); - int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); - int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - int eob = p->eobs[block], final_eob, sz = 0; - const int i0 = 0; - int rc, x, next, i; - int64_t rdmult, rddiv, rd_cost0, rd_cost1; - int rate0, rate1, error0, error1, t0, t1; - int best, band, pt; - PLANE_TYPE type = pd->plane_type; - int err_mult = plane_rd_mult[type]; + uint8_t token_cache[1024]; + const int16_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block); + int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + const int eob = p->eobs[block]; + const PLANE_TYPE type = pd->plane_type; const int default_eob = 16 << (tx_size << 1); const int mul = 1 + (tx_size == TX_32X32); - uint8_t token_cache[1024]; const int16_t *dequant_ptr = pd->dequant; const uint8_t *const band_translate = get_band_translate(tx_size); - const scan_order *so = get_scan(xd, tx_size, type, block); - const int16_t *scan = so->scan; - const int16_t *nb = 
so->neighbors; + const scan_order *const so = get_scan(xd, tx_size, type, block); + const int16_t *const scan = so->scan; + const int16_t *const nb = so->neighbors; + int next = eob, sz = 0; + int64_t rdmult = mb->rdmult * plane_rd_mult[type], rddiv = mb->rddiv; + int64_t rd_cost0, rd_cost1; + int rate0, rate1, error0, error1, t0, t1; + int best, band, pt, i, final_eob; assert((!type && !plane) || (type && plane)); assert(eob <= default_eob); /* Now set up a Viterbi trellis to evaluate alternative roundings. */ - rdmult = mb->rdmult * err_mult; - if (!is_inter_block(&mb->e_mbd.mi[0]->mbmi)) + if (!ref) rdmult = (rdmult * 9) >> 4; - rddiv = mb->rddiv; + /* Initialize the sentinel node of the trellis. */ tokens[eob][0].rate = 0; tokens[eob][0].error = 0; tokens[eob][0].next = default_eob; tokens[eob][0].token = EOB_TOKEN; tokens[eob][0].qc = 0; - *(tokens[eob] + 1) = *(tokens[eob] + 0); - next = eob; + tokens[eob][1] = tokens[eob][0]; + for (i = 0; i < eob; i++) - token_cache[scan[i]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[ - qcoeff[scan[i]]].token]; + token_cache[scan[i]] = + vp9_pt_energy_class[vp9_dct_value_tokens_ptr[qcoeff[scan[i]]].token]; - for (i = eob; i-- > i0;) { + for (i = eob; i-- > 0;) { int base_bits, d2, dx; - - rc = scan[i]; - x = qcoeff[rc]; + const int rc = scan[i]; + int x = qcoeff[rc]; /* Only add a trellis state for non-zero coefficients. 
*/ if (x) { int shortcut = 0; @@ -172,17 +161,15 @@ static void optimize_b(int plane, int block, BLOCK_SIZE plane_bsize, if (next < default_eob) { band = band_translate[i + 1]; pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); - rate0 += - mb->token_costs[tx_size][type][ref][band][0][pt] - [tokens[next][0].token]; - rate1 += - mb->token_costs[tx_size][type][ref][band][0][pt] - [tokens[next][1].token]; + rate0 += mb->token_costs[tx_size][type][ref][band][0][pt] + [tokens[next][0].token]; + rate1 += mb->token_costs[tx_size][type][ref][band][0][pt] + [tokens[next][1].token]; } UPDATE_RD_COST(); /* And pick the best. */ best = rd_cost1 < rd_cost0; - base_bits = *(vp9_dct_value_cost_ptr + x); + base_bits = vp9_dct_value_cost_ptr[x]; dx = mul * (dqcoeff[rc] - coeff[rc]); d2 = dx * dx; tokens[i][0].rate = base_bits + (best ? rate1 : rate0); @@ -196,9 +183,9 @@ static void optimize_b(int plane, int block, BLOCK_SIZE plane_bsize, rate0 = tokens[next][0].rate; rate1 = tokens[next][1].rate; - if ((abs(x)*dequant_ptr[rc != 0] > abs(coeff[rc]) * mul) && - (abs(x)*dequant_ptr[rc != 0] < abs(coeff[rc]) * mul + - dequant_ptr[rc != 0])) + if ((abs(x) * dequant_ptr[rc != 0] > abs(coeff[rc]) * mul) && + (abs(x) * dequant_ptr[rc != 0] < abs(coeff[rc]) * mul + + dequant_ptr[rc != 0])) shortcut = 1; else shortcut = 0; @@ -235,7 +222,7 @@ static void optimize_b(int plane, int block, BLOCK_SIZE plane_bsize, UPDATE_RD_COST(); /* And pick the best. */ best = rd_cost1 < rd_cost0; - base_bits = *(vp9_dct_value_cost_ptr + x); + base_bits = vp9_dct_value_cost_ptr[x]; if (shortcut) { dx -= (dequant_ptr[rc != 0] + sz) ^ sz; @@ -274,26 +261,26 @@ static void optimize_b(int plane, int block, BLOCK_SIZE plane_bsize, /* Now pick the best path through the whole trellis. 
*/ band = band_translate[i + 1]; - pt = combine_entropy_contexts(*a, *l); rate0 = tokens[next][0].rate; rate1 = tokens[next][1].rate; error0 = tokens[next][0].error; error1 = tokens[next][1].error; t0 = tokens[next][0].token; t1 = tokens[next][1].token; - rate0 += mb->token_costs[tx_size][type][ref][band][0][pt][t0]; - rate1 += mb->token_costs[tx_size][type][ref][band][0][pt][t1]; + rate0 += mb->token_costs[tx_size][type][ref][band][0][ctx][t0]; + rate1 += mb->token_costs[tx_size][type][ref][band][0][ctx][t1]; UPDATE_RD_COST(); best = rd_cost1 < rd_cost0; - final_eob = i0 - 1; + final_eob = -1; vpx_memset(qcoeff, 0, sizeof(*qcoeff) * (16 << (tx_size * 2))); vpx_memset(dqcoeff, 0, sizeof(*dqcoeff) * (16 << (tx_size * 2))); for (i = next; i < eob; i = next) { - x = tokens[i][best].qc; + const int x = tokens[i][best].qc; + const int rc = scan[i]; if (x) { final_eob = i; } - rc = scan[i]; + qcoeff[rc] = x; dqcoeff[rc] = (x * dequant_ptr[rc != 0]) / mul; @@ -303,7 +290,7 @@ static void optimize_b(int plane, int block, BLOCK_SIZE plane_bsize, final_eob++; mb->plane[plane].eobs[block] = final_eob; - *a = *l = (final_eob > 0); + return final_eob; } static INLINE void fdct32x32(int rd_transform, @@ -314,6 +301,104 @@ static INLINE void fdct32x32(int rd_transform, vp9_fdct32x32(src, dst, src_stride); } +void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; + int16_t *const coeff = BLOCK_OFFSET(p->coeff, block); + int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint16_t *const eob = &p->eobs[block]; + const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + int i, j; + const int16_t *src_diff; + 
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j); + src_diff = &p->src_diff[4 * (j * diff_stride + i)]; + + switch (tx_size) { + case TX_32X32: + fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); + vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp, + p->quant_fp, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan_order->scan, + scan_order->iscan); + break; + case TX_16X16: + vp9_fdct16x16(src_diff, coeff, diff_stride); + vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp, + p->quant_fp, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, + scan_order->scan, scan_order->iscan); + break; + case TX_8X8: + vp9_fdct8x8(src_diff, coeff, diff_stride); + vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp, + p->quant_fp, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, + scan_order->scan, scan_order->iscan); + break; + case TX_4X4: + x->fwd_txm4x4(src_diff, coeff, diff_stride); + vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp, + p->quant_fp, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, + scan_order->scan, scan_order->iscan); + break; + default: + assert(0); + break; + } +} + +void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + int16_t *const coeff = BLOCK_OFFSET(p->coeff, block); + int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint16_t *const eob = &p->eobs[block]; + const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + int i, j; + const int16_t *src_diff; + + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j); + src_diff = &p->src_diff[4 * (j * diff_stride + i)]; + + switch (tx_size) { + case 
TX_32X32: + vp9_fdct32x32_1(src_diff, coeff, diff_stride); + vp9_quantize_dc_32x32(coeff, x->skip_block, p->round, + p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); + break; + case TX_16X16: + vp9_fdct16x16_1(src_diff, coeff, diff_stride); + vp9_quantize_dc(coeff, x->skip_block, p->round, + p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); + break; + case TX_8X8: + vp9_fdct8x8_1(src_diff, coeff, diff_stride); + vp9_quantize_dc(coeff, x->skip_block, p->round, + p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); + break; + case TX_4X4: + x->fwd_txm4x4(src_diff, coeff, diff_stride); + vp9_quantize_dc(coeff, x->skip_block, p->round, + p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); + break; + default: + assert(0); + break; + } +} + void vp9_xform_quant(MACROBLOCK *x, int plane, int block, BLOCK_SIZE plane_bsize, TX_SIZE tx_size) { MACROBLOCKD *const xd = &x->e_mbd; @@ -361,6 +446,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, break; default: assert(0); + break; } } @@ -389,11 +475,27 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, return; } - if (!x->skip_recode) - vp9_xform_quant(x, plane, block, plane_bsize, tx_size); + if (!x->skip_recode) { + if (x->skip_txfm[plane] == 0) { + // full forward transform and quantization + if (x->quant_fp) + vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size); + else + vp9_xform_quant(x, plane, block, plane_bsize, tx_size); + } else if (x->skip_txfm[plane] == 2) { + // fast path forward transform and quantization + vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size); + } else { + // skip forward transform + p->eobs[block] = 0; + *a = *l = 0; + return; + } + } if (x->optimize && (!x->skip_recode || !x->skip_optimize)) { - optimize_b(plane, block, plane_bsize, tx_size, x, a, l); + const int ctx = combine_entropy_contexts(*a, *l); + *a = *l = optimize_b(x, plane, block, tx_size, ctx) > 0; } else { *a = *l = p->eobs[block] > 0; } @@ -418,10 +520,11 @@ 
static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. - xd->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); break; default: assert(0 && "Invalid transform size"); + break; } } @@ -440,7 +543,7 @@ static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize, vp9_xform_quant(x, plane, block, plane_bsize, tx_size); if (p->eobs[block] > 0) - xd->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); } void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) { @@ -462,7 +565,7 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) { if (x->optimize && (!x->skip_recode || !x->skip_optimize)) { const struct macroblockd_plane* const pd = &xd->plane[plane]; - const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->tx_size; + const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size; vp9_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]); } @@ -485,7 +588,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); const scan_order *scan_order; TX_TYPE tx_type; - MB_PREDICTION_MODE mode; + PREDICTION_MODE mode; const int bwl = b_width_log2(plane_bsize); const int diff_stride = 4 * (1 << bwl); uint8_t *src, *dst; @@ -586,13 +689,14 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. 
- xd->itxm_add(dqcoeff, dst, dst_stride, *eob); + x->itxm_add(dqcoeff, dst, dst_stride, *eob); else vp9_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type); } break; default: assert(0); + break; } if (*eob) *(args->skip) = 0; @@ -600,7 +704,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, - unsigned char *skip) { + int8_t *skip) { struct encode_b_args arg = {x, NULL, skip}; encode_block_intra(plane, block, plane_bsize, tx_size, &arg); } @@ -613,15 +717,3 @@ void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block_intra, &arg); } - -int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred) { - MB_MODE_INFO * mbmi = &x->e_mbd.mi[0]->mbmi; - x->skip_encode = 0; - mbmi->mode = DC_PRED; - mbmi->ref_frame[0] = INTRA_FRAME; - mbmi->tx_size = use_16x16_pred ? (mbmi->sb_type >= BLOCK_16X16 ? 
TX_16X16 - : TX_8X8) - : TX_4X4; - vp9_encode_intra_block_plane(x, mbmi->sb_type, 0); - return vp9_get_mb_ss(x->plane[0].src_diff); -} diff --git a/libvpx/vp9/encoder/vp9_encodemb.h b/libvpx/vp9/encoder/vp9_encodemb.h index dcf6e8759..199971865 100644 --- a/libvpx/vp9/encoder/vp9_encodemb.h +++ b/libvpx/vp9/encoder/vp9_encodemb.h @@ -13,7 +13,7 @@ #include "./vpx_config.h" #include "vp9/encoder/vp9_block.h" -#include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/encoder/vp9_encoder.h" #include "vp9/common/vp9_onyxc_int.h" #ifdef __cplusplus @@ -22,7 +22,10 @@ extern "C" { void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize); void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize); - +void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size); +void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size); void vp9_xform_quant(MACROBLOCK *x, int plane, int block, BLOCK_SIZE plane_bsize, TX_SIZE tx_size); @@ -30,12 +33,10 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane); void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, - unsigned char *skip); + int8_t *skip); void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane); -int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred); - #ifdef __cplusplus } // extern "C" #endif diff --git a/libvpx/vp9/encoder/vp9_encodemv.c b/libvpx/vp9/encoder/vp9_encodemv.c index 9d4486511..9ad6db05d 100644 --- a/libvpx/vp9/encoder/vp9_encodemv.c +++ b/libvpx/vp9/encoder/vp9_encodemv.c @@ -216,7 +216,7 @@ void vp9_encode_mv(VP9_COMP* cpi, vp9_writer* w, // If auto_mv_step_size is enabled then keep track of the largest // motion vector component used. 
- if (!cpi->dummy_packing && cpi->sf.auto_mv_step_size) { + if (!cpi->dummy_packing && cpi->sf.mv.auto_mv_step_size) { unsigned int maxv = MAX(abs(mv->row), abs(mv->col)) >> 3; cpi->max_mv_magnitude = MAX(maxv, cpi->max_mv_magnitude); } diff --git a/libvpx/vp9/encoder/vp9_encodemv.h b/libvpx/vp9/encoder/vp9_encodemv.h index 50cb9611b..e67f9e3b0 100644 --- a/libvpx/vp9/encoder/vp9_encodemv.h +++ b/libvpx/vp9/encoder/vp9_encodemv.h @@ -12,7 +12,7 @@ #ifndef VP9_ENCODER_VP9_ENCODEMV_H_ #define VP9_ENCODER_VP9_ENCODEMV_H_ -#include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/encoder/vp9_encoder.h" #ifdef __cplusplus extern "C" { diff --git a/libvpx/vp9/encoder/vp9_onyx_if.c b/libvpx/vp9/encoder/vp9_encoder.c index 3619ec89e..524744cd9 100644 --- a/libvpx/vp9/encoder/vp9_onyx_if.c +++ b/libvpx/vp9/encoder/vp9_encoder.c @@ -31,14 +31,15 @@ #include "vp9/encoder/vp9_aq_cyclicrefresh.h" #include "vp9/encoder/vp9_aq_variance.h" #include "vp9/encoder/vp9_bitstream.h" +#include "vp9/encoder/vp9_context_tree.h" #include "vp9/encoder/vp9_encodeframe.h" #include "vp9/encoder/vp9_encodemv.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/encoder/vp9_mbgraph.h" -#include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_picklpf.h" #include "vp9/encoder/vp9_ratectrl.h" -#include "vp9/encoder/vp9_rdopt.h" +#include "vp9/encoder/vp9_rd.h" #include "vp9/encoder/vp9_segmentation.h" #include "vp9/encoder/vp9_speed_features.h" #if CONFIG_INTERNAL_STATS @@ -50,8 +51,6 @@ void vp9_coef_tree_initialize(); -#define DEFAULT_INTERP_FILTER SWITCHABLE - #define SHARP_FILTER_QTHRESH 0 /* Q threshold for 8-tap sharp filter */ #define ALTREF_HIGH_PRECISION_MV 1 // Whether to use high precision mv @@ -61,15 +60,10 @@ void vp9_coef_tree_initialize(); // now so that HIGH_PRECISION is always // chosen. 
-// Max rate target for 1080P and below encodes under normal circumstances -// (1920 * 1080 / (16 * 16)) * MAX_MB_RATE bits per MB -#define MAX_MB_RATE 250 -#define MAXRATE_1080P 2025000 - // #define OUTPUT_YUV_REC -#ifdef OUTPUT_YUV_SRC -FILE *yuv_file; +#ifdef OUTPUT_YUV_DENOISED +FILE *yuv_denoised_file = NULL; #endif #ifdef OUTPUT_YUV_REC FILE *yuv_rec_file; @@ -81,8 +75,6 @@ FILE *kf_list; FILE *keyfile; #endif -void vp9_init_quantizer(VP9_COMP *cpi); - static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) { switch (mode) { case NORMAL: @@ -109,7 +101,7 @@ static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) { } } -static void set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv) { +void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv) { MACROBLOCK *const mb = &cpi->mb; cpi->common.allow_high_precision_mv = allow_high_precision_mv; if (cpi->common.allow_high_precision_mv) { @@ -121,20 +113,27 @@ static void set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv) { } } -static void setup_key_frame(VP9_COMP *cpi) { - vp9_setup_past_independence(&cpi->common); - - // All buffers are implicitly updated on key frames. - cpi->refresh_golden_frame = 1; - cpi->refresh_alt_ref_frame = 1; -} - -static void setup_inter_frame(VP9_COMMON *cm) { - if (cm->error_resilient_mode || cm->intra_only) +static void setup_frame(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + // Set up entropy context depending on frame type. The decoder mandates + // the use of the default context, index 0, for keyframes and inter + // frames where the error_resilient_mode or intra_only flag is set. For + // other inter-frames the encoder currently uses only two contexts; + // context 1 for ALTREF frames and context 0 for the others. 
+ if (frame_is_intra_only(cm) || cm->error_resilient_mode) { vp9_setup_past_independence(cm); + } else { + if (!cpi->use_svc) + cm->frame_context_idx = cpi->refresh_alt_ref_frame; + } - assert(cm->frame_context_idx < FRAME_CONTEXTS); - cm->fc = cm->frame_contexts[cm->frame_context_idx]; + if (cm->frame_type == KEY_FRAME) { + if (!is_spatial_svc(cpi)) + cpi->refresh_golden_frame = 1; + cpi->refresh_alt_ref_frame = 1; + } else { + cm->fc = cm->frame_contexts[cm->frame_context_idx]; + } } void vp9_initialize_enc() { @@ -142,14 +141,13 @@ void vp9_initialize_enc() { if (!init_done) { vp9_init_neighbors(); - vp9_init_quant_tables(); - vp9_coef_tree_initialize(); vp9_tokenize_initialize(); vp9_init_me_luts(); vp9_rc_init_minq_luts(); vp9_entropy_mv_init(); vp9_entropy_mode_init(); + vp9_temporal_filter_init(); init_done = 1; } } @@ -172,10 +170,8 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { vp9_cyclic_refresh_free(cpi->cyclic_refresh); cpi->cyclic_refresh = NULL; - vpx_free(cpi->active_map); - cpi->active_map = NULL; - - vp9_free_frame_buffers(cm); + vp9_free_ref_frame_buffers(cm); + vp9_free_context_buffers(cm); vp9_free_frame_buffer(&cpi->last_frame_uf); vp9_free_frame_buffer(&cpi->scaled_source); @@ -186,11 +182,7 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { vpx_free(cpi->tok); cpi->tok = 0; - // Activity mask based per mb zbin adjustments - vpx_free(cpi->mb_activity_map); - cpi->mb_activity_map = 0; - vpx_free(cpi->mb_norm_activity_map); - cpi->mb_norm_activity_map = 0; + vp9_free_pc_tree(cpi); for (i = 0; i < cpi->svc.number_spatial_layers; ++i) { LAYER_CONTEXT *const lc = &cpi->svc.layer_context[i]; @@ -198,6 +190,17 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { lc->rc_twopass_stats_in.buf = NULL; lc->rc_twopass_stats_in.sz = 0; } + + if (cpi->source_diff_var != NULL) { + vpx_free(cpi->source_diff_var); + cpi->source_diff_var = NULL; + } + + for (i = 0; i < MAX_LAG_BUFFERS; ++i) { + vp9_free_frame_buffer(&cpi->svc.scaled_frames[i]); 
+ } + vpx_memset(&cpi->svc.scaled_frames[0], 0, + MAX_LAG_BUFFERS * sizeof(cpi->svc.scaled_frames[0])); } static void save_coding_context(VP9_COMP *cpi) { @@ -367,27 +370,6 @@ static void configure_static_seg_features(VP9_COMP *cpi) { } } -// DEBUG: Print out the segment id of each MB in the current frame. -static void print_seg_map(VP9_COMP *cpi) { - VP9_COMMON *cm = &cpi->common; - int row, col; - int map_index = 0; - FILE *statsfile = fopen("segmap.stt", "a"); - - fprintf(statsfile, "%10d\n", cm->current_video_frame); - - for (row = 0; row < cpi->common.mi_rows; row++) { - for (col = 0; col < cpi->common.mi_cols; col++) { - fprintf(statsfile, "%10d", cpi->segmentation_map[map_index]); - map_index++; - } - fprintf(statsfile, "\n"); - } - fprintf(statsfile, "\n"); - - fclose(statsfile); -} - static void update_reference_segmentation_map(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; MODE_INFO **mi_8x8_ptr = cm->mi_grid_visible; @@ -403,124 +385,7 @@ static void update_reference_segmentation_map(VP9_COMP *cpi) { cache_ptr += cm->mi_cols; } } -static int is_slowest_mode(int mode) { - return (mode == MODE_SECONDPASS_BEST || mode == MODE_BESTQUALITY); -} - -static void set_rd_speed_thresholds(VP9_COMP *cpi) { - int i; - // Set baseline threshold values - for (i = 0; i < MAX_MODES; ++i) - cpi->rd_thresh_mult[i] = is_slowest_mode(cpi->oxcf.mode) ? 
-500 : 0; - - cpi->rd_thresh_mult[THR_NEARESTMV] = 0; - cpi->rd_thresh_mult[THR_NEARESTG] = 0; - cpi->rd_thresh_mult[THR_NEARESTA] = 0; - - cpi->rd_thresh_mult[THR_DC] += 1000; - - cpi->rd_thresh_mult[THR_NEWMV] += 1000; - cpi->rd_thresh_mult[THR_NEWA] += 1000; - cpi->rd_thresh_mult[THR_NEWG] += 1000; - - cpi->rd_thresh_mult[THR_NEARMV] += 1000; - cpi->rd_thresh_mult[THR_NEARA] += 1000; - cpi->rd_thresh_mult[THR_COMP_NEARESTLA] += 1000; - cpi->rd_thresh_mult[THR_COMP_NEARESTGA] += 1000; - - cpi->rd_thresh_mult[THR_TM] += 1000; - - cpi->rd_thresh_mult[THR_COMP_NEARLA] += 1500; - cpi->rd_thresh_mult[THR_COMP_NEWLA] += 2000; - cpi->rd_thresh_mult[THR_NEARG] += 1000; - cpi->rd_thresh_mult[THR_COMP_NEARGA] += 1500; - cpi->rd_thresh_mult[THR_COMP_NEWGA] += 2000; - - cpi->rd_thresh_mult[THR_ZEROMV] += 2000; - cpi->rd_thresh_mult[THR_ZEROG] += 2000; - cpi->rd_thresh_mult[THR_ZEROA] += 2000; - cpi->rd_thresh_mult[THR_COMP_ZEROLA] += 2500; - cpi->rd_thresh_mult[THR_COMP_ZEROGA] += 2500; - - cpi->rd_thresh_mult[THR_H_PRED] += 2000; - cpi->rd_thresh_mult[THR_V_PRED] += 2000; - cpi->rd_thresh_mult[THR_D45_PRED ] += 2500; - cpi->rd_thresh_mult[THR_D135_PRED] += 2500; - cpi->rd_thresh_mult[THR_D117_PRED] += 2500; - cpi->rd_thresh_mult[THR_D153_PRED] += 2500; - cpi->rd_thresh_mult[THR_D207_PRED] += 2500; - cpi->rd_thresh_mult[THR_D63_PRED] += 2500; - - /* disable frame modes if flags not set */ - if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) { - cpi->rd_thresh_mult[THR_NEWMV ] = INT_MAX; - cpi->rd_thresh_mult[THR_NEARESTMV] = INT_MAX; - cpi->rd_thresh_mult[THR_ZEROMV ] = INT_MAX; - cpi->rd_thresh_mult[THR_NEARMV ] = INT_MAX; - } - if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) { - cpi->rd_thresh_mult[THR_NEARESTG ] = INT_MAX; - cpi->rd_thresh_mult[THR_ZEROG ] = INT_MAX; - cpi->rd_thresh_mult[THR_NEARG ] = INT_MAX; - cpi->rd_thresh_mult[THR_NEWG ] = INT_MAX; - } - if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) { - cpi->rd_thresh_mult[THR_NEARESTA ] = INT_MAX; - 
cpi->rd_thresh_mult[THR_ZEROA ] = INT_MAX; - cpi->rd_thresh_mult[THR_NEARA ] = INT_MAX; - cpi->rd_thresh_mult[THR_NEWA ] = INT_MAX; - } - - if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) != - (VP9_LAST_FLAG | VP9_ALT_FLAG)) { - cpi->rd_thresh_mult[THR_COMP_ZEROLA ] = INT_MAX; - cpi->rd_thresh_mult[THR_COMP_NEARESTLA] = INT_MAX; - cpi->rd_thresh_mult[THR_COMP_NEARLA ] = INT_MAX; - cpi->rd_thresh_mult[THR_COMP_NEWLA ] = INT_MAX; - } - if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) != - (VP9_GOLD_FLAG | VP9_ALT_FLAG)) { - cpi->rd_thresh_mult[THR_COMP_ZEROGA ] = INT_MAX; - cpi->rd_thresh_mult[THR_COMP_NEARESTGA] = INT_MAX; - cpi->rd_thresh_mult[THR_COMP_NEARGA ] = INT_MAX; - cpi->rd_thresh_mult[THR_COMP_NEWGA ] = INT_MAX; - } -} - -static void set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) { - const SPEED_FEATURES *const sf = &cpi->sf; - int i; - - for (i = 0; i < MAX_REFS; ++i) - cpi->rd_thresh_mult_sub8x8[i] = is_slowest_mode(cpi->oxcf.mode) ? -500 : 0; - - cpi->rd_thresh_mult_sub8x8[THR_LAST] += 2500; - cpi->rd_thresh_mult_sub8x8[THR_GOLD] += 2500; - cpi->rd_thresh_mult_sub8x8[THR_ALTR] += 2500; - cpi->rd_thresh_mult_sub8x8[THR_INTRA] += 2500; - cpi->rd_thresh_mult_sub8x8[THR_COMP_LA] += 4500; - cpi->rd_thresh_mult_sub8x8[THR_COMP_GA] += 4500; - - // Check for masked out split cases. 
- for (i = 0; i < MAX_REFS; i++) - if (sf->disable_split_mask & (1 << i)) - cpi->rd_thresh_mult_sub8x8[i] = INT_MAX; - - // disable mode test if frame flag is not set - if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) - cpi->rd_thresh_mult_sub8x8[THR_LAST] = INT_MAX; - if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) - cpi->rd_thresh_mult_sub8x8[THR_GOLD] = INT_MAX; - if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) - cpi->rd_thresh_mult_sub8x8[THR_ALTR] = INT_MAX; - if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) != - (VP9_LAST_FLAG | VP9_ALT_FLAG)) - cpi->rd_thresh_mult_sub8x8[THR_COMP_LA] = INT_MAX; - if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) != - (VP9_GOLD_FLAG | VP9_ALT_FLAG)) - cpi->rd_thresh_mult_sub8x8[THR_COMP_GA] = INT_MAX; -} static void set_speed_features(VP9_COMP *cpi) { #if CONFIG_INTERNAL_STATS @@ -532,18 +397,13 @@ static void set_speed_features(VP9_COMP *cpi) { vp9_set_speed_features(cpi); // Set rd thresholds based on mode and speed setting - set_rd_speed_thresholds(cpi); - set_rd_speed_thresholds_sub8x8(cpi); - - cpi->mb.fwd_txm4x4 = vp9_fdct4x4; - if (cpi->oxcf.lossless || cpi->mb.e_mbd.lossless) { - cpi->mb.fwd_txm4x4 = vp9_fwht4x4; - } + vp9_set_rd_speed_thresholds(cpi); + vp9_set_rd_speed_thresholds_sub8x8(cpi); } static void alloc_raw_frame_buffers(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; - const VP9_CONFIG *oxcf = &cpi->oxcf; + const VP9EncoderConfig *oxcf = &cpi->oxcf; cpi->lookahead = vp9_lookahead_init(oxcf->width, oxcf->height, cm->subsampling_x, cm->subsampling_y, @@ -560,163 +420,73 @@ static void alloc_raw_frame_buffers(VP9_COMP *cpi) { "Failed to allocate altref buffer"); } -void vp9_alloc_compressor_data(VP9_COMP *cpi) { - VP9_COMMON *cm = &cpi->common; - - if (vp9_alloc_frame_buffers(cm, cm->width, cm->height)) +static void alloc_ref_frame_buffers(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + if (vp9_alloc_ref_frame_buffers(cm, cm->width, cm->height)) vpx_internal_error(&cm->error, 
VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffers"); - - if (vp9_alloc_frame_buffer(&cpi->last_frame_uf, - cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, - VP9_ENC_BORDER_IN_PIXELS)) - vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, - "Failed to allocate last frame buffer"); - - if (vp9_alloc_frame_buffer(&cpi->scaled_source, - cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, - VP9_ENC_BORDER_IN_PIXELS)) - vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, - "Failed to allocate scaled source buffer"); - - if (vp9_alloc_frame_buffer(&cpi->scaled_last_source, - cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, - VP9_ENC_BORDER_IN_PIXELS)) - vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, - "Failed to allocate scaled last source buffer"); - - vpx_free(cpi->tok); - - { - unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols); - - CHECK_MEM_ERROR(cm, cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok))); - } - - vpx_free(cpi->mb_activity_map); - CHECK_MEM_ERROR(cm, cpi->mb_activity_map, - vpx_calloc(sizeof(unsigned int), - cm->mb_rows * cm->mb_cols)); - - vpx_free(cpi->mb_norm_activity_map); - CHECK_MEM_ERROR(cm, cpi->mb_norm_activity_map, - vpx_calloc(sizeof(unsigned int), - cm->mb_rows * cm->mb_cols)); } - -static void update_frame_size(VP9_COMP *cpi) { +static void alloc_util_frame_buffers(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &cpi->mb.e_mbd; - - vp9_update_frame_size(cm); - - // Update size of buffers local to this frame if (vp9_realloc_frame_buffer(&cpi->last_frame_uf, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, - "Failed to reallocate last frame buffer"); + "Failed to allocate last frame buffer"); if (vp9_realloc_frame_buffer(&cpi->scaled_source, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) 
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, - "Failed to reallocate scaled source buffer"); + "Failed to allocate scaled source buffer"); if (vp9_realloc_frame_buffer(&cpi->scaled_last_source, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, - "Failed to reallocate scaled last source buffer"); - - { - int y_stride = cpi->scaled_source.y_stride; - - if (cpi->sf.search_method == NSTEP) { - vp9_init3smotion_compensation(&cpi->mb, y_stride); - } else if (cpi->sf.search_method == DIAMOND) { - vp9_init_dsmotion_compensation(&cpi->mb, y_stride); - } - } - - init_macroblockd(cm, xd); + "Failed to allocate scaled last source buffer"); } -// Table that converts 0-63 Q range values passed in outside to the Qindex -// range used internally. -const int q_trans[] = { - 0, 4, 8, 12, 16, 20, 24, 28, - 32, 36, 40, 44, 48, 52, 56, 60, - 64, 68, 72, 76, 80, 84, 88, 92, - 96, 100, 104, 108, 112, 116, 120, 124, - 128, 132, 136, 140, 144, 148, 152, 156, - 160, 164, 168, 172, 176, 180, 184, 188, - 192, 196, 200, 204, 208, 212, 216, 220, - 224, 228, 232, 236, 240, 244, 249, 255, -}; - -int vp9_reverse_trans(int x) { - int i; - - for (i = 0; i < 64; i++) - if (q_trans[i] >= x) - return i; - - return 63; -}; - -void vp9_new_framerate(VP9_COMP *cpi, double framerate) { - VP9_COMMON *const cm = &cpi->common; - RATE_CONTROL *const rc = &cpi->rc; - VP9_CONFIG *const oxcf = &cpi->oxcf; - int vbr_max_bits; +void vp9_alloc_compressor_data(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; - oxcf->framerate = framerate < 0.1 ? 
30 : framerate; - cpi->output_framerate = cpi->oxcf.framerate; - rc->av_per_frame_bandwidth = (int)(oxcf->target_bandwidth / - cpi->output_framerate); - rc->min_frame_bandwidth = (int)(rc->av_per_frame_bandwidth * - oxcf->two_pass_vbrmin_section / 100); + vp9_alloc_context_buffers(cm, cm->width, cm->height); - rc->min_frame_bandwidth = MAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS); + vpx_free(cpi->tok); - // A maximum bitrate for a frame is defined. - // The baseline for this aligns with HW implementations that - // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits - // per 16x16 MB (averaged over a frame). However this limit is extended if - // a very high rate is given on the command line or the the rate cannnot - // be acheived because of a user specificed max q (e.g. when the user - // specifies lossless encode. - // - vbr_max_bits = (int)(((int64_t)rc->av_per_frame_bandwidth * - oxcf->two_pass_vbrmax_section) / 100); - rc->max_frame_bandwidth = MAX(MAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), - vbr_max_bits); + { + unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols); + CHECK_MEM_ERROR(cm, cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok))); + } - // Set Maximum gf/arf interval - rc->max_gf_interval = 16; + vp9_setup_pc_tree(&cpi->common, cpi); +} - // Extended interval for genuinely static scenes - rc->static_scene_max_gf_interval = cpi->key_frame_frequency >> 1; +static void update_frame_size(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->mb.e_mbd; - // Special conditions when alt ref frame enabled in lagged compress mode - if (oxcf->play_alternate && oxcf->lag_in_frames) { - if (rc->max_gf_interval > oxcf->lag_in_frames - 1) - rc->max_gf_interval = oxcf->lag_in_frames - 1; + vp9_set_mb_mi(cm, cm->width, cm->height); + vp9_init_context_buffers(cm); + init_macroblockd(cm, xd); - if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1) - rc->static_scene_max_gf_interval = 
oxcf->lag_in_frames - 1; + if (is_spatial_svc(cpi)) { + if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer, + cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, + VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to reallocate alt_ref_buffer"); } +} - if (rc->max_gf_interval > rc->static_scene_max_gf_interval) - rc->max_gf_interval = rc->static_scene_max_gf_interval; +void vp9_new_framerate(VP9_COMP *cpi, double framerate) { + cpi->oxcf.framerate = framerate < 0.1 ? 30 : framerate; + vp9_rc_update_framerate(cpi); } int64_t vp9_rescale(int64_t val, int64_t num, int denom) { @@ -738,19 +508,23 @@ static void set_tile_limits(VP9_COMP *cpi) { cm->log2_tile_rows = cpi->oxcf.tile_rows; } -static void init_config(struct VP9_COMP *cpi, VP9_CONFIG *oxcf) { +static void init_buffer_indices(VP9_COMP *cpi) { + cpi->lst_fb_idx = 0; + cpi->gld_fb_idx = 1; + cpi->alt_fb_idx = 2; +} + +static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) { VP9_COMMON *const cm = &cpi->common; - int i; cpi->oxcf = *oxcf; cm->profile = oxcf->profile; cm->bit_depth = oxcf->bit_depth; + cm->color_space = UNKNOWN; cm->width = oxcf->width; cm->height = oxcf->height; - cm->subsampling_x = 0; - cm->subsampling_y = 0; vp9_alloc_compressor_data(cpi); // Spatial scalability. 
@@ -759,9 +533,9 @@ static void init_config(struct VP9_COMP *cpi, VP9_CONFIG *oxcf) { cpi->svc.number_temporal_layers = oxcf->ts_number_layers; if ((cpi->svc.number_temporal_layers > 1 && - cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) || + cpi->oxcf.rc_mode == VPX_CBR) || (cpi->svc.number_spatial_layers > 1 && - cpi->oxcf.mode == MODE_SECONDPASS_BEST)) { + cpi->oxcf.mode == TWO_PASS_SECOND_BEST)) { vp9_init_layer_context(cpi); } @@ -769,19 +543,14 @@ static void init_config(struct VP9_COMP *cpi, VP9_CONFIG *oxcf) { vp9_change_config(cpi, oxcf); cpi->static_mb_pct = 0; + cpi->ref_frame_flags = 0; - cpi->lst_fb_idx = 0; - cpi->gld_fb_idx = 1; - cpi->alt_fb_idx = 2; + init_buffer_indices(cpi); set_tile_limits(cpi); - - cpi->fixed_divide[0] = 0; - for (i = 1; i < 512; i++) - cpi->fixed_divide[i] = 0x80000 / i; } -void vp9_change_config(struct VP9_COMP *cpi, const VP9_CONFIG *oxcf) { +void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; @@ -796,50 +565,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9_CONFIG *oxcf) { cpi->oxcf = *oxcf; - if (cpi->oxcf.cpu_used == -6) - cpi->oxcf.play_alternate = 0; - - switch (cpi->oxcf.mode) { - // Real time and one pass deprecated in test code base - case MODE_GOODQUALITY: - cpi->pass = 0; - cpi->oxcf.cpu_used = clamp(cpi->oxcf.cpu_used, -5, 5); - break; - - case MODE_BESTQUALITY: - cpi->pass = 0; - break; - - case MODE_FIRSTPASS: - cpi->pass = 1; - break; - - case MODE_SECONDPASS: - cpi->pass = 2; - cpi->oxcf.cpu_used = clamp(cpi->oxcf.cpu_used, -5, 5); - break; - - case MODE_SECONDPASS_BEST: - cpi->pass = 2; - break; - - case MODE_REALTIME: - cpi->pass = 0; - break; - } - - cpi->oxcf.lossless = oxcf->lossless; - if (cpi->oxcf.lossless) { - // In lossless mode, make sure right quantizer range and correct transform - // is set. 
- cpi->oxcf.worst_allowed_q = 0; - cpi->oxcf.best_allowed_q = 0; - cpi->mb.e_mbd.itxm_add = vp9_iwht4x4_add; - } else { - cpi->mb.e_mbd.itxm_add = vp9_idct4x4_add; - } rc->baseline_gf_interval = DEFAULT_GF_INTERVAL; - cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG; cpi->refresh_golden_frame = 0; cpi->refresh_last_frame = 1; @@ -847,7 +573,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9_CONFIG *oxcf) { cm->reset_frame_context = 0; vp9_reset_segment_features(&cm->seg); - set_high_precision_mv(cpi, 0); + vp9_set_high_precision_mv(cpi, 0); { int i; @@ -858,37 +584,31 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9_CONFIG *oxcf) { cpi->encode_breakout = cpi->oxcf.encode_breakout; // local file playback mode == really big buffer - if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK) { - cpi->oxcf.starting_buffer_level = 60000; - cpi->oxcf.optimal_buffer_level = 60000; - cpi->oxcf.maximum_buffer_size = 240000; + if (cpi->oxcf.rc_mode == VPX_VBR) { + cpi->oxcf.starting_buffer_level_ms = 60000; + cpi->oxcf.optimal_buffer_level_ms = 60000; + cpi->oxcf.maximum_buffer_size_ms = 240000; } - // Convert target bandwidth from Kbit/s to Bit/s - cpi->oxcf.target_bandwidth *= 1000; - - cpi->oxcf.starting_buffer_level = - vp9_rescale(cpi->oxcf.starting_buffer_level, - cpi->oxcf.target_bandwidth, 1000); + rc->starting_buffer_level = vp9_rescale(cpi->oxcf.starting_buffer_level_ms, + cpi->oxcf.target_bandwidth, 1000); // Set or reset optimal and maximum buffer levels. 
- if (cpi->oxcf.optimal_buffer_level == 0) - cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8; + if (cpi->oxcf.optimal_buffer_level_ms == 0) + rc->optimal_buffer_level = cpi->oxcf.target_bandwidth / 8; else - cpi->oxcf.optimal_buffer_level = - vp9_rescale(cpi->oxcf.optimal_buffer_level, - cpi->oxcf.target_bandwidth, 1000); + rc->optimal_buffer_level = vp9_rescale(cpi->oxcf.optimal_buffer_level_ms, + cpi->oxcf.target_bandwidth, 1000); - if (cpi->oxcf.maximum_buffer_size == 0) - cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8; + if (cpi->oxcf.maximum_buffer_size_ms == 0) + rc->maximum_buffer_size = cpi->oxcf.target_bandwidth / 8; else - cpi->oxcf.maximum_buffer_size = - vp9_rescale(cpi->oxcf.maximum_buffer_size, - cpi->oxcf.target_bandwidth, 1000); + rc->maximum_buffer_size = vp9_rescale(cpi->oxcf.maximum_buffer_size_ms, + cpi->oxcf.target_bandwidth, 1000); // Under a configuration change, where maximum_buffer_size may change, // keep buffer level clipped to the maximum allowed buffer size. - rc->bits_off_target = MIN(rc->bits_off_target, cpi->oxcf.maximum_buffer_size); - rc->buffer_level = MIN(rc->buffer_level, cpi->oxcf.maximum_buffer_size); + rc->bits_off_target = MIN(rc->bits_off_target, rc->maximum_buffer_size); + rc->buffer_level = MIN(rc->buffer_level, rc->maximum_buffer_size); // Set up frame rate and related parameters rate control values. 
vp9_new_framerate(cpi, cpi->oxcf.framerate); @@ -897,20 +617,11 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9_CONFIG *oxcf) { rc->worst_quality = cpi->oxcf.worst_allowed_q; rc->best_quality = cpi->oxcf.best_allowed_q; - // active values should only be modified if out of new range - - cpi->cq_target_quality = cpi->oxcf.cq_level; - - cm->interp_filter = DEFAULT_INTERP_FILTER; + cm->interp_filter = cpi->sf.default_interp_filter; cm->display_width = cpi->oxcf.width; cm->display_height = cpi->oxcf.height; - // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs) - cpi->oxcf.sharpness = MIN(7, cpi->oxcf.sharpness); - - cpi->common.lf.sharpness_level = cpi->oxcf.sharpness; - if (cpi->initial_width) { // Increasing the size of the frame beyond the first seen frame, or some // otherwise signaled maximum size, is not supported. @@ -921,23 +632,13 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9_CONFIG *oxcf) { update_frame_size(cpi); if ((cpi->svc.number_temporal_layers > 1 && - cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) || - (cpi->svc.number_spatial_layers > 1 && cpi->pass == 2)) { + cpi->oxcf.rc_mode == VPX_CBR) || + (cpi->svc.number_spatial_layers > 1 && cpi->oxcf.pass == 2)) { vp9_update_layer_context_change_config(cpi, (int)cpi->oxcf.target_bandwidth); } - cpi->speed = abs(cpi->oxcf.cpu_used); - - // Limit on lag buffers as these are not currently dynamically allocated. 
- if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS) - cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS; - -#if CONFIG_MULTIPLE_ARF - vp9_zero(cpi->alt_ref_source); -#else cpi->alt_ref_source = NULL; -#endif rc->is_src_frame_alt_ref = 0; #if 0 @@ -950,9 +651,19 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9_CONFIG *oxcf) { cpi->ext_refresh_frame_flags_pending = 0; cpi->ext_refresh_frame_context_pending = 0; + +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0) { + vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, + VP9_ENC_BORDER_IN_PIXELS); + } +#endif } +#ifndef M_LOG2_E #define M_LOG2_E 0.693147180559945309417 +#endif #define log2f(x) (log (x) / (float) M_LOG2_E) static void cal_nmvjointsadcost(int *mvjointsadcost) { @@ -992,127 +703,9 @@ static void cal_nmvsadcosts_hp(int *mvsadcost[2]) { } while (++i <= MV_MAX); } -static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk, - PICK_MODE_CONTEXT *ctx) { - int num_pix = num_4x4_blk << 4; - int i, k; - ctx->num_4x4_blk = num_4x4_blk; - - CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, - vpx_calloc(num_4x4_blk, sizeof(uint8_t))); - for (i = 0; i < MAX_MB_PLANE; ++i) { - for (k = 0; k < 3; ++k) { - CHECK_MEM_ERROR(cm, ctx->coeff[i][k], - vpx_memalign(16, num_pix * sizeof(int16_t))); - CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k], - vpx_memalign(16, num_pix * sizeof(int16_t))); - CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k], - vpx_memalign(16, num_pix * sizeof(int16_t))); - CHECK_MEM_ERROR(cm, ctx->eobs[i][k], - vpx_memalign(16, num_pix * sizeof(uint16_t))); - ctx->coeff_pbuf[i][k] = ctx->coeff[i][k]; - ctx->qcoeff_pbuf[i][k] = ctx->qcoeff[i][k]; - ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k]; - ctx->eobs_pbuf[i][k] = ctx->eobs[i][k]; - } - } -} - -static void free_mode_context(PICK_MODE_CONTEXT *ctx) { - int i, k; - vpx_free(ctx->zcoeff_blk); - ctx->zcoeff_blk = 0; - for (i = 0; i < MAX_MB_PLANE; ++i) { - for (k = 0; k < 3; ++k) { - vpx_free(ctx->coeff[i][k]); - 
ctx->coeff[i][k] = 0; - vpx_free(ctx->qcoeff[i][k]); - ctx->qcoeff[i][k] = 0; - vpx_free(ctx->dqcoeff[i][k]); - ctx->dqcoeff[i][k] = 0; - vpx_free(ctx->eobs[i][k]); - ctx->eobs[i][k] = 0; - } - } -} - -static void init_pick_mode_context(VP9_COMP *cpi) { - int i; - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - - for (i = 0; i < BLOCK_SIZES; ++i) { - const int num_4x4_w = num_4x4_blocks_wide_lookup[i]; - const int num_4x4_h = num_4x4_blocks_high_lookup[i]; - const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h); - if (i < BLOCK_16X16) { - for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) { - for (x->mb_index = 0; x->mb_index < 4; ++x->mb_index) { - for (x->b_index = 0; x->b_index < 16 / num_4x4_blk; ++x->b_index) { - PICK_MODE_CONTEXT *ctx = get_block_context(x, i); - alloc_mode_context(cm, num_4x4_blk, ctx); - } - } - } - } else if (i < BLOCK_32X32) { - for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) { - for (x->mb_index = 0; x->mb_index < 64 / num_4x4_blk; ++x->mb_index) { - PICK_MODE_CONTEXT *ctx = get_block_context(x, i); - ctx->num_4x4_blk = num_4x4_blk; - alloc_mode_context(cm, num_4x4_blk, ctx); - } - } - } else if (i < BLOCK_64X64) { - for (x->sb_index = 0; x->sb_index < 256 / num_4x4_blk; ++x->sb_index) { - PICK_MODE_CONTEXT *ctx = get_block_context(x, i); - ctx->num_4x4_blk = num_4x4_blk; - alloc_mode_context(cm, num_4x4_blk, ctx); - } - } else { - PICK_MODE_CONTEXT *ctx = get_block_context(x, i); - ctx->num_4x4_blk = num_4x4_blk; - alloc_mode_context(cm, num_4x4_blk, ctx); - } - } -} - -static void free_pick_mode_context(MACROBLOCK *x) { - int i; - for (i = 0; i < BLOCK_SIZES; ++i) { - const int num_4x4_w = num_4x4_blocks_wide_lookup[i]; - const int num_4x4_h = num_4x4_blocks_high_lookup[i]; - const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h); - if (i < BLOCK_16X16) { - for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) { - for (x->mb_index = 0; x->mb_index < 4; ++x->mb_index) { - for (x->b_index = 0; x->b_index 
< 16 / num_4x4_blk; ++x->b_index) { - PICK_MODE_CONTEXT *ctx = get_block_context(x, i); - free_mode_context(ctx); - } - } - } - } else if (i < BLOCK_32X32) { - for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) { - for (x->mb_index = 0; x->mb_index < 64 / num_4x4_blk; ++x->mb_index) { - PICK_MODE_CONTEXT *ctx = get_block_context(x, i); - free_mode_context(ctx); - } - } - } else if (i < BLOCK_64X64) { - for (x->sb_index = 0; x->sb_index < 256 / num_4x4_blk; ++x->sb_index) { - PICK_MODE_CONTEXT *ctx = get_block_context(x, i); - free_mode_context(ctx); - } - } else { - PICK_MODE_CONTEXT *ctx = get_block_context(x, i); - free_mode_context(ctx); - } - } -} - -VP9_COMP *vp9_create_compressor(VP9_CONFIG *oxcf) { - int i, j; +VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) { + unsigned int i, j; VP9_COMP *const cpi = vpx_memalign(32, sizeof(VP9_COMP)); VP9_COMMON *const cm = cpi != NULL ? &cpi->common : NULL; @@ -1129,26 +722,21 @@ VP9_COMP *vp9_create_compressor(VP9_CONFIG *oxcf) { cm->error.setjmp = 1; - CHECK_MEM_ERROR(cm, cpi->mb.ss, vpx_calloc(sizeof(search_site), - (MAX_MVSEARCH_STEPS * 8) + 1)); - vp9_rtcd(); cpi->use_svc = 0; init_config(cpi, oxcf); - vp9_rc_init(&cpi->oxcf, cpi->pass, &cpi->rc); - init_pick_mode_context(cpi); + vp9_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc); cm->current_video_frame = 0; - // Set reference frame sign bias for ALTREF frame to 1 (for now) - cm->ref_frame_sign_bias[ALTREF_FRAME] = 1; - cpi->gold_is_last = 0; cpi->alt_is_last = 0; cpi->gold_is_alt = 0; + cpi->skippable_frame = 0; + // Create the encoder segmentation map and set all entries to 0 CHECK_MEM_ERROR(cm, cpi->segmentation_map, vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); @@ -1166,10 +754,6 @@ VP9_COMP *vp9_create_compressor(VP9_CONFIG *oxcf) { CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy, vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); - CHECK_MEM_ERROR(cm, cpi->active_map, vpx_calloc(cm->MBs, 1)); - vpx_memset(cpi->active_map, 1, cm->MBs); - 
cpi->active_map_enabled = 0; - for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0])); i++) { CHECK_MEM_ERROR(cm, cpi->mbgraph_stats[i].mb_stats, @@ -1177,23 +761,37 @@ VP9_COMP *vp9_create_compressor(VP9_CONFIG *oxcf) { sizeof(*cpi->mbgraph_stats[i].mb_stats), 1)); } - /*Initialize the feed-forward activity masking.*/ - cpi->activity_avg = 90 << 12; - cpi->key_frame_frequency = cpi->oxcf.key_freq; - cpi->refresh_alt_ref_frame = 0; +#if CONFIG_FP_MB_STATS + cpi->use_fp_mb_stats = 0; + if (cpi->use_fp_mb_stats) { + // a place holder used to store the first pass mb stats in the first pass + CHECK_MEM_ERROR(cm, cpi->twopass.frame_mb_stats_buf, + vpx_calloc(cm->MBs * sizeof(uint8_t), 1)); + } else { + cpi->twopass.frame_mb_stats_buf = NULL; + } +#endif -#if CONFIG_MULTIPLE_ARF - // Turn multiple ARF usage on/off. This is a quick hack for the initial test - // version. It should eventually be set via the codec API. - cpi->multi_arf_enabled = 1; + cpi->refresh_alt_ref_frame = 0; - if (cpi->multi_arf_enabled) { - cpi->sequence_number = 0; - cpi->frame_coding_order_period = 0; - vp9_zero(cpi->frame_coding_order); - vp9_zero(cpi->arf_buffer_idx); + // Note that at the moment multi_arf will not work with svc. + // For the current check in all the execution paths are defaulted to 0 + // pending further tuning and testing. The code is left in place here + // as a place holder in regard to the required paths. + cpi->multi_arf_last_grp_enabled = 0; + if (oxcf->pass == 2) { + if (cpi->use_svc) { + cpi->multi_arf_allowed = 0; + cpi->multi_arf_enabled = 0; + } else { + // Disable by default for now. 
+ cpi->multi_arf_allowed = 0; + cpi->multi_arf_enabled = 0; + } + } else { + cpi->multi_arf_allowed = 0; + cpi->multi_arf_enabled = 0; } -#endif cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; #if CONFIG_INTERNAL_STATS @@ -1248,8 +846,10 @@ VP9_COMP *vp9_create_compressor(VP9_CONFIG *oxcf) { cpi->mb.nmvsadcost_hp[1] = &cpi->mb.nmvsadcosts_hp[1][MV_MAX]; cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp); -#ifdef OUTPUT_YUV_SRC - yuv_file = fopen("bd.yuv", "ab"); +#if CONFIG_VP9_TEMPORAL_DENOISING +#ifdef OUTPUT_YUV_DENOISED + yuv_denoised_file = fopen("denoised.yuv", "ab"); +#endif #endif #ifdef OUTPUT_YUV_REC yuv_rec_file = fopen("rec.yuv", "wb"); @@ -1264,9 +864,9 @@ VP9_COMP *vp9_create_compressor(VP9_CONFIG *oxcf) { cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; - if (cpi->pass == 1) { + if (oxcf->pass == 1) { vp9_init_first_pass(cpi); - } else if (cpi->pass == 2) { + } else if (oxcf->pass == 2) { const size_t packet_sz = sizeof(FIRSTPASS_STATS); const int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz); @@ -1308,6 +908,19 @@ VP9_COMP *vp9_create_compressor(VP9_CONFIG *oxcf) { vp9_init_second_pass_spatial_svc(cpi); } else { +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + const size_t psz = cpi->common.MBs * sizeof(uint8_t); + const int ps = (int)(oxcf->firstpass_mb_stats_in.sz / psz); + + cpi->twopass.firstpass_mb_stats.mb_stats_start = + oxcf->firstpass_mb_stats_in.buf; + cpi->twopass.firstpass_mb_stats.mb_stats_end = + cpi->twopass.firstpass_mb_stats.mb_stats_start + + (ps - 1) * cpi->common.MBs * sizeof(uint8_t); + } +#endif + cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf; cpi->twopass.stats_in = cpi->twopass.stats_in_start; cpi->twopass.stats_in_end = &cpi->twopass.stats_in[packets - 1]; @@ -1318,103 +931,85 @@ VP9_COMP *vp9_create_compressor(VP9_CONFIG *oxcf) { set_speed_features(cpi); + // Allocate memory to store variances for a frame. 
+ CHECK_MEM_ERROR(cm, cpi->source_diff_var, + vpx_calloc(cm->MBs, sizeof(diff))); + cpi->source_var_thresh = 0; + cpi->frames_till_next_var_check = 0; + // Default rd threshold factors for mode selection for (i = 0; i < BLOCK_SIZES; ++i) { for (j = 0; j < MAX_MODES; ++j) - cpi->rd_thresh_freq_fact[i][j] = 32; - for (j = 0; j < MAX_REFS; ++j) - cpi->rd_thresh_freq_sub8x8[i][j] = 32; + cpi->rd.thresh_freq_fact[i][j] = 32; } -#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, \ - SDX3F, SDX8F, SDX4DF)\ +#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF)\ cpi->fn_ptr[BT].sdf = SDF; \ cpi->fn_ptr[BT].sdaf = SDAF; \ cpi->fn_ptr[BT].vf = VF; \ cpi->fn_ptr[BT].svf = SVF; \ cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].svf_halfpix_h = SVFHH; \ - cpi->fn_ptr[BT].svf_halfpix_v = SVFHV; \ - cpi->fn_ptr[BT].svf_halfpix_hv = SVFHHV; \ cpi->fn_ptr[BT].sdx3f = SDX3F; \ cpi->fn_ptr[BT].sdx8f = SDX8F; \ cpi->fn_ptr[BT].sdx4df = SDX4DF; BFP(BLOCK_32X16, vp9_sad32x16, vp9_sad32x16_avg, vp9_variance32x16, vp9_sub_pixel_variance32x16, - vp9_sub_pixel_avg_variance32x16, NULL, NULL, - NULL, NULL, NULL, - vp9_sad32x16x4d) + vp9_sub_pixel_avg_variance32x16, NULL, NULL, vp9_sad32x16x4d) BFP(BLOCK_16X32, vp9_sad16x32, vp9_sad16x32_avg, vp9_variance16x32, vp9_sub_pixel_variance16x32, - vp9_sub_pixel_avg_variance16x32, NULL, NULL, - NULL, NULL, NULL, - vp9_sad16x32x4d) + vp9_sub_pixel_avg_variance16x32, NULL, NULL, vp9_sad16x32x4d) BFP(BLOCK_64X32, vp9_sad64x32, vp9_sad64x32_avg, vp9_variance64x32, vp9_sub_pixel_variance64x32, - vp9_sub_pixel_avg_variance64x32, NULL, NULL, - NULL, NULL, NULL, - vp9_sad64x32x4d) + vp9_sub_pixel_avg_variance64x32, NULL, NULL, vp9_sad64x32x4d) BFP(BLOCK_32X64, vp9_sad32x64, vp9_sad32x64_avg, vp9_variance32x64, vp9_sub_pixel_variance32x64, - vp9_sub_pixel_avg_variance32x64, NULL, NULL, - NULL, NULL, NULL, - vp9_sad32x64x4d) + vp9_sub_pixel_avg_variance32x64, NULL, NULL, vp9_sad32x64x4d) BFP(BLOCK_32X32, vp9_sad32x32, vp9_sad32x32_avg, 
vp9_variance32x32, vp9_sub_pixel_variance32x32, - vp9_sub_pixel_avg_variance32x32, vp9_variance_halfpixvar32x32_h, - vp9_variance_halfpixvar32x32_v, - vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8, + vp9_sub_pixel_avg_variance32x32, vp9_sad32x32x3, vp9_sad32x32x8, vp9_sad32x32x4d) BFP(BLOCK_64X64, vp9_sad64x64, vp9_sad64x64_avg, vp9_variance64x64, vp9_sub_pixel_variance64x64, - vp9_sub_pixel_avg_variance64x64, vp9_variance_halfpixvar64x64_h, - vp9_variance_halfpixvar64x64_v, - vp9_variance_halfpixvar64x64_hv, vp9_sad64x64x3, vp9_sad64x64x8, + vp9_sub_pixel_avg_variance64x64, vp9_sad64x64x3, vp9_sad64x64x8, vp9_sad64x64x4d) BFP(BLOCK_16X16, vp9_sad16x16, vp9_sad16x16_avg, vp9_variance16x16, vp9_sub_pixel_variance16x16, - vp9_sub_pixel_avg_variance16x16, vp9_variance_halfpixvar16x16_h, - vp9_variance_halfpixvar16x16_v, - vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8, + vp9_sub_pixel_avg_variance16x16, vp9_sad16x16x3, vp9_sad16x16x8, vp9_sad16x16x4d) BFP(BLOCK_16X8, vp9_sad16x8, vp9_sad16x8_avg, vp9_variance16x8, vp9_sub_pixel_variance16x8, - vp9_sub_pixel_avg_variance16x8, NULL, NULL, NULL, + vp9_sub_pixel_avg_variance16x8, vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d) BFP(BLOCK_8X16, vp9_sad8x16, vp9_sad8x16_avg, vp9_variance8x16, vp9_sub_pixel_variance8x16, - vp9_sub_pixel_avg_variance8x16, NULL, NULL, NULL, + vp9_sub_pixel_avg_variance8x16, vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d) BFP(BLOCK_8X8, vp9_sad8x8, vp9_sad8x8_avg, vp9_variance8x8, vp9_sub_pixel_variance8x8, - vp9_sub_pixel_avg_variance8x8, NULL, NULL, NULL, + vp9_sub_pixel_avg_variance8x8, vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d) BFP(BLOCK_8X4, vp9_sad8x4, vp9_sad8x4_avg, vp9_variance8x4, vp9_sub_pixel_variance8x4, - vp9_sub_pixel_avg_variance8x4, NULL, NULL, - NULL, NULL, vp9_sad8x4x8, - vp9_sad8x4x4d) + vp9_sub_pixel_avg_variance8x4, NULL, vp9_sad8x4x8, vp9_sad8x4x4d) BFP(BLOCK_4X8, vp9_sad4x8, vp9_sad4x8_avg, vp9_variance4x8, vp9_sub_pixel_variance4x8, - 
vp9_sub_pixel_avg_variance4x8, NULL, NULL, - NULL, NULL, vp9_sad4x8x8, - vp9_sad4x8x4d) + vp9_sub_pixel_avg_variance4x8, NULL, vp9_sad4x8x8, vp9_sad4x8x4d) BFP(BLOCK_4X4, vp9_sad4x4, vp9_sad4x4_avg, vp9_variance4x4, vp9_sub_pixel_variance4x4, - vp9_sub_pixel_avg_variance4x4, NULL, NULL, NULL, + vp9_sub_pixel_avg_variance4x4, vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d) cpi->full_search_sad = vp9_full_search_sad; @@ -1432,17 +1027,11 @@ VP9_COMP *vp9_create_compressor(VP9_CONFIG *oxcf) { cm->error.setjmp = 0; - vp9_zero(cpi->common.counts.uv_mode); - -#ifdef MODE_TEST_HIT_STATS - vp9_zero(cpi->mode_test_hits); -#endif - return cpi; } void vp9_remove_compressor(VP9_COMP *cpi) { - int i; + unsigned int i; if (!cpi) return; @@ -1453,7 +1042,7 @@ void vp9_remove_compressor(VP9_COMP *cpi) { vp9_clear_system_state(); // printf("\n8x8-4x4:%d-%d\n", cpi->t8x8_count, cpi->t4x4_count); - if (cpi->pass != 1) { + if (cpi->oxcf.pass != 1) { FILE *f = fopen("opsnr.stt", "a"); double time_encoded = (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) / 10000000.000; @@ -1496,34 +1085,6 @@ void vp9_remove_compressor(VP9_COMP *cpi) { #endif -#ifdef MODE_TEST_HIT_STATS - if (cpi->pass != 1) { - double norm_per_pixel_mode_tests = 0; - double norm_counts[BLOCK_SIZES]; - int i; - int sb64_per_frame; - int norm_factors[BLOCK_SIZES] = - {256, 128, 128, 64, 32, 32, 16, 8, 8, 4, 2, 2, 1}; - FILE *f = fopen("mode_hit_stats.stt", "a"); - - // On average, how many mode tests do we do - for (i = 0; i < BLOCK_SIZES; ++i) { - norm_counts[i] = (double)cpi->mode_test_hits[i] / - (double)norm_factors[i]; - norm_per_pixel_mode_tests += norm_counts[i]; - } - // Convert to a number per 64x64 and per frame - sb64_per_frame = ((cpi->common.height + 63) / 64) * - ((cpi->common.width + 63) / 64); - norm_per_pixel_mode_tests = - norm_per_pixel_mode_tests / - (double)(cpi->common.current_video_frame * sb64_per_frame); - - fprintf(f, "%6.4f\n", norm_per_pixel_mode_tests); - fclose(f); - } -#endif - #if 
0 { printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000); @@ -1536,9 +1097,13 @@ void vp9_remove_compressor(VP9_COMP *cpi) { #endif } - free_pick_mode_context(&cpi->mb); +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0) { + vp9_denoiser_free(&(cpi->denoiser)); + } +#endif + dealloc_compressor_data(cpi); - vpx_free(cpi->mb.ss); vpx_free(cpi->tok); for (i = 0; i < sizeof(cpi->mbgraph_stats) / @@ -1546,11 +1111,20 @@ void vp9_remove_compressor(VP9_COMP *cpi) { vpx_free(cpi->mbgraph_stats[i].mb_stats); } +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + vpx_free(cpi->twopass.frame_mb_stats_buf); + cpi->twopass.frame_mb_stats_buf = NULL; + } +#endif + vp9_remove_common(&cpi->common); vpx_free(cpi); -#ifdef OUTPUT_YUV_SRC - fclose(yuv_file); +#if CONFIG_VP9_TEMPORAL_DENOISING +#ifdef OUTPUT_YUV_DENOISED + fclose(yuv_denoised_file); +#endif #endif #ifdef OUTPUT_YUV_REC fclose(yuv_rec_file); @@ -1702,16 +1276,6 @@ int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, } } -int vp9_get_reference_enc(VP9_COMP *cpi, int index, YV12_BUFFER_CONFIG **fb) { - VP9_COMMON *cm = &cpi->common; - - if (index < 0 || index >= REF_FRAMES) - return -1; - - *fb = &cm->frame_bufs[cm->ref_frame_map[index]].buf; - return 0; -} - int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) { YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag); @@ -1729,34 +1293,39 @@ int vp9_update_entropy(VP9_COMP * cpi, int update) { return 0; } - -#ifdef OUTPUT_YUV_SRC -void vp9_write_yuv_frame(YV12_BUFFER_CONFIG *s) { +#if CONFIG_VP9_TEMPORAL_DENOISING +#if defined(OUTPUT_YUV_DENOISED) +// The denoiser buffer is allocated as a YUV 440 buffer. This function writes it +// as YUV 420. We simply use the top-left pixels of the UV buffers, since we do +// not denoise the UV channels at this time. If ever we implement UV channel +// denoising we will have to modify this. 
+void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) { uint8_t *src = s->y_buffer; int h = s->y_height; do { - fwrite(src, s->y_width, 1, yuv_file); + fwrite(src, s->y_width, 1, f); src += s->y_stride; } while (--h); src = s->u_buffer; - h = s->uv_height; + h = s->uv_height / 2; do { - fwrite(src, s->uv_width, 1, yuv_file); - src += s->uv_stride; + fwrite(src, s->uv_width / 2, 1, f); + src += s->uv_stride + s->uv_width / 2; } while (--h); src = s->v_buffer; - h = s->uv_height; + h = s->uv_height / 2; do { - fwrite(src, s->uv_width, 1, yuv_file); - src += s->uv_stride; + fwrite(src, s->uv_width / 2, 1, f); + src += s->uv_stride + s->uv_width / 2; } while (--h); } #endif +#endif #ifdef OUTPUT_YUV_REC void vp9_write_yuv_rec_frame(VP9_COMMON *cm) { @@ -1785,111 +1354,68 @@ void vp9_write_yuv_rec_frame(VP9_COMMON *cm) { src += s->uv_stride; } while (--h); -#if CONFIG_ALPHA - if (s->alpha_buffer) { - src = s->alpha_buffer; - h = s->alpha_height; - do { - fwrite(src, s->alpha_width, 1, yuv_rec_file); - src += s->alpha_stride; - } while (--h); - } -#endif - fflush(yuv_rec_file); } #endif -static void scale_and_extend_frame_nonnormative(YV12_BUFFER_CONFIG *src_fb, - YV12_BUFFER_CONFIG *dst_fb) { - const int in_w = src_fb->y_crop_width; - const int in_h = src_fb->y_crop_height; - const int out_w = dst_fb->y_crop_width; - const int out_h = dst_fb->y_crop_height; - const int in_w_uv = src_fb->uv_crop_width; - const int in_h_uv = src_fb->uv_crop_height; - const int out_w_uv = dst_fb->uv_crop_width; - const int out_h_uv = dst_fb->uv_crop_height; +static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst) { + // TODO(dkovalev): replace YV12_BUFFER_CONFIG with vpx_image_t int i; - - uint8_t *srcs[4] = {src_fb->y_buffer, src_fb->u_buffer, src_fb->v_buffer, - src_fb->alpha_buffer}; - int src_strides[4] = {src_fb->y_stride, src_fb->uv_stride, src_fb->uv_stride, - src_fb->alpha_stride}; - - uint8_t *dsts[4] = {dst_fb->y_buffer, 
dst_fb->u_buffer, dst_fb->v_buffer, - dst_fb->alpha_buffer}; - int dst_strides[4] = {dst_fb->y_stride, dst_fb->uv_stride, dst_fb->uv_stride, - dst_fb->alpha_stride}; - - for (i = 0; i < MAX_MB_PLANE; ++i) { - if (i == 0 || i == 3) { - // Y and alpha planes - vp9_resize_plane(srcs[i], in_h, in_w, src_strides[i], - dsts[i], out_h, out_w, dst_strides[i]); - } else { - // Chroma planes - vp9_resize_plane(srcs[i], in_h_uv, in_w_uv, src_strides[i], - dsts[i], out_h_uv, out_w_uv, dst_strides[i]); - } - } - vp8_yv12_extend_frame_borders(dst_fb); -} - -static void scale_and_extend_frame(YV12_BUFFER_CONFIG *src_fb, - YV12_BUFFER_CONFIG *dst_fb) { - const int in_w = src_fb->y_crop_width; - const int in_h = src_fb->y_crop_height; - const int out_w = dst_fb->y_crop_width; - const int out_h = dst_fb->y_crop_height; + const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer}; + const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride}; + const int src_widths[3] = {src->y_crop_width, src->uv_crop_width, + src->uv_crop_width }; + const int src_heights[3] = {src->y_crop_height, src->uv_crop_height, + src->uv_crop_height}; + uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer}; + const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride}; + const int dst_widths[3] = {dst->y_crop_width, dst->uv_crop_width, + dst->uv_crop_width}; + const int dst_heights[3] = {dst->y_crop_height, dst->uv_crop_height, + dst->uv_crop_height}; + + for (i = 0; i < MAX_MB_PLANE; ++i) + vp9_resize_plane(srcs[i], src_heights[i], src_widths[i], src_strides[i], + dsts[i], dst_heights[i], dst_widths[i], dst_strides[i]); + + vp9_extend_frame_borders(dst); +} + +static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst) { + const int src_w = src->y_crop_width; + const int src_h = src->y_crop_height; + const int dst_w = dst->y_crop_width; + const int dst_h = dst->y_crop_height; + const uint8_t *const srcs[3] = 
{src->y_buffer, src->u_buffer, src->v_buffer}; + const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride}; + uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer}; + const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride}; + const InterpKernel *const kernel = vp9_get_interp_kernel(EIGHTTAP); int x, y, i; - uint8_t *srcs[4] = {src_fb->y_buffer, src_fb->u_buffer, src_fb->v_buffer, - src_fb->alpha_buffer}; - int src_strides[4] = {src_fb->y_stride, src_fb->uv_stride, src_fb->uv_stride, - src_fb->alpha_stride}; - - uint8_t *dsts[4] = {dst_fb->y_buffer, dst_fb->u_buffer, dst_fb->v_buffer, - dst_fb->alpha_buffer}; - int dst_strides[4] = {dst_fb->y_stride, dst_fb->uv_stride, dst_fb->uv_stride, - dst_fb->alpha_stride}; - - for (y = 0; y < out_h; y += 16) { - for (x = 0; x < out_w; x += 16) { + for (y = 0; y < dst_h; y += 16) { + for (x = 0; x < dst_w; x += 16) { for (i = 0; i < MAX_MB_PLANE; ++i) { const int factor = (i == 0 || i == 3 ? 1 : 2); - const int x_q4 = x * (16 / factor) * in_w / out_w; - const int y_q4 = y * (16 / factor) * in_h / out_h; + const int x_q4 = x * (16 / factor) * src_w / dst_w; + const int y_q4 = y * (16 / factor) * src_h / dst_h; const int src_stride = src_strides[i]; const int dst_stride = dst_strides[i]; - uint8_t *src = srcs[i] + y / factor * in_h / out_h * src_stride + - x / factor * in_w / out_w; - uint8_t *dst = dsts[i] + y / factor * dst_stride + x / factor; + const uint8_t *src_ptr = srcs[i] + (y / factor) * src_h / dst_h * + src_stride + (x / factor) * src_w / dst_w; + uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor); - vp9_convolve8(src, src_stride, dst, dst_stride, - vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w, - vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h, + vp9_convolve8(src_ptr, src_stride, dst_ptr, dst_stride, + kernel[x_q4 & 0xf], 16 * src_w / dst_w, + kernel[y_q4 & 0xf], 16 * src_h / dst_h, 16 / factor, 16 / factor); } } } - 
vp8_yv12_extend_frame_borders(dst_fb); -} - -static int find_fp_qindex() { - int i; - - for (i = 0; i < QINDEX_RANGE; i++) { - if (vp9_convert_qindex_to_q(i) >= 30.0) { - break; - } - } - - if (i == QINDEX_RANGE) - i--; - - return i; + vp9_extend_frame_borders(dst); } #define WRITE_RECON_BUFFER 0 @@ -1933,6 +1459,7 @@ static int recode_loop_test(const VP9_COMP *cpi, int q, int maxq, int minq) { const VP9_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; int force_recode = 0; // Special case trap if maximum allowed frame size exceeded. @@ -1950,10 +1477,10 @@ static int recode_loop_test(const VP9_COMP *cpi, if ((rc->projected_frame_size > high_limit && q < maxq) || (rc->projected_frame_size < low_limit && q > minq)) { force_recode = 1; - } else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) { + } else if (cpi->oxcf.rc_mode == VPX_CQ) { // Deal with frame undershoot and whether or not we are // below the automatically set cq level. - if (q > cpi->cq_target_quality && + if (q > oxcf->cq_level && rc->projected_frame_size < ((rc->this_frame_target * 7) >> 3)) { force_recode = 1; } @@ -1972,23 +1499,15 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx); ref_cnt_fb(cm->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx); - } -#if CONFIG_MULTIPLE_ARF - else if (!cpi->multi_arf_enabled && cpi->refresh_golden_frame && - !cpi->refresh_alt_ref_frame) { -#else - else if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame && - !cpi->use_svc) { -#endif - /* Preserve the previously existing golden frame and update the frame in - * the alt ref slot instead. 
This is highly specific to the current use of - * alt-ref as a forward reference, and this needs to be generalized as - * other uses are implemented (like RTC/temporal scaling) - * - * The update to the buffer in the alt ref slot was signaled in - * vp9_pack_bitstream(), now swap the buffer pointers so that it's treated - * as the golden frame next time. - */ + } else if (vp9_preserve_existing_gf(cpi)) { + // We have decided to preserve the previously existing golden frame as our + // new ARF frame. However, in the short term in function + // vp9_bitstream.c::get_refresh_mask() we left it in the GF slot and, if + // we're updating the GF with the current decoded frame, we save it to the + // ARF slot instead. + // We now have to update the ARF with the current frame and swap gld_fb_idx + // and alt_fb_idx so that, overall, we've stored the old GF in the new ARF + // slot and, if we're updating the GF, the current frame becomes the new GF. int tmp; ref_cnt_fb(cm->frame_bufs, @@ -1997,14 +1516,19 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { tmp = cpi->alt_fb_idx; cpi->alt_fb_idx = cpi->gld_fb_idx; cpi->gld_fb_idx = tmp; - } else { /* For non key/golden frames */ + + if (is_spatial_svc(cpi)) { + cpi->svc.layer_context[0].gold_ref_idx = cpi->gld_fb_idx; + cpi->svc.layer_context[0].alt_ref_idx = cpi->alt_fb_idx; + } + } else { /* For non key/golden frames */ if (cpi->refresh_alt_ref_frame) { int arf_idx = cpi->alt_fb_idx; -#if CONFIG_MULTIPLE_ARF - if (cpi->multi_arf_enabled) { - arf_idx = cpi->arf_buffer_idx[cpi->sequence_number + 1]; + if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + arf_idx = gf_group->arf_update_idx[gf_group->index]; } -#endif + ref_cnt_fb(cm->frame_bufs, &cm->ref_frame_map[arf_idx], cm->new_fb_idx); } @@ -2019,6 +1543,16 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { ref_cnt_fb(cm->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx); } +#if 
CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0) { + vp9_denoiser_update_frame_info(&cpi->denoiser, + *cpi->Source, + cpi->common.frame_type, + cpi->refresh_alt_ref_frame, + cpi->refresh_golden_frame, + cpi->refresh_last_frame); + } +#endif } static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { @@ -2040,7 +1574,7 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { } if (lf->filter_level > 0) { - vp9_loop_filter_frame(cm, xd, lf->filter_level, 0, 0); + vp9_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0); } vp9_extend_frame_inner_borders(cm->frame_to_show); @@ -2049,13 +1583,15 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { void vp9_scale_references(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; MV_REFERENCE_FRAME ref_frame; + const VP9_REFFRAME ref_mask[3] = {VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG}; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)]; - YV12_BUFFER_CONFIG *const ref = &cm->frame_bufs[idx].buf; + const YV12_BUFFER_CONFIG *const ref = &cm->frame_bufs[idx].buf; - if (ref->y_crop_width != cm->width || - ref->y_crop_height != cm->height) { + // Need to convert from VP9_REFFRAME to index into ref_mask (subtract 1). 
+ if ((cpi->ref_frame_flags & ref_mask[ref_frame - 1]) && + (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height)) { const int new_fb = get_free_fb(cm); vp9_realloc_frame_buffer(&cm->frame_bufs[new_fb].buf, cm->width, cm->height, @@ -2111,8 +1647,8 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { recon_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm)); if (cpi->twopass.total_left_stats.coded_error != 0.0) - fprintf(f, "%10u %10d %10d %10d %10d %10d " - "%10"PRId64" %10"PRId64" %10d " + fprintf(f, "%10u %10d %10d %10d %10d" + "%10"PRId64" %10"PRId64" %10"PRId64" %10"PRId64" %10d " "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf" "%6d %6d %5d %5d %5d " "%10"PRId64" %10.3lf" @@ -2121,14 +1657,15 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { cpi->rc.projected_frame_size, cpi->rc.projected_frame_size / cpi->common.MBs, (cpi->rc.projected_frame_size - cpi->rc.this_frame_target), + cpi->rc.vbr_bits_off_target, cpi->rc.total_target_vs_actual, - (cpi->oxcf.starting_buffer_level - cpi->rc.bits_off_target), + (cpi->rc.starting_buffer_level - cpi->rc.bits_off_target), cpi->rc.total_actual_bits, cm->base_qindex, vp9_convert_qindex_to_q(cm->base_qindex), (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0, + vp9_convert_qindex_to_q(cpi->twopass.active_worst_quality), cpi->rc.avg_q, - vp9_convert_qindex_to_q(cpi->rc.ni_av_qi), - vp9_convert_qindex_to_q(cpi->cq_target_quality), + vp9_convert_qindex_to_q(cpi->oxcf.cq_level), cpi->refresh_last_frame, cpi->refresh_golden_frame, cpi->refresh_alt_ref_frame, cm->frame_type, cpi->rc.gfu_boost, cpi->twopass.bits_left, @@ -2159,26 +1696,11 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { #endif static void encode_without_recode_loop(VP9_COMP *cpi, - size_t *size, - uint8_t *dest, int q) { VP9_COMMON *const cm = &cpi->common; vp9_clear_system_state(); vp9_set_quantizer(cm, q); - - // Set up entropy context depending on frame type. 
The decoder mandates - // the use of the default context, index 0, for keyframes and inter - // frames where the error_resilient_mode or intra_only flag is set. For - // other inter-frames the encoder currently uses only two contexts; - // context 1 for ALTREF frames and context 0 for the others. - if (cm->frame_type == KEY_FRAME) { - setup_key_frame(cpi); - } else { - if (!cm->intra_only && !cm->error_resilient_mode && !cpi->use_svc) - cm->frame_context_idx = cpi->refresh_alt_ref_frame; - - setup_inter_frame(cm); - } + setup_frame(cpi); // Variance adaptive and in frame q adjustment experiments are mutually // exclusive. if (cpi->oxcf.aq_mode == VARIANCE_AQ) { @@ -2223,21 +1745,8 @@ static void encode_with_recode_loop(VP9_COMP *cpi, vp9_set_quantizer(cm, q); - if (loop_count == 0) { - // Set up entropy context depending on frame type. The decoder mandates - // the use of the default context, index 0, for keyframes and inter - // frames where the error_resilient_mode or intra_only flag is set. For - // other inter-frames the encoder currently uses only two contexts; - // context 1 for ALTREF frames and context 0 for the others. - if (cm->frame_type == KEY_FRAME) { - setup_key_frame(cpi); - } else { - if (!cm->intra_only && !cm->error_resilient_mode && !cpi->use_svc) - cpi->common.frame_context_idx = cpi->refresh_alt_ref_frame; - - setup_inter_frame(cm); - } - } + if (loop_count == 0) + setup_frame(cpi); // Variance adaptive and in frame q adjustment experiments are mutually // exclusive. @@ -2272,7 +1781,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, frame_over_shoot_limit = 1; } - if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) { + if (cpi->oxcf.rc_mode == VPX_Q) { loop = 0; } else { if ((cm->frame_type == KEY_FRAME) && @@ -2370,7 +1879,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, // This should only trigger where there is very substantial // undershoot on a frame and the auto cq level is above // the user passsed in value. 
- if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY && + if (cpi->oxcf.rc_mode == VPX_CQ && q < q_low) { q_low = q; } @@ -2431,7 +1940,8 @@ static void get_ref_frame_flags(VP9_COMP *cpi) { if (cpi->gold_is_last) cpi->ref_frame_flags &= ~VP9_GOLD_FLAG; - if (cpi->rc.frames_till_gf_update_due == INT_MAX) + if (cpi->rc.frames_till_gf_update_due == INT_MAX && + !is_spatial_svc(cpi)) cpi->ref_frame_flags &= ~VP9_GOLD_FLAG; if (cpi->alt_is_last) @@ -2458,6 +1968,55 @@ static void set_ext_overrides(VP9_COMP *cpi) { } } +YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm, + YV12_BUFFER_CONFIG *unscaled, + YV12_BUFFER_CONFIG *scaled) { + if (cm->mi_cols * MI_SIZE != unscaled->y_width || + cm->mi_rows * MI_SIZE != unscaled->y_height) { + scale_and_extend_frame_nonnormative(unscaled, scaled); + return scaled; + } else { + return unscaled; + } +} + +static void configure_skippable_frame(VP9_COMP *cpi) { + // If the current frame does not have non-zero motion vector detected in the + // first pass, and so do its previous and forward frames, then this frame + // can be skipped for partition check, and the partition size is assigned + // according to the variance + + SVC *const svc = &cpi->svc; + TWO_PASS *const twopass = is_spatial_svc(cpi) ? 
+ &svc->layer_context[svc->spatial_layer_id].twopass + : &cpi->twopass; + + cpi->skippable_frame = (!frame_is_intra_only(&cpi->common) && + twopass->stats_in - 2 > twopass->stats_in_start && + twopass->stats_in < twopass->stats_in_end && + (twopass->stats_in - 1)->pcnt_inter - (twopass->stats_in - 1)->pcnt_motion + == 1 && + (twopass->stats_in - 2)->pcnt_inter - (twopass->stats_in - 2)->pcnt_motion + == 1 && + twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1); +} + +static void set_arf_sign_bias(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + int arf_sign_bias; + + if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + arf_sign_bias = cpi->rc.source_alt_ref_active && + (!cpi->refresh_alt_ref_frame || + (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)); + } else { + arf_sign_bias = + (cpi->rc.source_alt_ref_active && !cpi->refresh_alt_ref_frame); + } + cm->ref_frame_sign_bias[ALTREF_FRAME] = arf_sign_bias; +} + static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, uint8_t *dest, @@ -2471,30 +2030,14 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, const SPEED_FEATURES *const sf = &cpi->sf; const unsigned int max_mv_def = MIN(cm->width, cm->height); struct segmentation *const seg = &cm->seg; - set_ext_overrides(cpi); - /* Scale the source buffer, if required. */ - if (cm->mi_cols * MI_SIZE != cpi->un_scaled_source->y_width || - cm->mi_rows * MI_SIZE != cpi->un_scaled_source->y_height) { - scale_and_extend_frame_nonnormative(cpi->un_scaled_source, - &cpi->scaled_source); - cpi->Source = &cpi->scaled_source; - } else { - cpi->Source = cpi->un_scaled_source; - } + cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source, + &cpi->scaled_source); - // Scale the last source buffer, if required. 
- if (cpi->unscaled_last_source != NULL) { - if (cm->mi_cols * MI_SIZE != cpi->unscaled_last_source->y_width || - cm->mi_rows * MI_SIZE != cpi->unscaled_last_source->y_height) { - scale_and_extend_frame_nonnormative(cpi->unscaled_last_source, - &cpi->scaled_last_source); - cpi->Last_Source = &cpi->scaled_last_source; - } else { - cpi->Last_Source = cpi->unscaled_last_source; - } - } + if (cpi->unscaled_last_source != NULL) + cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source, + &cpi->scaled_last_source); vp9_scale_references(cpi); @@ -2506,16 +2049,16 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->zbin_mode_boost = 0; cpi->zbin_mode_boost_enabled = 0; - // Current default encoder behavior for the altref sign bias. - cm->ref_frame_sign_bias[ALTREF_FRAME] = cpi->rc.source_alt_ref_active; + // Set the arf sign bias for this frame. + set_arf_sign_bias(cpi); // Set default state for segment based loop filter update flags. cm->lf.mode_ref_delta_update = 0; // Initialize cpi->mv_step_param to default based on max resolution. - cpi->mv_step_param = vp9_init_search_range(cpi, max_mv_def); + cpi->mv_step_param = vp9_init_search_range(max_mv_def); // Initialize cpi->max_mv_magnitude and cpi->mv_step_param if appropriate. - if (sf->auto_mv_step_size) { + if (sf->mv.auto_mv_step_size) { if (frame_is_intra_only(cm)) { // Initialize max_mv_magnitude for use in the first INTER frame // after a key/intra-only frame. @@ -2525,7 +2068,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // Allow mv_steps to correspond to twice the max mv magnitude found // in the previous frame, capped by the default max_mv_magnitude based // on resolution. 
- cpi->mv_step_param = vp9_init_search_range(cpi, MIN(max_mv_def, 2 * + cpi->mv_step_param = vp9_init_search_range(MIN(max_mv_def, 2 * cpi->max_mv_magnitude)); cpi->max_mv_magnitude = 0; } @@ -2533,7 +2076,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // Set various flags etc to special state if it is a key frame. if (frame_is_intra_only(cm)) { - setup_key_frame(cpi); // Reset the loop filter deltas and segmentation map. vp9_reset_segment_features(&cm->seg); @@ -2551,9 +2093,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, (cpi->oxcf.frame_parallel_decoding_mode != 0); // By default, encoder assumes decoder can use prev_mi. - cm->coding_use_prev_mi = 1; if (cm->error_resilient_mode) { - cm->coding_use_prev_mi = 0; cm->frame_parallel_decoding_mode = 1; cm->reset_frame_context = 0; cm->refresh_frame_context = 0; @@ -2567,13 +2107,20 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // static regions if indicated. // Only allowed in second pass of two pass (as requires lagged coding) // and if the relevant speed feature flag is set. - if (cpi->pass == 2 && cpi->sf.static_segmentation) + if (cpi->oxcf.pass == 2 && cpi->sf.static_segmentation) configure_static_seg_features(cpi); + // Check if the current frame is skippable for the partition search in the + // second pass according to the first pass stats + if (cpi->oxcf.pass == 2 && + (!cpi->use_svc || is_spatial_svc(cpi))) { + configure_skippable_frame(cpi); + } + // For 1 pass CBR, check if we are dropping this frame. // Never drop on key frame. 
- if (cpi->pass == 0 && - cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && + if (cpi->oxcf.pass == 0 && + cpi->oxcf.rc_mode == VPX_CBR && cm->frame_type != KEY_FRAME) { if (vp9_rc_drop_frame(cpi)) { vp9_rc_postencode_update_drop_frame(cpi); @@ -2584,8 +2131,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, vp9_clear_system_state(); - vp9_zero(cpi->rd_tx_select_threshes); - #if CONFIG_VP9_POSTPROC if (cpi->oxcf.noise_sensitivity > 0) { int l = 0; @@ -2611,27 +2156,33 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } #endif -#ifdef OUTPUT_YUV_SRC - vp9_write_yuv_frame(cpi->Source); -#endif - set_speed_features(cpi); // Decide q and q bounds. q = vp9_rc_pick_q_and_bounds(cpi, &bottom_index, &top_index); if (!frame_is_intra_only(cm)) { - cm->interp_filter = DEFAULT_INTERP_FILTER; + cm->interp_filter = cpi->sf.default_interp_filter; /* TODO: Decide this more intelligently */ - set_high_precision_mv(cpi, q < HIGH_PRECISION_MV_QTHRESH); + vp9_set_high_precision_mv(cpi, q < HIGH_PRECISION_MV_QTHRESH); } if (cpi->sf.recode_loop == DISALLOW_RECODE) { - encode_without_recode_loop(cpi, size, dest, q); + encode_without_recode_loop(cpi, q); } else { encode_with_recode_loop(cpi, size, dest, q, bottom_index, top_index); } +#if CONFIG_VP9_TEMPORAL_DENOISING +#ifdef OUTPUT_YUV_DENOISED + if (cpi->oxcf.noise_sensitivity > 0) { + vp9_write_yuv_frame_420(&cpi->denoiser.running_avg_y[INTRA_FRAME], + yuv_denoised_file); + } +#endif +#endif + + // Special case code to reduce pulsing when key frames are forced at a // fixed interval. 
Note the reconstruction error if it is the frame before // the force key frame @@ -2689,51 +2240,30 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } } -#if 0 - output_frame_level_debug_stats(cpi); -#endif if (cpi->refresh_golden_frame == 1) - cm->frame_flags |= FRAMEFLAGS_GOLDEN; + cpi->frame_flags |= FRAMEFLAGS_GOLDEN; else - cm->frame_flags &= ~FRAMEFLAGS_GOLDEN; + cpi->frame_flags &= ~FRAMEFLAGS_GOLDEN; if (cpi->refresh_alt_ref_frame == 1) - cm->frame_flags |= FRAMEFLAGS_ALTREF; + cpi->frame_flags |= FRAMEFLAGS_ALTREF; else - cm->frame_flags &= ~FRAMEFLAGS_ALTREF; + cpi->frame_flags &= ~FRAMEFLAGS_ALTREF; get_ref_frame_flags(cpi); + cm->last_frame_type = cm->frame_type; vp9_rc_postencode_update(cpi, *size); +#if 0 + output_frame_level_debug_stats(cpi); +#endif + if (cm->frame_type == KEY_FRAME) { // Tell the caller that the frame was coded as a key frame - *frame_flags = cm->frame_flags | FRAMEFLAGS_KEY; - -#if CONFIG_MULTIPLE_ARF - // Reset the sequence number. - if (cpi->multi_arf_enabled) { - cpi->sequence_number = 0; - cpi->frame_coding_order_period = cpi->new_frame_coding_order_period; - cpi->new_frame_coding_order_period = -1; - } -#endif + *frame_flags = cpi->frame_flags | FRAMEFLAGS_KEY; } else { - *frame_flags = cm->frame_flags&~FRAMEFLAGS_KEY; - -#if CONFIG_MULTIPLE_ARF - /* Increment position in the coded frame sequence. 
*/ - if (cpi->multi_arf_enabled) { - ++cpi->sequence_number; - if (cpi->sequence_number >= cpi->frame_coding_order_period) { - cpi->sequence_number = 0; - cpi->frame_coding_order_period = cpi->new_frame_coding_order_period; - cpi->new_frame_coding_order_period = -1; - } - cpi->this_frame_weight = cpi->arf_weight[cpi->sequence_number]; - assert(cpi->this_frame_weight >= 0); - } -#endif + *frame_flags = cpi->frame_flags & ~FRAMEFLAGS_KEY; } // Clear the one shot update flags for segmentation map and mode/ref loop @@ -2759,10 +2289,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, if (cpi->use_svc) vp9_inc_frame_in_layer(&cpi->svc); } - - // restore prev_mi - cm->prev_mi = cm->prev_mip + cm->mi_stride + 1; - cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1; } static void SvcEncode(VP9_COMP *cpi, size_t *size, uint8_t *dest, @@ -2773,7 +2299,7 @@ static void SvcEncode(VP9_COMP *cpi, size_t *size, uint8_t *dest, static void Pass0Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, unsigned int *frame_flags) { - if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + if (cpi->oxcf.rc_mode == VPX_CBR) { vp9_rc_get_one_pass_cbr_params(cpi); } else { vp9_rc_get_one_pass_vbr_params(cpi); @@ -2781,17 +2307,6 @@ static void Pass0Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, encode_frame_to_data_rate(cpi, size, dest, frame_flags); } -static void Pass1Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, - unsigned int *frame_flags) { - (void) size; - (void) dest; - (void) frame_flags; - - vp9_rc_get_first_pass_params(cpi); - vp9_set_quantizer(&cpi->common, find_fp_qindex()); - vp9_first_pass(cpi); -} - static void Pass2Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, unsigned int *frame_flags) { cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; @@ -2802,6 +2317,16 @@ static void Pass2Encode(VP9_COMP *cpi, size_t *size, vp9_twopass_postencode_update(cpi); } +static void init_motion_estimation(VP9_COMP *cpi) { + int y_stride = 
cpi->scaled_source.y_stride; + + if (cpi->sf.mv.search_method == NSTEP) { + vp9_init3smotion_compensation(&cpi->ss_cfg, y_stride); + } else if (cpi->sf.mv.search_method == DIAMOND) { + vp9_init_dsmotion_compensation(&cpi->ss_cfg, y_stride); + } +} + static void check_initial_width(VP9_COMP *cpi, int subsampling_x, int subsampling_y) { VP9_COMMON *const cm = &cpi->common; @@ -2809,7 +2334,13 @@ static void check_initial_width(VP9_COMP *cpi, int subsampling_x, if (!cpi->initial_width) { cm->subsampling_x = subsampling_x; cm->subsampling_y = subsampling_y; + alloc_raw_frame_buffers(cpi); + alloc_ref_frame_buffers(cpi); + alloc_util_frame_buffers(cpi); + + init_motion_estimation(cpi); + cpi->initial_width = cm->width; cpi->initial_height = cm->height; } @@ -2826,16 +2357,32 @@ int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags, const int subsampling_y = sd->uv_height < sd->y_height; check_initial_width(cpi, subsampling_x, subsampling_y); + vpx_usec_timer_start(&timer); - if (vp9_lookahead_push(cpi->lookahead, - sd, time_stamp, end_time, frame_flags)) + +#if CONFIG_SPATIAL_SVC + if (is_spatial_svc(cpi)) + res = vp9_svc_lookahead_push(cpi, cpi->lookahead, sd, time_stamp, end_time, + frame_flags); + else +#endif + res = vp9_lookahead_push(cpi->lookahead, + sd, time_stamp, end_time, frame_flags); + if (res) res = -1; vpx_usec_timer_mark(&timer); cpi->time_receive_data += vpx_usec_timer_elapsed(&timer); - if (cm->profile == PROFILE_0 && (subsampling_x != 1 || subsampling_y != 1)) { + if ((cm->profile == PROFILE_0 || cm->profile == PROFILE_2) && + (subsampling_x != 1 || subsampling_y != 1)) { + vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM, + "Non-4:2:0 color space requires profile 1 or 3"); + res = -1; + } + if ((cm->profile == PROFILE_1 || cm->profile == PROFILE_3) && + (subsampling_x == 1 && subsampling_y == 1)) { vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM, - "Non-4:2:0 color space requires profile >= 1"); + "4:2:0 color space requires 
profile 0 or 2"); res = -1; } @@ -2856,13 +2403,6 @@ static int frame_is_reference(const VP9_COMP *cpi) { cm->seg.update_data; } -#if CONFIG_MULTIPLE_ARF -int is_next_frame_arf(VP9_COMP *cpi) { - // Negative entry in frame_coding_order indicates an ARF at this position. - return cpi->frame_coding_order[cpi->sequence_number + 1] < 0 ? 1 : 0; -} -#endif - void adjust_frame_rate(VP9_COMP *cpi) { int64_t this_duration; int step = 0; @@ -2901,6 +2441,46 @@ void adjust_frame_rate(VP9_COMP *cpi) { cpi->last_end_time_stamp_seen = cpi->source->ts_end; } +// Returns 0 if this is not an alt ref else the offset of the source frame +// used as the arf midpoint. +static int get_arf_src_index(VP9_COMP *cpi) { + RATE_CONTROL *const rc = &cpi->rc; + int arf_src_index = 0; + if (is_altref_enabled(cpi)) { + if (cpi->oxcf.pass == 2) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { + arf_src_index = gf_group->arf_src_offset[gf_group->index]; + } + } else if (rc->source_alt_ref_pending) { + arf_src_index = rc->frames_till_gf_update_due; + } + } + return arf_src_index; +} + +static void check_src_altref(VP9_COMP *cpi) { + RATE_CONTROL *const rc = &cpi->rc; + + if (cpi->oxcf.pass == 2) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + rc->is_src_frame_alt_ref = + (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE); + } else { + rc->is_src_frame_alt_ref = cpi->alt_ref_source && + (cpi->source == cpi->alt_ref_source); + } + + if (rc->is_src_frame_alt_ref) { + // Current frame is an ARF overlay frame. + cpi->alt_ref_source = NULL; + + // Don't refresh the last buffer for an ARF overlay frame. It will + // become the GF so preserve last as an alternative prediction option. 
+ cpi->refresh_last_frame = 0; + } +} + int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, size_t *size, uint8_t *dest, int64_t *time_stamp, int64_t *time_end, int flush) { @@ -2910,11 +2490,15 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, struct vpx_usec_timer cmptimer; YV12_BUFFER_CONFIG *force_src_buffer = NULL; MV_REFERENCE_FRAME ref_frame; + int arf_src_index; if (!cpi) return -1; - if (cpi->svc.number_spatial_layers > 1 && cpi->pass == 2) { + if (is_spatial_svc(cpi) && cpi->oxcf.pass == 2) { +#if CONFIG_SPATIAL_SVC + vp9_svc_lookahead_peek(cpi, cpi->lookahead, 0, 1); +#endif vp9_restore_layer_context(cpi); } @@ -2923,7 +2507,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cpi->source = NULL; cpi->last_source = NULL; - set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV); + vp9_set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV); // Normal defaults cm->reset_frame_context = 0; @@ -2932,35 +2516,38 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cpi->refresh_golden_frame = 0; cpi->refresh_alt_ref_frame = 0; - // Should we code an alternate reference frame. - if (cpi->oxcf.play_alternate && rc->source_alt_ref_pending) { - int frames_to_arf; - -#if CONFIG_MULTIPLE_ARF - assert(!cpi->multi_arf_enabled || - cpi->frame_coding_order[cpi->sequence_number] < 0); + // Should we encode an arf frame. 
+ arf_src_index = get_arf_src_index(cpi); + if (arf_src_index) { + assert(arf_src_index <= rc->frames_to_key); - if (cpi->multi_arf_enabled && (cpi->pass == 2)) - frames_to_arf = (-cpi->frame_coding_order[cpi->sequence_number]) - - cpi->next_frame_in_order; +#if CONFIG_SPATIAL_SVC + if (is_spatial_svc(cpi)) + cpi->source = vp9_svc_lookahead_peek(cpi, cpi->lookahead, + arf_src_index, 0); else #endif - frames_to_arf = rc->frames_till_gf_update_due; - - assert(frames_to_arf <= rc->frames_to_key); - - if ((cpi->source = vp9_lookahead_peek(cpi->lookahead, frames_to_arf))) { -#if CONFIG_MULTIPLE_ARF - cpi->alt_ref_source[cpi->arf_buffered] = cpi->source; -#else + cpi->source = vp9_lookahead_peek(cpi->lookahead, arf_src_index); + if (cpi->source != NULL) { cpi->alt_ref_source = cpi->source; + +#if CONFIG_SPATIAL_SVC + if (is_spatial_svc(cpi) && cpi->svc.spatial_layer_id > 0) { + int i; + // Reference a hidden frame from a lower layer + for (i = cpi->svc.spatial_layer_id - 1; i >= 0; --i) { + if (cpi->oxcf.ss_play_alternate[i]) { + cpi->gld_fb_idx = cpi->svc.layer_context[i].alt_ref_idx; + break; + } + } + } + cpi->svc.layer_context[cpi->svc.spatial_layer_id].has_alt_frame = 1; #endif if (cpi->oxcf.arnr_max_frames > 0) { // Produce the filtered ARF frame. - // TODO(agrange) merge these two functions. 
- vp9_configure_arnr_filter(cpi, frames_to_arf, rc->gfu_boost); - vp9_temporal_filter_prepare(cpi, frames_to_arf); + vp9_temporal_filter(cpi, arf_src_index); vp9_extend_frame_borders(&cpi->alt_ref_buffer); force_src_buffer = &cpi->alt_ref_buffer; } @@ -2970,59 +2557,38 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cpi->refresh_golden_frame = 0; cpi->refresh_last_frame = 0; rc->is_src_frame_alt_ref = 0; - -#if CONFIG_MULTIPLE_ARF - if (!cpi->multi_arf_enabled) -#endif - rc->source_alt_ref_pending = 0; + rc->source_alt_ref_pending = 0; } else { rc->source_alt_ref_pending = 0; } } if (!cpi->source) { -#if CONFIG_MULTIPLE_ARF - int i; -#endif - // Get last frame source. if (cm->current_video_frame > 0) { - if ((cpi->last_source = vp9_lookahead_peek(cpi->lookahead, -1)) == NULL) +#if CONFIG_SPATIAL_SVC + if (is_spatial_svc(cpi)) + cpi->last_source = vp9_svc_lookahead_peek(cpi, cpi->lookahead, -1, 0); + else +#endif + cpi->last_source = vp9_lookahead_peek(cpi->lookahead, -1); + if (cpi->last_source == NULL) return -1; } - if ((cpi->source = vp9_lookahead_pop(cpi->lookahead, flush))) { + // Read in the source frame. +#if CONFIG_SPATIAL_SVC + if (is_spatial_svc(cpi)) + cpi->source = vp9_svc_lookahead_pop(cpi, cpi->lookahead, flush); + else +#endif + cpi->source = vp9_lookahead_pop(cpi->lookahead, flush); + if (cpi->source != NULL) { cm->show_frame = 1; cm->intra_only = 0; -#if CONFIG_MULTIPLE_ARF - // Is this frame the ARF overlay. - rc->is_src_frame_alt_ref = 0; - for (i = 0; i < cpi->arf_buffered; ++i) { - if (cpi->source == cpi->alt_ref_source[i]) { - rc->is_src_frame_alt_ref = 1; - cpi->refresh_golden_frame = 1; - break; - } - } -#else - rc->is_src_frame_alt_ref = cpi->alt_ref_source && - (cpi->source == cpi->alt_ref_source); -#endif - if (rc->is_src_frame_alt_ref) { - // Current frame is an ARF overlay frame. 
-#if CONFIG_MULTIPLE_ARF - cpi->alt_ref_source[i] = NULL; -#else - cpi->alt_ref_source = NULL; -#endif - // Don't refresh the last buffer for an ARF overlay frame. It will - // become the GF so preserve last as an alternative prediction option. - cpi->refresh_last_frame = 0; - } -#if CONFIG_MULTIPLE_ARF - ++cpi->next_frame_in_order; -#endif + // Check to see if the frame should be encoded as an arf overlay. + check_src_altref(cpi); } } @@ -3030,23 +2596,20 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cpi->un_scaled_source = cpi->Source = force_src_buffer ? force_src_buffer : &cpi->source->img; - if (cpi->last_source != NULL) { - cpi->unscaled_last_source = &cpi->last_source->img; - } else { - cpi->unscaled_last_source = NULL; - } + if (cpi->last_source != NULL) { + cpi->unscaled_last_source = &cpi->last_source->img; + } else { + cpi->unscaled_last_source = NULL; + } *time_stamp = cpi->source->ts_start; *time_end = cpi->source->ts_end; - *frame_flags = cpi->source->flags; + *frame_flags = + (cpi->source->flags & VPX_EFLAG_FORCE_KF) ? 
FRAMEFLAGS_KEY : 0; -#if CONFIG_MULTIPLE_ARF - if (cm->frame_type != KEY_FRAME && cpi->pass == 2) - rc->source_alt_ref_pending = is_next_frame_arf(cpi); -#endif } else { *size = 0; - if (flush && cpi->pass == 1 && !cpi->twopass.first_pass_done) { + if (flush && cpi->oxcf.pass == 1 && !cpi->twopass.first_pass_done) { vp9_end_first_pass(cpi); /* get last stats packet */ cpi->twopass.first_pass_done = 1; } @@ -3058,13 +2621,16 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cpi->last_end_time_stamp_seen = cpi->source->ts_start; } + // Clear down mmx registers + vp9_clear_system_state(); + // adjust frame rates based on timestamps given if (cm->show_frame) { adjust_frame_rate(cpi); } if (cpi->svc.number_temporal_layers > 1 && - cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + cpi->oxcf.rc_mode == VPX_CBR) { vp9_update_temporal_layer_framerate(cpi); vp9_restore_layer_context(cpi); } @@ -3072,27 +2638,31 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // start with a 0 size frame *size = 0; - // Clear down mmx registers - vp9_clear_system_state(); - /* find a free buffer for the new frame, releasing the reference previously * held. */ cm->frame_bufs[cm->new_fb_idx].ref_count--; cm->new_fb_idx = get_free_fb(cm); -#if CONFIG_MULTIPLE_ARF - /* Set up the correct ARF frame. 
*/ - if (cpi->refresh_alt_ref_frame) { - ++cpi->arf_buffered; - } - if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME) && - (cpi->pass == 2)) { - cpi->alt_fb_idx = cpi->arf_buffer_idx[cpi->sequence_number]; + if (!cpi->use_svc && cpi->multi_arf_allowed) { + if (cm->frame_type == KEY_FRAME) { + init_buffer_indices(cpi); + } else if (cpi->oxcf.pass == 2) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + cpi->alt_fb_idx = gf_group->arf_ref_idx[gf_group->index]; + } } -#endif - cm->frame_flags = *frame_flags; + cpi->frame_flags = *frame_flags; + + if (cpi->oxcf.pass == 2 && + cm->current_video_frame == 0 && + cpi->oxcf.allow_spatial_resampling && + cpi->oxcf.rc_mode == VPX_VBR) { + // Internal scaling is triggered on the first frame. + vp9_set_size_literal(cpi, cpi->oxcf.scaled_frame_width, + cpi->oxcf.scaled_frame_height); + } // Reset the frame pointers to the current frame size vp9_realloc_frame_buffer(get_frame_new_buffer(cm), @@ -3100,6 +2670,9 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cm->subsampling_x, cm->subsampling_y, VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL); + alloc_util_frame_buffers(cpi); + init_motion_estimation(cpi); + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)]; YV12_BUFFER_CONFIG *const buf = &cm->frame_bufs[idx].buf; @@ -3120,11 +2693,14 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, vp9_vaq_init(); } - if (cpi->pass == 1 && - (!cpi->use_svc || cpi->svc.number_temporal_layers == 1)) { - Pass1Encode(cpi, size, dest, frame_flags); - } else if (cpi->pass == 2 && - (!cpi->use_svc || cpi->svc.number_temporal_layers == 1)) { + if (cpi->oxcf.pass == 1 && + (!cpi->use_svc || is_spatial_svc(cpi))) { + const int lossless = is_lossless_requested(&cpi->oxcf); + cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4; + cpi->mb.itxm_add = lossless ? 
vp9_iwht4x4_add : vp9_idct4x4_add; + vp9_first_pass(cpi); + } else if (cpi->oxcf.pass == 2 && + (!cpi->use_svc || is_spatial_svc(cpi))) { Pass2Encode(cpi, size, dest, frame_flags); } else if (cpi->use_svc) { SvcEncode(cpi, size, dest, frame_flags); @@ -3147,20 +2723,20 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // Save layer specific state. if ((cpi->svc.number_temporal_layers > 1 && - cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) || - (cpi->svc.number_spatial_layers > 1 && cpi->pass == 2)) { + cpi->oxcf.rc_mode == VPX_CBR) || + (cpi->svc.number_spatial_layers > 1 && cpi->oxcf.pass == 2)) { vp9_save_layer_context(cpi); } vpx_usec_timer_mark(&cmptimer); cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer); - if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame) + if (cpi->b_calculate_psnr && cpi->oxcf.pass != 1 && cm->show_frame) generate_psnr_packet(cpi); #if CONFIG_INTERNAL_STATS - if (cpi->pass != 1) { + if (cpi->oxcf.pass != 1) { cpi->bytes += (int)(*size); if (cm->show_frame) { @@ -3184,6 +2760,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, PSNR_STATS psnr2; double frame_ssim2 = 0, weight = 0; #if CONFIG_VP9_POSTPROC + // TODO(agrange) Add resizing of post-proc buffer in here when the + // encoder is changed to use on-demand buffer allocation. 
vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer, cm->lf.filter_level * 10 / 6); #endif @@ -3237,6 +2815,9 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags) { VP9_COMMON *cm = &cpi->common; +#if !CONFIG_VP9_POSTPROC + (void)flags; +#endif if (!cm->show_frame) { return -1; @@ -3245,7 +2826,6 @@ int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest, #if CONFIG_VP9_POSTPROC ret = vp9_post_proc_frame(cm, dest, flags); #else - if (cm->frame_to_show) { *dest = *cm->frame_to_show; dest->y_width = cm->width; @@ -3256,75 +2836,31 @@ int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest, } else { ret = -1; } - #endif // !CONFIG_VP9_POSTPROC vp9_clear_system_state(); return ret; } } -int vp9_set_roimap(VP9_COMP *cpi, unsigned char *map, unsigned int rows, - unsigned int cols, int delta_q[MAX_SEGMENTS], - int delta_lf[MAX_SEGMENTS], - unsigned int threshold[MAX_SEGMENTS]) { - signed char feature_data[SEG_LVL_MAX][MAX_SEGMENTS]; - struct segmentation *seg = &cpi->common.seg; - int i; - - if (cpi->common.mb_rows != rows || cpi->common.mb_cols != cols) - return -1; - - if (!map) { - vp9_disable_segmentation(seg); - return 0; - } - - // Set the segmentation Map - vp9_set_segmentation_map(cpi, map); - - // Activate segmentation. 
- vp9_enable_segmentation(seg); - - // Set up the quant, LF and breakout threshold segment data - for (i = 0; i < MAX_SEGMENTS; i++) { - feature_data[SEG_LVL_ALT_Q][i] = delta_q[i]; - feature_data[SEG_LVL_ALT_LF][i] = delta_lf[i]; - cpi->segment_encode_breakout[i] = threshold[i]; - } - - // Enable the loop and quant changes in the feature mask - for (i = 0; i < MAX_SEGMENTS; i++) { - if (delta_q[i]) - vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q); - else - vp9_disable_segfeature(seg, i, SEG_LVL_ALT_Q); - - if (delta_lf[i]) - vp9_enable_segfeature(seg, i, SEG_LVL_ALT_LF); - else - vp9_disable_segfeature(seg, i, SEG_LVL_ALT_LF); - } - - // Initialize the feature data structure - // SEGMENT_DELTADATA 0, SEGMENT_ABSDATA 1 - vp9_set_segment_data(seg, &feature_data[0][0], SEGMENT_DELTADATA); - - return 0; -} - -int vp9_set_active_map(VP9_COMP *cpi, unsigned char *map, - unsigned int rows, unsigned int cols) { +int vp9_set_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols) { if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) { + const int mi_rows = cpi->common.mi_rows; + const int mi_cols = cpi->common.mi_cols; if (map) { - vpx_memcpy(cpi->active_map, map, rows * cols); - cpi->active_map_enabled = 1; + int r, c; + for (r = 0; r < mi_rows; r++) { + for (c = 0; c < mi_cols; c++) { + cpi->segmentation_map[r * mi_cols + c] = + !map[(r >> 1) * cols + (c >> 1)]; + } + } + vp9_enable_segfeature(&cpi->common.seg, 1, SEG_LVL_SKIP); + vp9_enable_segmentation(&cpi->common.seg); } else { - cpi->active_map_enabled = 0; + vp9_disable_segmentation(&cpi->common.seg); } - return 0; } else { - // cpi->active_map_enabled = 0; return -1; } } @@ -3343,10 +2879,11 @@ int vp9_set_internal_size(VP9_COMP *cpi, // always go to the next whole number cm->width = (hs - 1 + cpi->oxcf.width * hr) / hs; cm->height = (vs - 1 + cpi->oxcf.height * vr) / vs; - assert(cm->width <= cpi->initial_width); assert(cm->height <= cpi->initial_height); + update_frame_size(cpi); + return 0; 
} @@ -3379,10 +2916,11 @@ int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width, printf("Warning: Desired height too large, changed to %d\n", cm->height); } } - assert(cm->width <= cpi->initial_width); assert(cm->height <= cpi->initial_height); + update_frame_size(cpi); + return 0; } @@ -3403,3 +2941,42 @@ int vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b) { int vp9_get_quantizer(VP9_COMP *cpi) { return cpi->common.base_qindex; } + +void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags) { + if (flags & (VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_GF | + VP8_EFLAG_NO_REF_ARF)) { + int ref = 7; + + if (flags & VP8_EFLAG_NO_REF_LAST) + ref ^= VP9_LAST_FLAG; + + if (flags & VP8_EFLAG_NO_REF_GF) + ref ^= VP9_GOLD_FLAG; + + if (flags & VP8_EFLAG_NO_REF_ARF) + ref ^= VP9_ALT_FLAG; + + vp9_use_as_reference(cpi, ref); + } + + if (flags & (VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF | + VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_FORCE_GF | + VP8_EFLAG_FORCE_ARF)) { + int upd = 7; + + if (flags & VP8_EFLAG_NO_UPD_LAST) + upd ^= VP9_LAST_FLAG; + + if (flags & VP8_EFLAG_NO_UPD_GF) + upd ^= VP9_GOLD_FLAG; + + if (flags & VP8_EFLAG_NO_UPD_ARF) + upd ^= VP9_ALT_FLAG; + + vp9_update_reference(cpi, upd); + } + + if (flags & VP8_EFLAG_NO_UPD_ENTROPY) { + vp9_update_entropy(cpi, 0); + } +} diff --git a/libvpx/vp9/encoder/vp9_onyx_int.h b/libvpx/vp9/encoder/vp9_encoder.h index e30fb02b2..c841da267 100644 --- a/libvpx/vp9/encoder/vp9_onyx_int.h +++ b/libvpx/vp9/encoder/vp9_encoder.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_VP9_ONYX_INT_H_ -#define VP9_ENCODER_VP9_ONYX_INT_H_ +#ifndef VP9_ENCODER_VP9_ENCODER_H_ +#define VP9_ENCODER_VP9_ENCODER_H_ #include <stdio.h> @@ -24,6 +24,7 @@ #include "vp9/common/vp9_onyxc_int.h" #include "vp9/encoder/vp9_aq_cyclicrefresh.h" +#include "vp9/encoder/vp9_context_tree.h" #include "vp9/encoder/vp9_encodemb.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/encoder/vp9_lookahead.h" @@ -31,22 +32,21 @@ #include "vp9/encoder/vp9_mcomp.h" #include "vp9/encoder/vp9_quantize.h" #include "vp9/encoder/vp9_ratectrl.h" +#include "vp9/encoder/vp9_rd.h" #include "vp9/encoder/vp9_speed_features.h" #include "vp9/encoder/vp9_svc_layercontext.h" #include "vp9/encoder/vp9_tokenize.h" #include "vp9/encoder/vp9_variance.h" +#if CONFIG_VP9_TEMPORAL_DENOISING +#include "vp9/encoder/vp9_denoiser.h" +#endif #ifdef __cplusplus extern "C" { #endif -// #define MODE_TEST_HIT_STATS - #define DEFAULT_GF_INTERVAL 10 -#define MAX_MODES 30 -#define MAX_REFS 6 - typedef struct { int nmvjointcost[MV_JOINTS]; int nmvcosts[2][MV_VALS]; @@ -64,56 +64,6 @@ typedef struct { FRAME_CONTEXT fc; } CODING_CONTEXT; -// This enumerator type needs to be kept aligned with the mode order in -// const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code. -typedef enum { - THR_NEARESTMV, - THR_NEARESTA, - THR_NEARESTG, - - THR_DC, - - THR_NEWMV, - THR_NEWA, - THR_NEWG, - - THR_NEARMV, - THR_NEARA, - THR_COMP_NEARESTLA, - THR_COMP_NEARESTGA, - - THR_TM, - - THR_COMP_NEARLA, - THR_COMP_NEWLA, - THR_NEARG, - THR_COMP_NEARGA, - THR_COMP_NEWGA, - - THR_ZEROMV, - THR_ZEROG, - THR_ZEROA, - THR_COMP_ZEROLA, - THR_COMP_ZEROGA, - - THR_H_PRED, - THR_V_PRED, - THR_D135_PRED, - THR_D207_PRED, - THR_D153_PRED, - THR_D63_PRED, - THR_D117_PRED, - THR_D45_PRED, -} THR_MODES; - -typedef enum { - THR_LAST, - THR_GOLD, - THR_ALTR, - THR_COMP_LA, - THR_COMP_GA, - THR_INTRA, -} THR_MODES_SUB8X8; typedef enum { // encode_breakout is disabled. 
@@ -132,43 +82,36 @@ typedef enum { } VPX_SCALING; typedef enum { - USAGE_LOCAL_FILE_PLAYBACK = 0, - USAGE_STREAM_FROM_SERVER = 1, - USAGE_CONSTRAINED_QUALITY = 2, - USAGE_CONSTANT_QUALITY = 3, -} END_USAGE; - -typedef enum { // Good Quality Fast Encoding. The encoder balances quality with the // amount of time it takes to encode the output. (speed setting // controls how fast) - MODE_GOODQUALITY = 1, + ONE_PASS_GOOD = 1, // One Pass - Best Quality. The encoder places priority on the // quality of the output over encoding speed. The output is compressed // at the highest possible quality. This option takes the longest // amount of time to encode. (speed setting ignored) - MODE_BESTQUALITY = 2, + ONE_PASS_BEST = 2, // Two Pass - First Pass. The encoder generates a file of statistics // for use in the second encoding pass. (speed setting controls how fast) - MODE_FIRSTPASS = 3, + TWO_PASS_FIRST = 3, // Two Pass - Second Pass. The encoder uses the statistics that were // generated in the first encoding pass to create the compressed // output. (speed setting controls how fast) - MODE_SECONDPASS = 4, + TWO_PASS_SECOND_GOOD = 4, // Two Pass - Second Pass Best. The encoder uses the statistics that // were generated in the first encoding pass to create the compressed // output using the highest possible quality, and taking a // longer amount of time to encode. (speed setting ignored) - MODE_SECONDPASS_BEST = 5, + TWO_PASS_SECOND_BEST = 5, // Realtime/Live Encoding. This mode is optimized for realtime // encoding (for example, capturing a television signal or feed from // a live camera). 
(speed setting controls how fast) - MODE_REALTIME = 6, + REALTIME = 6, } MODE; typedef enum { @@ -185,7 +128,8 @@ typedef enum { AQ_MODE_COUNT // This should always be the last member of the enum } AQ_MODE; -typedef struct VP9_CONFIG { + +typedef struct VP9EncoderConfig { BITSTREAM_PROFILE profile; BIT_DEPTH bit_depth; int width; // width of data passed to the compressor @@ -195,10 +139,11 @@ typedef struct VP9_CONFIG { int noise_sensitivity; // pre processing blur: recommendation 0 int sharpness; // sharpening output: recommendation 0: - int cpu_used; + int speed; unsigned int rc_max_intra_bitrate_pct; MODE mode; + int pass; // Key Framing Operations int auto_key; // autodetect cut scenes and set the keyframes @@ -209,16 +154,17 @@ typedef struct VP9_CONFIG { // ---------------------------------------------------------------- // DATARATE CONTROL OPTIONS - END_USAGE end_usage; // vbr or cbr + // vbr, cbr, constrained quality or constant quality + enum vpx_rc_mode rc_mode; // buffer targeting aggressiveness int under_shoot_pct; int over_shoot_pct; // buffering parameters - int64_t starting_buffer_level; // in seconds - int64_t optimal_buffer_level; - int64_t maximum_buffer_size; + int64_t starting_buffer_level_ms; + int64_t optimal_buffer_level_ms; + int64_t maximum_buffer_size_ms; // Frame drop threshold. int drop_frames_water_mark; @@ -228,9 +174,13 @@ typedef struct VP9_CONFIG { int worst_allowed_q; int best_allowed_q; int cq_level; - int lossless; AQ_MODE aq_mode; // Adaptive Quantization mode + // Internal frame size scaling. + int allow_spatial_resampling; + int scaled_frame_width; + int scaled_frame_height; + // Enable feature to reduce the frame quantization every x frames. int frame_periodic_boost; @@ -246,13 +196,13 @@ typedef struct VP9_CONFIG { int ts_number_layers; // Number of temporal layers. // Bitrate allocation for spatial layers. 
int ss_target_bitrate[VPX_SS_MAX_LAYERS]; + int ss_play_alternate[VPX_SS_MAX_LAYERS]; // Bitrate allocation (CBR mode) and framerate factor, for temporal layers. int ts_target_bitrate[VPX_TS_MAX_LAYERS]; int ts_rate_decimator[VPX_TS_MAX_LAYERS]; // these parameters aren't to be used in final build don't use!!! int play_alternate; - int alt_freq; int encode_breakout; // early breakout : for video conf recommend 800 @@ -278,21 +228,30 @@ typedef struct VP9_CONFIG { struct vpx_fixed_buf two_pass_stats_in; struct vpx_codec_pkt_list *output_pkt_list; +#if CONFIG_FP_MB_STATS + struct vpx_fixed_buf firstpass_mb_stats_in; +#endif + vp8e_tuning tuning; -} VP9_CONFIG; + vp9e_tune_content content; +} VP9EncoderConfig; + +static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) { + return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0; +} + +static INLINE int is_best_mode(MODE mode) { + return mode == ONE_PASS_BEST || mode == TWO_PASS_SECOND_BEST; +} typedef struct VP9_COMP { QUANTS quants; MACROBLOCK mb; VP9_COMMON common; - VP9_CONFIG oxcf; + VP9EncoderConfig oxcf; struct lookahead_ctx *lookahead; struct lookahead_entry *source; -#if CONFIG_MULTIPLE_ARF - struct lookahead_entry *alt_ref_source[REF_FRAMES]; -#else struct lookahead_entry *alt_ref_source; -#endif struct lookahead_entry *last_source; YV12_BUFFER_CONFIG *Source; @@ -302,20 +261,17 @@ typedef struct VP9_COMP { YV12_BUFFER_CONFIG *unscaled_last_source; YV12_BUFFER_CONFIG scaled_last_source; - int key_frame_frequency; - int gold_is_last; // gold same as last frame ( short circuit gold searches) int alt_is_last; // Alt same as last ( short circuit altref search) int gold_is_alt; // don't do both alt and gold search ( just do gold). 
+ int skippable_frame; + int scaled_ref_idx[3]; int lst_fb_idx; int gld_fb_idx; int alt_fb_idx; -#if CONFIG_MULTIPLE_ARF - int alt_ref_fb_idx[REF_FRAMES - 3]; -#endif int refresh_last_frame; int refresh_golden_frame; int refresh_alt_ref_frame; @@ -333,41 +289,10 @@ typedef struct VP9_COMP { TOKENEXTRA *tok; unsigned int tok_count[4][1 << 6]; -#if CONFIG_MULTIPLE_ARF - // Position within a frame coding order (including any additional ARF frames). - unsigned int sequence_number; - // Next frame in naturally occurring order that has not yet been coded. - int next_frame_in_order; -#endif - // Ambient reconstruction err target for force key frames int ambient_err; - // Thresh_mult is used to set a threshold for the rd score. A higher value - // means that we will accept the best mode so far more often. This number - // is used in combination with the current block size, and thresh_freq_fact - // to pick a threshold. - int rd_thresh_mult[MAX_MODES]; - int rd_thresh_mult_sub8x8[MAX_REFS]; - - int rd_threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES]; - int rd_thresh_freq_fact[BLOCK_SIZES][MAX_MODES]; - int rd_thresh_sub8x8[MAX_SEGMENTS][BLOCK_SIZES][MAX_REFS]; - int rd_thresh_freq_sub8x8[BLOCK_SIZES][MAX_REFS]; - - int64_t rd_comp_pred_diff[REFERENCE_MODES]; - int64_t rd_prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES]; - int64_t rd_tx_select_diff[TX_MODES]; - // FIXME(rbultje) can this overflow? 
- int rd_tx_select_threshes[MAX_REF_FRAMES][TX_MODES]; - - int64_t rd_filter_diff[SWITCHABLE_FILTER_CONTEXTS]; - int64_t rd_filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS]; - int64_t rd_filter_cache[SWITCHABLE_FILTER_CONTEXTS]; - int64_t mask_filter_rd; - - int RDMULT; - int RDDIV; + RD_OPT rd; CODING_CONTEXT coding_context; @@ -376,30 +301,19 @@ typedef struct VP9_COMP { int active_arnr_frames; // <= cpi->oxcf.arnr_max_frames int active_arnr_strength; // <= cpi->oxcf.arnr_max_strength - double output_framerate; int64_t last_time_stamp_seen; int64_t last_end_time_stamp_seen; int64_t first_time_stamp_ever; RATE_CONTROL rc; - int cq_target_quality; - vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES]; - vp9_coeff_probs_model frame_coef_probs[TX_SIZES][PLANE_TYPES]; struct vpx_codec_pkt_list *output_pkt_list; MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS]; int mbgraph_n_frames; // number of frames filled in the above int static_mb_pct; // % forced skip mbs by segmentation - - // for real time encoding - int speed; - - int cpu_used; - int pass; - int ref_frame_flags; SPEED_FEATURES sf; @@ -410,8 +324,8 @@ typedef struct VP9_COMP { // Default value is 1. From first pass stats, encode_breakout may be disabled. ENCODE_BREAKOUT_TYPE allow_encode_breakout; - // Get threshold from external input. In real time mode, it can be - // overwritten according to encoding speed. + // Get threshold from external input. A suggested threshold is 800 for HD + // clips, and 300 for < HD clips. 
int encode_breakout; unsigned char *segmentation_map; @@ -421,13 +335,9 @@ typedef struct VP9_COMP { unsigned char *complexity_map; - unsigned char *active_map; - unsigned int active_map_enabled; - CYCLIC_REFRESH *cyclic_refresh; fractional_mv_step_fp *find_fractional_mv_step; - fractional_mv_step_comp_fp *find_fractional_mv_step_comp; vp9_full_search_fn_t full_search_sad; vp9_refining_search_fn_t refining_search_sad; vp9_diamond_search_fn_t diamond_search_sad; @@ -437,11 +347,14 @@ typedef struct VP9_COMP { uint64_t time_pick_lpf; uint64_t time_encode_sb_row; - struct twopass_rc twopass; +#if CONFIG_FP_MB_STATS + int use_fp_mb_stats; +#endif + + TWO_PASS twopass; YV12_BUFFER_CONFIG alt_ref_buffer; YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS]; - int fixed_divide[512]; #if CONFIG_INTERNAL_STATS unsigned int mode_chosen_counts[MAX_MODES]; @@ -478,11 +391,6 @@ typedef struct VP9_COMP { #endif int b_calculate_psnr; - // Per MB activity measurement - unsigned int activity_avg; - unsigned int *mb_activity_map; - int *mb_norm_activity_map; - int droppable; int dummy_packing; /* flag to indicate if packing is dummy */ @@ -496,33 +404,42 @@ typedef struct VP9_COMP { SVC svc; - int use_large_partition_rate; + // Store frame variance info in SOURCE_VAR_BASED_PARTITION search type. + diff *source_diff_var; + // The threshold used in SOURCE_VAR_BASED_PARTITION search type. + unsigned int source_var_thresh; + int frames_till_next_var_check; + + int frame_flags; + + search_site_config ss_cfg; + + int mbmode_cost[INTRA_MODES]; + unsigned inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES]; + int intra_uv_mode_cost[FRAME_TYPES][INTRA_MODES]; + int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES]; + int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS]; + + PICK_MODE_CONTEXT *leaf_tree; + PC_TREE *pc_tree; + PC_TREE *pc_root; + int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES]; -#if CONFIG_MULTIPLE_ARF - // ARF tracking variables. 
+ int multi_arf_allowed; int multi_arf_enabled; - unsigned int frame_coding_order_period; - unsigned int new_frame_coding_order_period; - int frame_coding_order[MAX_LAG_BUFFERS * 2]; - int arf_buffer_idx[MAX_LAG_BUFFERS * 3 / 2]; - int arf_weight[MAX_LAG_BUFFERS]; - int arf_buffered; - int this_frame_weight; - int max_arf_level; -#endif + int multi_arf_last_grp_enabled; -#ifdef MODE_TEST_HIT_STATS - // Debug / test stats - int64_t mode_test_hits[BLOCK_SIZES]; +#if CONFIG_VP9_TEMPORAL_DENOISING + VP9_DENOISER denoiser; #endif } VP9_COMP; void vp9_initialize_enc(); -struct VP9_COMP *vp9_create_compressor(VP9_CONFIG *oxcf); +struct VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf); void vp9_remove_compressor(VP9_COMP *cpi); -void vp9_change_config(VP9_COMP *cpi, const VP9_CONFIG *oxcf); +void vp9_change_config(VP9_COMP *cpi, const VP9EncoderConfig *oxcf); // receive a frames worth of data. caller can assume that a copy of this // frame is made and not just a copy of the pointer.. @@ -544,22 +461,12 @@ void vp9_update_reference(VP9_COMP *cpi, int ref_frame_flags); int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); -int vp9_get_reference_enc(VP9_COMP *cpi, int index, - YV12_BUFFER_CONFIG **fb); - int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); int vp9_update_entropy(VP9_COMP *cpi, int update); -int vp9_set_roimap(VP9_COMP *cpi, unsigned char *map, - unsigned int rows, unsigned int cols, - int delta_q[MAX_SEGMENTS], - int delta_lf[MAX_SEGMENTS], - unsigned int threshold[MAX_SEGMENTS]); - -int vp9_set_active_map(VP9_COMP *cpi, unsigned char *map, - unsigned int rows, unsigned int cols); +int vp9_set_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols); int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING horiz_mode, VPX_SCALING vert_mode); @@ -591,15 +498,15 @@ static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer( // Intra only frames, golden frames (except alt 
ref overlays) and // alt ref frames tend to be coded at a higher than ambient quality -static INLINE int vp9_frame_is_boosted(const VP9_COMP *cpi) { +static INLINE int frame_is_boosted(const VP9_COMP *cpi) { return frame_is_intra_only(&cpi->common) || cpi->refresh_alt_ref_frame || - (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref); + (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref) || + vp9_is_upper_layer_key_frame(cpi); } static INLINE int get_token_alloc(int mb_rows, int mb_cols) { - // TODO(JBB): make this work for alpha channel and double check we can't - // exceed this token count if we have a 32x32 transform crossing a boundary - // at a multiple of 16. + // TODO(JBB): double check we can't exceed this token count if we have a + // 32x32 transform crossing a boundary at a multiple of 16. // mb_rows, cols are in units of 16 pixels. We assume 3 planes all at full // resolution. We assume up to 1 token per pixel, and then allow // a head room of 4. @@ -614,10 +521,29 @@ void vp9_scale_references(VP9_COMP *cpi); void vp9_update_reference_frames(VP9_COMP *cpi); -extern const int q_trans[]; - int64_t vp9_rescale(int64_t val, int64_t num, int denom); +void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv); + +YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm, + YV12_BUFFER_CONFIG *unscaled, + YV12_BUFFER_CONFIG *scaled); + +void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags); + +static INLINE int is_spatial_svc(const struct VP9_COMP *const cpi) { + return cpi->use_svc && + cpi->svc.number_temporal_layers == 1 && + cpi->svc.number_spatial_layers > 1; +} + +static INLINE int is_altref_enabled(const VP9_COMP *const cpi) { + return cpi->oxcf.mode != REALTIME && cpi->oxcf.lag_in_frames > 0 && + (cpi->oxcf.play_alternate && + (!is_spatial_svc(cpi) || + cpi->oxcf.ss_play_alternate[cpi->svc.spatial_layer_id])); +} + static INLINE void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd, MV_REFERENCE_FRAME 
ref0, MV_REFERENCE_FRAME ref1) { @@ -627,8 +553,12 @@ static INLINE void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd, : 0]; } +static INLINE int get_chessboard_index(const int frame_index) { + return frame_index & 0x1; +} + #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_ONYX_INT_H_ +#endif // VP9_ENCODER_VP9_ENCODER_H_ diff --git a/libvpx/vp9/encoder/vp9_extend.c b/libvpx/vp9/encoder/vp9_extend.c index dcbb5ac35..e8517c889 100644 --- a/libvpx/vp9/encoder/vp9_extend.c +++ b/libvpx/vp9/encoder/vp9_extend.c @@ -75,18 +75,6 @@ void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, const int eb_uv = eb_y >> uv_height_subsampling; const int er_uv = er_y >> uv_width_subsampling; -#if CONFIG_ALPHA - const int et_a = dst->border >> (dst->alpha_height != dst->y_height); - const int el_a = dst->border >> (dst->alpha_width != dst->y_width); - const int eb_a = et_a + dst->alpha_height - src->alpha_height; - const int er_a = el_a + dst->alpha_width - src->alpha_width; - - copy_and_extend_plane(src->alpha_buffer, src->alpha_stride, - dst->alpha_buffer, dst->alpha_stride, - src->alpha_width, src->alpha_height, - et_a, el_a, eb_a, er_a); -#endif - copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src->y_width, src->y_height, diff --git a/libvpx/vp9/encoder/vp9_firstpass.c b/libvpx/vp9/encoder/vp9_firstpass.c index db32ef8c9..295e43777 100644 --- a/libvpx/vp9/encoder/vp9_firstpass.c +++ b/libvpx/vp9/encoder/vp9_firstpass.c @@ -22,19 +22,17 @@ #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_reconinter.h" // vp9_setup_dst_planes() #include "vp9/common/vp9_systemdependent.h" - #include "vp9/encoder/vp9_aq_variance.h" #include "vp9/encoder/vp9_block.h" #include "vp9/encoder/vp9_encodeframe.h" #include "vp9/encoder/vp9_encodemb.h" #include "vp9/encoder/vp9_encodemv.h" +#include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_extend.h" #include "vp9/encoder/vp9_firstpass.h" #include 
"vp9/encoder/vp9_mcomp.h" -#include "vp9/encoder/vp9_onyx_int.h" #include "vp9/encoder/vp9_quantize.h" -#include "vp9/encoder/vp9_ratectrl.h" -#include "vp9/encoder/vp9_rdopt.h" +#include "vp9/encoder/vp9_rd.h" #include "vp9/encoder/vp9_variance.h" #define OUTPUT_FPF 0 @@ -46,6 +44,9 @@ #define GF_RMAX 96.0 #define ERR_DIVISOR 150.0 #define MIN_DECAY_FACTOR 0.1 +#define SVC_FACTOR_PT_LOW 0.45 +#define FACTOR_PT_LOW 0.5 +#define FACTOR_PT_HIGH 0.9 #define KF_MB_INTRA_MIN 150 #define GF_MB_INTRA_MIN 100 @@ -53,15 +54,7 @@ #define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x) - 0.000001 : (x) + 0.000001) #define MIN_KF_BOOST 300 - -#if CONFIG_MULTIPLE_ARF -// Set MIN_GF_INTERVAL to 1 for the full decomposition. -#define MIN_GF_INTERVAL 2 -#else -#define MIN_GF_INTERVAL 4 -#endif - -#define DISABLE_RC_LONG_TERM_MEM +#define MIN_GF_INTERVAL 4 static void swap_yv12(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) { YV12_BUFFER_CONFIG temp = *a; @@ -78,12 +71,12 @@ static int gfboost_qadjust(int qindex) { // Resets the first pass file to the given position using a relative seek from // the current position. -static void reset_fpf_position(struct twopass_rc *p, +static void reset_fpf_position(TWO_PASS *p, const FIRSTPASS_STATS *position) { p->stats_in = position; } -static int lookup_next_frame_stats(const struct twopass_rc *p, +static int lookup_next_frame_stats(const TWO_PASS *p, FIRSTPASS_STATS *next_frame) { if (p->stats_in >= p->stats_in_end) return EOF; @@ -94,24 +87,16 @@ static int lookup_next_frame_stats(const struct twopass_rc *p, // Read frame stats at an offset from the current position. -static int read_frame_stats(const struct twopass_rc *p, - FIRSTPASS_STATS *frame_stats, int offset) { - const FIRSTPASS_STATS *fps_ptr = p->stats_in; - - // Check legality of offset. 
- if (offset >= 0) { - if (&fps_ptr[offset] >= p->stats_in_end) - return EOF; - } else if (offset < 0) { - if (&fps_ptr[offset] < p->stats_in_start) - return EOF; +static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, int offset) { + if ((offset >= 0 && p->stats_in + offset >= p->stats_in_end) || + (offset < 0 && p->stats_in + offset < p->stats_in_start)) { + return NULL; } - *frame_stats = fps_ptr[offset]; - return 1; + return &p->stats_in[offset]; } -static int input_stats(struct twopass_rc *p, FIRSTPASS_STATS *fps) { +static int input_stats(TWO_PASS *p, FIRSTPASS_STATS *fps) { if (p->stats_in >= p->stats_in_end) return EOF; @@ -134,14 +119,13 @@ static void output_stats(FIRSTPASS_STATS *stats, FILE *fpfile; fpfile = fopen("firstpass.stt", "a"); - fprintf(fpfile, "%12.0f %12.0f %12.0f %12.0f %12.0f %12.4f %12.4f" + fprintf(fpfile, "%12.0f %12.0f %12.0f %12.0f %12.4f %12.4f" "%12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f" "%12.0f %12.0f %12.4f %12.0f %12.0f %12.4f\n", stats->frame, stats->intra_error, stats->coded_error, stats->sr_coded_error, - stats->ssim_weighted_pred_err, stats->pcnt_inter, stats->pcnt_motion, stats->pcnt_second_ref, @@ -161,12 +145,22 @@ static void output_stats(FIRSTPASS_STATS *stats, #endif } +#if CONFIG_FP_MB_STATS +static void output_fpmb_stats(uint8_t *this_frame_mb_stats, VP9_COMMON *cm, + struct vpx_codec_pkt_list *pktlist) { + struct vpx_codec_cx_pkt pkt; + pkt.kind = VPX_CODEC_FPMB_STATS_PKT; + pkt.data.firstpass_mb_stats.buf = this_frame_mb_stats; + pkt.data.firstpass_mb_stats.sz = cm->MBs * sizeof(uint8_t); + vpx_codec_pkt_list_add(pktlist, &pkt); +} +#endif + static void zero_stats(FIRSTPASS_STATS *section) { section->frame = 0.0; section->intra_error = 0.0; section->coded_error = 0.0; section->sr_coded_error = 0.0; - section->ssim_weighted_pred_err = 0.0; section->pcnt_inter = 0.0; section->pcnt_motion = 0.0; section->pcnt_second_ref = 0.0; @@ -191,7 +185,6 @@ static void accumulate_stats(FIRSTPASS_STATS *section, 
section->intra_error += frame->intra_error; section->coded_error += frame->coded_error; section->sr_coded_error += frame->sr_coded_error; - section->ssim_weighted_pred_err += frame->ssim_weighted_pred_err; section->pcnt_inter += frame->pcnt_inter; section->pcnt_motion += frame->pcnt_motion; section->pcnt_second_ref += frame->pcnt_second_ref; @@ -214,7 +207,6 @@ static void subtract_stats(FIRSTPASS_STATS *section, section->intra_error -= frame->intra_error; section->coded_error -= frame->coded_error; section->sr_coded_error -= frame->sr_coded_error; - section->ssim_weighted_pred_err -= frame->ssim_weighted_pred_err; section->pcnt_inter -= frame->pcnt_inter; section->pcnt_motion -= frame->pcnt_motion; section->pcnt_second_ref -= frame->pcnt_second_ref; @@ -231,113 +223,25 @@ static void subtract_stats(FIRSTPASS_STATS *section, section->duration -= frame->duration; } -static void avg_stats(FIRSTPASS_STATS *section) { - if (section->count < 1.0) - return; - - section->intra_error /= section->count; - section->coded_error /= section->count; - section->sr_coded_error /= section->count; - section->ssim_weighted_pred_err /= section->count; - section->pcnt_inter /= section->count; - section->pcnt_second_ref /= section->count; - section->pcnt_neutral /= section->count; - section->pcnt_motion /= section->count; - section->MVr /= section->count; - section->mvr_abs /= section->count; - section->MVc /= section->count; - section->mvc_abs /= section->count; - section->MVrv /= section->count; - section->MVcv /= section->count; - section->mv_in_out_count /= section->count; - section->duration /= section->count; -} // Calculate a modified Error used in distributing bits between easier and // harder frames. 
-static double calculate_modified_err(const VP9_COMP *cpi, +static double calculate_modified_err(const TWO_PASS *twopass, + const VP9EncoderConfig *oxcf, const FIRSTPASS_STATS *this_frame) { - const struct twopass_rc *twopass = &cpi->twopass; - const SVC *const svc = &cpi->svc; - const FIRSTPASS_STATS *stats; - double av_err; - double modified_error; - - if (svc->number_spatial_layers > 1 && - svc->number_temporal_layers == 1) { - twopass = &svc->layer_context[svc->spatial_layer_id].twopass; - } - - stats = &twopass->total_stats; - av_err = stats->ssim_weighted_pred_err / stats->count; - modified_error = av_err * pow(this_frame->ssim_weighted_pred_err / - DOUBLE_DIVIDE_CHECK(av_err), - cpi->oxcf.two_pass_vbrbias / 100.0); - + const FIRSTPASS_STATS *const stats = &twopass->total_stats; + const double av_err = stats->coded_error / stats->count; + const double modified_error = av_err * + pow(this_frame->coded_error / DOUBLE_DIVIDE_CHECK(av_err), + oxcf->two_pass_vbrbias / 100.0); return fclamp(modified_error, twopass->modified_error_min, twopass->modified_error_max); } -static const double weight_table[256] = { - 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, - 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, - 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, - 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, - 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.031250, 0.062500, - 0.093750, 0.125000, 0.156250, 0.187500, 0.218750, 0.250000, 0.281250, - 0.312500, 0.343750, 0.375000, 0.406250, 0.437500, 0.468750, 0.500000, - 0.531250, 0.562500, 0.593750, 0.625000, 0.656250, 0.687500, 0.718750, - 0.750000, 0.781250, 0.812500, 0.843750, 0.875000, 0.906250, 0.937500, - 0.968750, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 
1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000 -}; - -static double simple_weight(const YV12_BUFFER_CONFIG *buf) { - int i, j; - double sum = 0.0; - const int w = buf->y_crop_width; - const int h = buf->y_crop_height; - const uint8_t *row = buf->y_buffer; - - for (i = 0; i < h; 
++i) { - const uint8_t *pixel = row; - for (j = 0; j < w; ++j) - sum += weight_table[*pixel++]; - row += buf->y_stride; - } - - return MAX(0.1, sum / (w * h)); -} - // This function returns the maximum target rate per frame. -static int frame_max_bits(const RATE_CONTROL *rc, const VP9_CONFIG *oxcf) { - int64_t max_bits = ((int64_t)rc->av_per_frame_bandwidth * +static int frame_max_bits(const RATE_CONTROL *rc, + const VP9EncoderConfig *oxcf) { + int64_t max_bits = ((int64_t)rc->avg_frame_bandwidth * (int64_t)oxcf->two_pass_vbrmax_section) / 100; if (max_bits < 0) max_bits = 0; @@ -352,7 +256,7 @@ void vp9_init_first_pass(VP9_COMP *cpi) { } void vp9_end_first_pass(VP9_COMP *cpi) { - if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) { + if (is_spatial_svc(cpi)) { int i; for (i = 0; i < cpi->svc.number_spatial_layers; ++i) { output_stats(&cpi->svc.layer_context[i].twopass.total_stats, @@ -376,37 +280,40 @@ static vp9_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) { } } -static unsigned int zz_motion_search(const MACROBLOCK *x) { - const MACROBLOCKD *const xd = &x->e_mbd; - const uint8_t *const src = x->plane[0].src.buf; - const int src_stride = x->plane[0].src.stride; - const uint8_t *const ref = xd->plane[0].pre[0].buf; - const int ref_stride = xd->plane[0].pre[0].stride; +static unsigned int get_prediction_error(BLOCK_SIZE bsize, + const struct buf_2d *src, + const struct buf_2d *ref) { unsigned int sse; - vp9_variance_fn_t fn = get_block_variance_fn(xd->mi[0]->mbmi.sb_type); - fn(src, src_stride, ref, ref_stride, &sse); + const vp9_variance_fn_t fn = get_block_variance_fn(bsize); + fn(src->buf, src->stride, ref->buf, ref->stride, &sse); return sse; } +// Refine the motion search range according to the frame dimension +// for first pass test. 
+static int get_search_range(const VP9_COMMON *cm) { + int sr = 0; + const int dim = MIN(cm->width, cm->height); + + while ((dim << sr) < MAX_FULL_PEL_VAL) + ++sr; + return sr; +} + static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, const MV *ref_mv, MV *best_mv, int *best_motion_err) { MACROBLOCKD *const xd = &x->e_mbd; MV tmp_mv = {0, 0}; MV ref_mv_full = {ref_mv->row >> 3, ref_mv->col >> 3}; - int num00, tmp_err, n, sr = 0; - int step_param = 3; - int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; + int num00, tmp_err, n; const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize]; - int new_mv_mode_penalty = 256; - const int quart_frm = MIN(cpi->common.width, cpi->common.height); - - // Refine the motion search range according to the frame dimension - // for first pass test. - while ((quart_frm << sr) < MAX_FULL_PEL_VAL) - ++sr; + const int new_mv_mode_penalty = 256; + int step_param = 3; + int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; + const int sr = get_search_range(&cpi->common); step_param += sr; further_steps -= sr; @@ -414,11 +321,9 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, v_fn_ptr.vf = get_block_variance_fn(bsize); // Center the initial step/diamond search on best mv. - tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv, + tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv, step_param, - x->sadperbit16, &num00, &v_fn_ptr, - x->nmvjointcost, - x->mvcost, ref_mv); + x->sadperbit16, &num00, &v_fn_ptr, ref_mv); if (tmp_err < INT_MAX) tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1); if (tmp_err < INT_MAX - new_mv_mode_penalty) @@ -426,8 +331,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, if (tmp_err < *best_motion_err) { *best_motion_err = tmp_err; - best_mv->row = tmp_mv.row; - best_mv->col = tmp_mv.col; + *best_mv = tmp_mv; } // Carry out further step/diamond searches as necessary. 
@@ -440,11 +344,9 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, if (num00) { --num00; } else { - tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv, + tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv, step_param + n, x->sadperbit16, - &num00, &v_fn_ptr, - x->nmvjointcost, - x->mvcost, ref_mv); + &num00, &v_fn_ptr, ref_mv); if (tmp_err < INT_MAX) tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1); if (tmp_err < INT_MAX - new_mv_mode_penalty) @@ -452,8 +354,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, if (tmp_err < *best_motion_err) { *best_motion_err = tmp_err; - best_mv->row = tmp_mv.row; - best_mv->col = tmp_mv.col; + *best_mv = tmp_mv; } } } @@ -469,6 +370,32 @@ static BLOCK_SIZE get_bsize(const VP9_COMMON *cm, int mb_row, int mb_col) { } } +static int find_fp_qindex() { + int i; + + for (i = 0; i < QINDEX_RANGE; ++i) + if (vp9_convert_qindex_to_q(i) >= 30.0) + break; + + if (i == QINDEX_RANGE) + i--; + + return i; +} + +static void set_first_pass_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + if (!cpi->refresh_alt_ref_frame && + (cm->current_video_frame == 0 || + (cpi->frame_flags & FRAMEFLAGS_KEY))) { + cm->frame_type = KEY_FRAME; + } else { + cm->frame_type = INTER_FRAME; + } + // Do not use periodic key frames. 
+ cpi->rc.frames_to_key = INT_MAX; +} + void vp9_first_pass(VP9_COMP *cpi) { int mb_row, mb_col; MACROBLOCK *const x = &cpi->mb; @@ -477,7 +404,7 @@ void vp9_first_pass(VP9_COMP *cpi) { TileInfo tile; struct macroblock_plane *const p = x->plane; struct macroblockd_plane *const pd = xd->plane; - const PICK_MODE_CONTEXT *ctx = &x->sb64_context; + const PICK_MODE_CONTEXT *ctx = &cpi->pc_root->none; int i; int recon_yoffset, recon_uvoffset; @@ -502,50 +429,71 @@ void vp9_first_pass(VP9_COMP *cpi) { int new_mv_count = 0; int sum_in_vectors = 0; uint32_t lastmv_as_int = 0; - struct twopass_rc *twopass = &cpi->twopass; + TWO_PASS *twopass = &cpi->twopass; const MV zero_mv = {0, 0}; const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12; +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + vp9_zero_array(cpi->twopass.frame_mb_stats_buf, cm->MBs); + } +#endif + vp9_clear_system_state(); - if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) { + set_first_pass_params(cpi); + vp9_set_quantizer(cm, find_fp_qindex()); + + if (is_spatial_svc(cpi)) { MV_REFERENCE_FRAME ref_frame = LAST_FRAME; const YV12_BUFFER_CONFIG *scaled_ref_buf = NULL; twopass = &cpi->svc.layer_context[cpi->svc.spatial_layer_id].twopass; + if (cpi->common.current_video_frame == 0) { + cpi->ref_frame_flags = 0; + } else { + LAYER_CONTEXT *lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id]; + if (lc->current_video_frame_in_layer == 0) + cpi->ref_frame_flags = VP9_GOLD_FLAG; + else + cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; + } + vp9_scale_references(cpi); // Use either last frame or alt frame for motion search. 
if (cpi->ref_frame_flags & VP9_LAST_FLAG) { scaled_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME); ref_frame = LAST_FRAME; - } else if (cpi->ref_frame_flags & VP9_ALT_FLAG) { - scaled_ref_buf = vp9_get_scaled_ref_frame(cpi, ALTREF_FRAME); - ref_frame = ALTREF_FRAME; + } else if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { + scaled_ref_buf = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME); + ref_frame = GOLDEN_FRAME; } - if (scaled_ref_buf != NULL) { - // Update the stride since we are using scaled reference buffer + if (scaled_ref_buf != NULL) first_ref_buf = scaled_ref_buf; - recon_y_stride = first_ref_buf->y_stride; - recon_uv_stride = first_ref_buf->uv_stride; - uv_mb_height = 16 >> (first_ref_buf->y_height > first_ref_buf->uv_height); - } + + recon_y_stride = new_yv12->y_stride; + recon_uv_stride = new_yv12->uv_stride; + uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height); // Disable golden frame for svc first pass for now. gld_yv12 = NULL; set_ref_ptrs(cm, xd, ref_frame, NONE); + + cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source, + &cpi->scaled_source); } + vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); + vp9_setup_src_planes(x, cpi->Source, 0, 0); vp9_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL); - vp9_setup_dst_planes(xd, new_yv12, 0, 0); + vp9_setup_dst_planes(xd->plane, new_yv12, 0, 0); xd->mi = cm->mi_grid_visible; xd->mi[0] = cm->mi; - vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); - vp9_frame_init_quantizer(cpi); for (i = 0; i < MAX_MB_PLANE; ++i) { @@ -583,6 +531,9 @@ void vp9_first_pass(VP9_COMP *cpi) { const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); double error_weight = 1.0; const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col); +#if CONFIG_FP_MB_STATS + const int mb_index = mb_row * cm->mb_cols + mb_col; +#endif vp9_clear_system_state(); @@ -603,7 +554,13 @@ void vp9_first_pass(VP9_COMP *cpi) { } // Do intra 16x16 prediction. 
- this_error = vp9_encode_intra(x, use_dc_pred); + x->skip_encode = 0; + xd->mi[0]->mbmi.mode = DC_PRED; + xd->mi[0]->mbmi.tx_size = use_dc_pred ? + (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4; + vp9_encode_intra_block_plane(x, bsize, 0); + this_error = vp9_get_mb_ss(x->plane[0].src_diff); + if (cpi->oxcf.aq_mode == VARIANCE_AQ) { vp9_clear_system_state(); this_error = (int)(this_error * error_weight); @@ -621,6 +578,13 @@ void vp9_first_pass(VP9_COMP *cpi) { // Accumulate the intra error. intra_error += (int64_t)this_error; +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + // initialization + cpi->twopass.frame_mb_stats_buf[mb_index] = 0; + } +#endif + // Set up limit values for motion vectors to prevent them extending // outside the UMV borders. x->mv_col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16); @@ -628,77 +592,109 @@ void vp9_first_pass(VP9_COMP *cpi) { // Other than for the first frame do a motion search. if (cm->current_video_frame > 0) { - int tmp_err, motion_error; + int tmp_err, motion_error, raw_motion_error; int_mv mv, tmp_mv; + struct buf_2d unscaled_last_source_buf_2d; xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; - motion_error = zz_motion_search(x); + motion_error = get_prediction_error(bsize, &x->plane[0].src, + &xd->plane[0].pre[0]); // Assume 0,0 motion with no mv overhead. mv.as_int = tmp_mv.as_int = 0; - // Test last reference frame using the previous best mv as the - // starting point (best reference) for the search. - first_pass_motion_search(cpi, x, &best_ref_mv.as_mv, &mv.as_mv, - &motion_error); - if (cpi->oxcf.aq_mode == VARIANCE_AQ) { - vp9_clear_system_state(); - motion_error = (int)(motion_error * error_weight); - } - - // If the current best reference mv is not centered on 0,0 then do a 0,0 - // based search as well. 
- if (best_ref_mv.as_int) { - tmp_err = INT_MAX; - first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv, - &tmp_err); + // Compute the motion error of the 0,0 motion using the last source + // frame as the reference. Skip the further motion search on + // reconstructed frame if this error is small. + unscaled_last_source_buf_2d.buf = + cpi->unscaled_last_source->y_buffer + recon_yoffset; + unscaled_last_source_buf_2d.stride = + cpi->unscaled_last_source->y_stride; + raw_motion_error = get_prediction_error(bsize, &x->plane[0].src, + &unscaled_last_source_buf_2d); + + // TODO(pengchong): Replace the hard-coded threshold + if (raw_motion_error > 25 || is_spatial_svc(cpi)) { + // Test last reference frame using the previous best mv as the + // starting point (best reference) for the search. + first_pass_motion_search(cpi, x, &best_ref_mv.as_mv, &mv.as_mv, + &motion_error); if (cpi->oxcf.aq_mode == VARIANCE_AQ) { vp9_clear_system_state(); - tmp_err = (int)(tmp_err * error_weight); + motion_error = (int)(motion_error * error_weight); } - if (tmp_err < motion_error) { - motion_error = tmp_err; - mv.as_int = tmp_mv.as_int; + // If the current best reference mv is not centered on 0,0 then do a + // 0,0 based search as well. + if (best_ref_mv.as_int) { + tmp_err = INT_MAX; + first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv, &tmp_err); + if (cpi->oxcf.aq_mode == VARIANCE_AQ) { + vp9_clear_system_state(); + tmp_err = (int)(tmp_err * error_weight); + } + + if (tmp_err < motion_error) { + motion_error = tmp_err; + mv.as_int = tmp_mv.as_int; + } } - } - // Search in an older reference frame. - if (cm->current_video_frame > 1 && gld_yv12 != NULL) { - // Assume 0,0 motion with no mv overhead. - int gf_motion_error; + // Search in an older reference frame. + if (cm->current_video_frame > 1 && gld_yv12 != NULL) { + // Assume 0,0 motion with no mv overhead. 
+ int gf_motion_error; - xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset; - gf_motion_error = zz_motion_search(x); + xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset; + gf_motion_error = get_prediction_error(bsize, &x->plane[0].src, + &xd->plane[0].pre[0]); - first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv, - &gf_motion_error); - if (cpi->oxcf.aq_mode == VARIANCE_AQ) { - vp9_clear_system_state(); - gf_motion_error = (int)(gf_motion_error * error_weight); - } + first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv, + &gf_motion_error); + if (cpi->oxcf.aq_mode == VARIANCE_AQ) { + vp9_clear_system_state(); + gf_motion_error = (int)(gf_motion_error * error_weight); + } - if (gf_motion_error < motion_error && gf_motion_error < this_error) - ++second_ref_count; - - // Reset to last frame as reference buffer. - xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; - xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset; - xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset; - - // In accumulating a score for the older reference frame take the - // best of the motion predicted score and the intra coded error - // (just as will be done for) accumulation of "coded_error" for - // the last frame. - if (gf_motion_error < this_error) - sr_coded_error += gf_motion_error; - else - sr_coded_error += this_error; + if (gf_motion_error < motion_error && gf_motion_error < this_error) + ++second_ref_count; + + // Reset to last frame as reference buffer. + xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; + xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset; + xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset; + + // In accumulating a score for the older reference frame take the + // best of the motion predicted score and the intra coded error + // (just as will be done for) accumulation of "coded_error" for + // the last frame. 
+ if (gf_motion_error < this_error) + sr_coded_error += gf_motion_error; + else + sr_coded_error += this_error; + } else { + sr_coded_error += motion_error; + } } else { sr_coded_error += motion_error; } + // Start by assuming that intra mode is best. best_ref_mv.as_int = 0; +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + // intra predication statistics + cpi->twopass.frame_mb_stats_buf[mb_index] = 0; + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK; + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK; + if (this_error > FPMB_ERROR_LARGE_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK; + } else if (this_error < FPMB_ERROR_SMALL_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK; + } + } +#endif + if (motion_error <= this_error) { // Keep a count of cases where the inter and intra were very close // and very low. This helps with scene cut detection for example in @@ -727,9 +723,52 @@ void vp9_first_pass(VP9_COMP *cpi) { best_ref_mv.as_int = mv.as_int; +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + // inter predication statistics + cpi->twopass.frame_mb_stats_buf[mb_index] = 0; + cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK; + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK; + if (this_error > FPMB_ERROR_LARGE_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_ERROR_LARGE_MASK; + } else if (this_error < FPMB_ERROR_SMALL_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_ERROR_SMALL_MASK; + } + } +#endif + if (mv.as_int) { ++mvcount; +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + cpi->twopass.frame_mb_stats_buf[mb_index] &= + ~FPMB_MOTION_ZERO_MASK; + // check estimated motion direction + if (mv.as_mv.col > 0 && mv.as_mv.col >= abs(mv.as_mv.row)) { + // right direction + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_MOTION_RIGHT_MASK; + } else if (mv.as_mv.row < 0 && + abs(mv.as_mv.row) >= 
abs(mv.as_mv.col)) { + // up direction + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_MOTION_UP_MASK; + } else if (mv.as_mv.col < 0 && + abs(mv.as_mv.col) >= abs(mv.as_mv.row)) { + // left direction + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_MOTION_LEFT_MASK; + } else { + // down direction + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_MOTION_DOWN_MASK; + } + } +#endif + // Non-zero vector, was it different from the last non zero vector? if (mv.as_int != lastmv_as_int) ++new_mv_count; @@ -795,7 +834,6 @@ void vp9_first_pass(VP9_COMP *cpi) { fps.intra_error = (double)(intra_error >> 8); fps.coded_error = (double)(coded_error >> 8); fps.sr_coded_error = (double)(sr_coded_error >> 8); - fps.ssim_weighted_pred_err = fps.coded_error * simple_weight(cpi->Source); fps.count = 1.0; fps.pcnt_inter = (double)intercount / cm->MBs; fps.pcnt_second_ref = (double)second_ref_count / cm->MBs; @@ -832,6 +870,12 @@ void vp9_first_pass(VP9_COMP *cpi) { twopass->this_frame_stats = fps; output_stats(&twopass->this_frame_stats, cpi->output_pkt_list); accumulate_stats(&twopass->total_stats, &fps); + +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + output_fpmb_stats(twopass->frame_mb_stats_buf, cm, cpi->output_pkt_list); + } +#endif } // Copy the previous Last Frame back into gf and and arf buffers if @@ -849,15 +893,15 @@ void vp9_first_pass(VP9_COMP *cpi) { ++twopass->sr_update_lag; } - if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) { + vp9_extend_frame_borders(new_yv12); + + if (is_spatial_svc(cpi)) { vp9_update_reference_frames(cpi); } else { // Swap frame pointers so last frame refers to the frame we just compressed. swap_yv12(lst_yv12, new_yv12); } - vp9_extend_frame_borders(lst_yv12); - // Special case for the first frame. Copy into the GF buffer as a second // reference. 
if (cm->current_video_frame == 0 && gld_yv12 != NULL) { @@ -881,6 +925,8 @@ void vp9_first_pass(VP9_COMP *cpi) { } ++cm->current_video_frame; + if (cpi->use_svc) + vp9_inc_frame_in_layer(&cpi->svc); } static double calc_correction_factor(double err_per_mb, @@ -901,56 +947,57 @@ static double calc_correction_factor(double err_per_mb, return fclamp(pow(error_term, power_term), 0.05, 5.0); } -int vp9_twopass_worst_quality(VP9_COMP *cpi, FIRSTPASS_STATS *fpstats, - int section_target_bandwitdh) { - int q; - const int num_mbs = cpi->common.MBs; - int target_norm_bits_per_mb; +static int get_twopass_worst_quality(const VP9_COMP *cpi, + const FIRSTPASS_STATS *stats, + int section_target_bandwidth) { const RATE_CONTROL *const rc = &cpi->rc; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; - const double section_err = fpstats->coded_error / fpstats->count; - const double err_per_mb = section_err / num_mbs; - const double speed_term = 1.0 + ((double)cpi->speed * 0.04); - - if (section_target_bandwitdh <= 0) - return rc->worst_quality; // Highest value allowed - - target_norm_bits_per_mb = - ((uint64_t)section_target_bandwitdh << BPER_MB_NORMBITS) / num_mbs; + if (section_target_bandwidth <= 0) { + return rc->worst_quality; // Highest value allowed + } else { + const int num_mbs = cpi->common.MBs; + const double section_err = stats->coded_error / stats->count; + const double err_per_mb = section_err / num_mbs; + const double speed_term = 1.0 + 0.04 * oxcf->speed; + const int target_norm_bits_per_mb = ((uint64_t)section_target_bandwidth << + BPER_MB_NORMBITS) / num_mbs; + int q; + int is_svc_upper_layer = 0; + if (is_spatial_svc(cpi) && cpi->svc.spatial_layer_id > 0) + is_svc_upper_layer = 1; + + // Try and pick a max Q that will be high enough to encode the + // content at the given rate. + for (q = rc->best_quality; q < rc->worst_quality; ++q) { + const double factor = + calc_correction_factor(err_per_mb, ERR_DIVISOR, + is_svc_upper_layer ? 
SVC_FACTOR_PT_LOW : + FACTOR_PT_LOW, FACTOR_PT_HIGH, q); + const int bits_per_mb = vp9_rc_bits_per_mb(INTER_FRAME, q, + factor * speed_term); + if (bits_per_mb <= target_norm_bits_per_mb) + break; + } - // Try and pick a max Q that will be high enough to encode the - // content at the given rate. - for (q = rc->best_quality; q < rc->worst_quality; ++q) { - const double err_correction_factor = calc_correction_factor(err_per_mb, - ERR_DIVISOR, 0.5, 0.90, q); - const int bits_per_mb_at_this_q = - vp9_rc_bits_per_mb(INTER_FRAME, q, (err_correction_factor * speed_term)); - if (bits_per_mb_at_this_q <= target_norm_bits_per_mb) - break; + // Restriction on active max q for constrained quality mode. + if (cpi->oxcf.rc_mode == VPX_CQ) + q = MAX(q, oxcf->cq_level); + return q; } - - // Restriction on active max q for constrained quality mode. - if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) - q = MAX(q, cpi->cq_target_quality); - - return q; } extern void vp9_new_framerate(VP9_COMP *cpi, double framerate); void vp9_init_second_pass(VP9_COMP *cpi) { SVC *const svc = &cpi->svc; - FIRSTPASS_STATS this_frame; - const FIRSTPASS_STATS *start_pos; - struct twopass_rc *twopass = &cpi->twopass; - const VP9_CONFIG *const oxcf = &cpi->oxcf; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; const int is_spatial_svc = (svc->number_spatial_layers > 1) && (svc->number_temporal_layers == 1); + TWO_PASS *const twopass = is_spatial_svc ? 
+ &svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass; double frame_rate; - - if (is_spatial_svc) { - twopass = &svc->layer_context[svc->spatial_layer_id].twopass; - } + FIRSTPASS_STATS *stats; zero_stats(&twopass->total_stats); zero_stats(&twopass->total_left_stats); @@ -958,11 +1005,12 @@ void vp9_init_second_pass(VP9_COMP *cpi) { if (!twopass->stats_in_end) return; - twopass->total_stats = *twopass->stats_in_end; - twopass->total_left_stats = twopass->total_stats; + stats = &twopass->total_stats; + + *stats = *twopass->stats_in_end; + twopass->total_left_stats = *stats; - frame_rate = 10000000.0 * twopass->total_stats.count / - twopass->total_stats.duration; + frame_rate = 10000000.0 * stats->count / stats->duration; // Each frame can have a different duration, as the frame rate in the source // isn't guaranteed to be constant. The frame rate prior to the first frame // encoded in the second pass is a guess. However, the sum duration is not. @@ -971,18 +1019,15 @@ void vp9_init_second_pass(VP9_COMP *cpi) { if (is_spatial_svc) { vp9_update_spatial_layer_framerate(cpi, frame_rate); - twopass->bits_left = - (int64_t)(twopass->total_stats.duration * + twopass->bits_left = (int64_t)(stats->duration * svc->layer_context[svc->spatial_layer_id].target_bandwidth / 10000000.0); } else { vp9_new_framerate(cpi, frame_rate); - twopass->bits_left = (int64_t)(twopass->total_stats.duration * - oxcf->target_bandwidth / 10000000.0); + twopass->bits_left = (int64_t)(stats->duration * oxcf->target_bandwidth / + 10000000.0); } - cpi->output_framerate = oxcf->framerate; - // Calculate a minimum intra value to be used in determining the IIratio // scores used in the second pass. We have this minimum to make sure // that clips that are static but "low complexity" in the intra domain @@ -997,46 +1042,26 @@ void vp9_init_second_pass(VP9_COMP *cpi) { // This variable monitors how far behind the second ref update is lagging. 
twopass->sr_update_lag = 1; - // Scan the first pass file and calculate an average Intra / Inter error - // score ratio for the sequence. - { - double sum_iiratio = 0.0; - start_pos = twopass->stats_in; - - while (input_stats(twopass, &this_frame) != EOF) { - const double iiratio = this_frame.intra_error / - DOUBLE_DIVIDE_CHECK(this_frame.coded_error); - sum_iiratio += fclamp(iiratio, 1.0, 20.0); - } - - twopass->avg_iiratio = sum_iiratio / - DOUBLE_DIVIDE_CHECK((double)twopass->total_stats.count); - - reset_fpf_position(twopass, start_pos); - } - // Scan the first pass file and calculate a modified total error based upon // the bias/power function used to allocate bits. { - double av_error = twopass->total_stats.ssim_weighted_pred_err / - DOUBLE_DIVIDE_CHECK(twopass->total_stats.count); - - start_pos = twopass->stats_in; - - twopass->modified_error_total = 0.0; - twopass->modified_error_min = - (av_error * oxcf->two_pass_vbrmin_section) / 100; - twopass->modified_error_max = - (av_error * oxcf->two_pass_vbrmax_section) / 100; - - while (input_stats(twopass, &this_frame) != EOF) { - twopass->modified_error_total += - calculate_modified_err(cpi, &this_frame); + const double avg_error = stats->coded_error / + DOUBLE_DIVIDE_CHECK(stats->count); + const FIRSTPASS_STATS *s = twopass->stats_in; + double modified_error_total = 0.0; + twopass->modified_error_min = (avg_error * + oxcf->two_pass_vbrmin_section) / 100; + twopass->modified_error_max = (avg_error * + oxcf->two_pass_vbrmax_section) / 100; + while (s < twopass->stats_in_end) { + modified_error_total += calculate_modified_err(twopass, oxcf, s); + ++s; } - twopass->modified_error_left = twopass->modified_error_total; - - reset_fpf_position(twopass, start_pos); + twopass->modified_error_left = modified_error_total; } + + // Reset the vbr bits off target counter + cpi->rc.vbr_bits_off_target = 0; } // This function gives an estimate of how badly we believe the prediction @@ -1054,10 +1079,23 @@ static double 
get_prediction_decay_rate(const VP9_COMMON *cm, return MIN(second_ref_decay, next_frame->pcnt_inter); } +// This function gives an estimate of how badly we believe the prediction +// quality is decaying from frame to frame. +static double get_zero_motion_factor(const VP9_COMMON *cm, + const FIRSTPASS_STATS *frame) { + const double sr_ratio = frame->coded_error / + DOUBLE_DIVIDE_CHECK(frame->sr_coded_error); + const double zero_motion_pct = frame->pcnt_inter - + frame->pcnt_motion; + + return MIN(sr_ratio, zero_motion_pct); +} + + // Function to test for a condition where a complex transition is followed // by a static section. For example in slide shows where there is a fade // between slides. This is to help with more optimal kf and gf positioning. -static int detect_transition_to_still(struct twopass_rc *twopass, +static int detect_transition_to_still(TWO_PASS *twopass, int frame_interval, int still_interval, double loop_decay_rate, double last_decay_rate) { @@ -1095,74 +1133,59 @@ static int detect_transition_to_still(struct twopass_rc *twopass, // This function detects a flash through the high relative pcnt_second_ref // score in the frame following a flash frame. The offset passed in should // reflect this. -static int detect_flash(const struct twopass_rc *twopass, int offset) { - FIRSTPASS_STATS next_frame; - - int flash_detected = 0; - - // Read the frame data. - // The return is FALSE (no flash detected) if not a valid frame - if (read_frame_stats(twopass, &next_frame, offset) != EOF) { - // What we are looking for here is a situation where there is a - // brief break in prediction (such as a flash) but subsequent frames - // are reasonably well predicted by an earlier (pre flash) frame. - // The recovery after a flash is indicated by a high pcnt_second_ref - // compared to pcnt_inter. 
- if (next_frame.pcnt_second_ref > next_frame.pcnt_inter && - next_frame.pcnt_second_ref >= 0.5) - flash_detected = 1; - } - - return flash_detected; +static int detect_flash(const TWO_PASS *twopass, int offset) { + const FIRSTPASS_STATS *const next_frame = read_frame_stats(twopass, offset); + + // What we are looking for here is a situation where there is a + // brief break in prediction (such as a flash) but subsequent frames + // are reasonably well predicted by an earlier (pre flash) frame. + // The recovery after a flash is indicated by a high pcnt_second_ref + // compared to pcnt_inter. + return next_frame != NULL && + next_frame->pcnt_second_ref > next_frame->pcnt_inter && + next_frame->pcnt_second_ref >= 0.5; } // Update the motion related elements to the GF arf boost calculation. -static void accumulate_frame_motion_stats( - FIRSTPASS_STATS *this_frame, - double *this_frame_mv_in_out, - double *mv_in_out_accumulator, - double *abs_mv_in_out_accumulator, - double *mv_ratio_accumulator) { - double motion_pct; - - // Accumulate motion stats. - motion_pct = this_frame->pcnt_motion; +static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats, + double *mv_in_out, + double *mv_in_out_accumulator, + double *abs_mv_in_out_accumulator, + double *mv_ratio_accumulator) { + const double pct = stats->pcnt_motion; // Accumulate Motion In/Out of frame stats. - *this_frame_mv_in_out = this_frame->mv_in_out_count * motion_pct; - *mv_in_out_accumulator += this_frame->mv_in_out_count * motion_pct; - *abs_mv_in_out_accumulator += fabs(this_frame->mv_in_out_count * motion_pct); - - // Accumulate a measure of how uniform (or conversely how random) - // the motion field is (a ratio of absmv / mv). 
- if (motion_pct > 0.05) { - const double this_frame_mvr_ratio = fabs(this_frame->mvr_abs) / - DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVr)); - - const double this_frame_mvc_ratio = fabs(this_frame->mvc_abs) / - DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVc)); - - *mv_ratio_accumulator += (this_frame_mvr_ratio < this_frame->mvr_abs) - ? (this_frame_mvr_ratio * motion_pct) - : this_frame->mvr_abs * motion_pct; - - *mv_ratio_accumulator += (this_frame_mvc_ratio < this_frame->mvc_abs) - ? (this_frame_mvc_ratio * motion_pct) - : this_frame->mvc_abs * motion_pct; + *mv_in_out = stats->mv_in_out_count * pct; + *mv_in_out_accumulator += *mv_in_out; + *abs_mv_in_out_accumulator += fabs(*mv_in_out); + + // Accumulate a measure of how uniform (or conversely how random) the motion + // field is (a ratio of abs(mv) / mv). + if (pct > 0.05) { + const double mvr_ratio = fabs(stats->mvr_abs) / + DOUBLE_DIVIDE_CHECK(fabs(stats->MVr)); + const double mvc_ratio = fabs(stats->mvc_abs) / + DOUBLE_DIVIDE_CHECK(fabs(stats->MVc)); + + *mv_ratio_accumulator += pct * (mvr_ratio < stats->mvr_abs ? + mvr_ratio : stats->mvr_abs); + *mv_ratio_accumulator += pct * (mvc_ratio < stats->mvc_abs ? + mvc_ratio : stats->mvc_abs); } } // Calculate a baseline boost number for the current frame. -static double calc_frame_boost(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame, +static double calc_frame_boost(const TWO_PASS *twopass, + const FIRSTPASS_STATS *this_frame, double this_frame_mv_in_out) { double frame_boost; // Underlying boost factor is based on inter intra error ratio. 
- if (this_frame->intra_error > cpi->twopass.gf_intra_err_min) + if (this_frame->intra_error > twopass->gf_intra_err_min) frame_boost = (IIFACTOR * this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)); else - frame_boost = (IIFACTOR * cpi->twopass.gf_intra_err_min / + frame_boost = (IIFACTOR * twopass->gf_intra_err_min / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)); // Increase boost for frames where new data coming into frame (e.g. zoom out). @@ -1180,8 +1203,7 @@ static double calc_frame_boost(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame, static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, int *f_boost, int *b_boost) { - FIRSTPASS_STATS this_frame; - struct twopass_rc *const twopass = &cpi->twopass; + TWO_PASS *const twopass = &cpi->twopass; int i; double boost_score = 0.0; double mv_ratio_accumulator = 0.0; @@ -1194,11 +1216,12 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, // Search forward from the proposed arf/next gf position. for (i = 0; i < f_frames; ++i) { - if (read_frame_stats(twopass, &this_frame, (i + offset)) == EOF) + const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset); + if (this_frame == NULL) break; // Update the motion related elements to the boost calculation. - accumulate_frame_motion_stats(&this_frame, + accumulate_frame_motion_stats(this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, &abs_mv_in_out_accumulator, &mv_ratio_accumulator); @@ -1210,13 +1233,13 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, // Accumulate the effect of prediction quality decay. if (!flash_detected) { - decay_accumulator *= get_prediction_decay_rate(&cpi->common, &this_frame); + decay_accumulator *= get_prediction_decay_rate(&cpi->common, this_frame); decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR ? 
MIN_DECAY_FACTOR : decay_accumulator; } - boost_score += (decay_accumulator * - calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out)); + boost_score += decay_accumulator * calc_frame_boost(twopass, this_frame, + this_frame_mv_in_out); } *f_boost = (int)boost_score; @@ -1231,11 +1254,12 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, // Search backward towards last gf position. for (i = -1; i >= -b_frames; --i) { - if (read_frame_stats(twopass, &this_frame, (i + offset)) == EOF) + const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset); + if (this_frame == NULL) break; // Update the motion related elements to the boost calculation. - accumulate_frame_motion_stats(&this_frame, + accumulate_frame_motion_stats(this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, &abs_mv_in_out_accumulator, &mv_ratio_accumulator); @@ -1247,13 +1271,13 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, // Cumulative effect of prediction quality decay. if (!flash_detected) { - decay_accumulator *= get_prediction_decay_rate(&cpi->common, &this_frame); + decay_accumulator *= get_prediction_decay_rate(&cpi->common, this_frame); decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR ? MIN_DECAY_FACTOR : decay_accumulator; } - boost_score += (decay_accumulator * - calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out)); + boost_score += decay_accumulator * calc_frame_boost(twopass, this_frame, + this_frame_mv_in_out); } *b_boost = (int)boost_score; @@ -1264,152 +1288,236 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, return arf_boost; } -#if CONFIG_MULTIPLE_ARF -// Work out the frame coding order for a GF or an ARF group. -// The current implementation codes frames in their natural order for a -// GF group, and inserts additional ARFs into an ARF group using a -// binary split approach. -// NOTE: this function is currently implemented recursively. 
-static void schedule_frames(VP9_COMP *cpi, const int start, const int end, - const int arf_idx, const int gf_or_arf_group, - const int level) { - int i, abs_end, half_range; - int *cfo = cpi->frame_coding_order; - int idx = cpi->new_frame_coding_order_period; - - // If (end < 0) an ARF should be coded at position (-end). - assert(start >= 0); - - // printf("start:%d end:%d\n", start, end); - - // GF Group: code frames in logical order. - if (gf_or_arf_group == 0) { - assert(end >= start); - for (i = start; i <= end; ++i) { - cfo[idx] = i; - cpi->arf_buffer_idx[idx] = arf_idx; - cpi->arf_weight[idx] = -1; - ++idx; - } - cpi->new_frame_coding_order_period = idx; - return; +// Calculate a section intra ratio used in setting max loop filter. +static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin, + const FIRSTPASS_STATS *end, + int section_length) { + const FIRSTPASS_STATS *s = begin; + double intra_error = 0.0; + double coded_error = 0.0; + int i = 0; + + while (s < end && i < section_length) { + intra_error += s->intra_error; + coded_error += s->coded_error; + ++s; + ++i; } - // ARF Group: Work out the ARF schedule and mark ARF frames as negative. - if (end < 0) { - // printf("start:%d end:%d\n", -end, -end); - // ARF frame is at the end of the range. - cfo[idx] = end; - // What ARF buffer does this ARF use as predictor. - cpi->arf_buffer_idx[idx] = (arf_idx > 2) ? (arf_idx - 1) : 2; - cpi->arf_weight[idx] = level; - ++idx; - abs_end = -end; + return (int)(intra_error / DOUBLE_DIVIDE_CHECK(coded_error)); +} + +// Calculate the total bits to allocate in this GF/ARF group. +static int64_t calculate_total_gf_group_bits(VP9_COMP *cpi, + double gf_group_err) { + const RATE_CONTROL *const rc = &cpi->rc; + const TWO_PASS *const twopass = &cpi->twopass; + const int max_bits = frame_max_bits(rc, &cpi->oxcf); + int64_t total_group_bits; + + // Calculate the bits to be allocated to the group as a whole. 
+ if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) { + total_group_bits = (int64_t)(twopass->kf_group_bits * + (gf_group_err / twopass->kf_group_error_left)); } else { - abs_end = end; + total_group_bits = 0; } - half_range = (abs_end - start) >> 1; - - // ARFs may not be adjacent, they must be separated by at least - // MIN_GF_INTERVAL non-ARF frames. - if ((start + MIN_GF_INTERVAL) >= (abs_end - MIN_GF_INTERVAL)) { - // printf("start:%d end:%d\n", start, abs_end); - // Update the coding order and active ARF. - for (i = start; i <= abs_end; ++i) { - cfo[idx] = i; - cpi->arf_buffer_idx[idx] = arf_idx; - cpi->arf_weight[idx] = -1; - ++idx; - } - cpi->new_frame_coding_order_period = idx; - } else { - // Place a new ARF at the mid-point of the range. - cpi->new_frame_coding_order_period = idx; - schedule_frames(cpi, start, -(start + half_range), arf_idx + 1, - gf_or_arf_group, level + 1); - schedule_frames(cpi, start + half_range + 1, abs_end, arf_idx, - gf_or_arf_group, level + 1); + // Clamp odd edge cases. + total_group_bits = (total_group_bits < 0) ? + 0 : (total_group_bits > twopass->kf_group_bits) ? + twopass->kf_group_bits : total_group_bits; + + // Clip based on user supplied data rate variability limit. + if (total_group_bits > (int64_t)max_bits * rc->baseline_gf_interval) + total_group_bits = (int64_t)max_bits * rc->baseline_gf_interval; + + return total_group_bits; +} + +// Calculate the number bits extra to assign to boosted frames in a group. +static int calculate_boost_bits(int frame_count, + int boost, int64_t total_group_bits) { + int allocation_chunks; + + // return 0 for invalid inputs (could arise e.g. through rounding errors) + if (!boost || (total_group_bits <= 0) || (frame_count <= 0) ) + return 0; + + allocation_chunks = (frame_count * 100) + boost; + + // Prevent overflow. 
+ if (boost > 1023) { + int divisor = boost >> 10; + boost /= divisor; + allocation_chunks /= divisor; } + + // Calculate the number of extra bits for use in the boosted frame or frames. + return MAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks), 0); } -#define FIXED_ARF_GROUP_SIZE 16 +// Current limit on maximum number of active arfs in a GF/ARF group. +#define MAX_ACTIVE_ARFS 2 +#define ARF_SLOT1 2 +#define ARF_SLOT2 3 +// This function indirects the choice of buffers for arfs. +// At the moment the values are fixed but this may change as part of +// the integration process with other codec features that swap buffers around. +static void get_arf_buffer_indices(unsigned char *arf_buffer_indices) { + arf_buffer_indices[0] = ARF_SLOT1; + arf_buffer_indices[1] = ARF_SLOT2; +} -void define_fixed_arf_period(VP9_COMP *cpi) { +static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, + double group_error, int gf_arf_bits) { + RATE_CONTROL *const rc = &cpi->rc; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *twopass = &cpi->twopass; + FIRSTPASS_STATS frame_stats; int i; - int max_level = INT_MIN; - - assert(cpi->multi_arf_enabled); - assert(cpi->oxcf.lag_in_frames >= FIXED_ARF_GROUP_SIZE); - - // Save the weight of the last frame in the sequence before next - // sequence pattern overwrites it. - cpi->this_frame_weight = cpi->arf_weight[cpi->sequence_number]; - assert(cpi->this_frame_weight >= 0); - - cpi->twopass.gf_zeromotion_pct = 0; - - // Initialize frame coding order variables. - cpi->new_frame_coding_order_period = 0; - cpi->next_frame_in_order = 0; - cpi->arf_buffered = 0; - vp9_zero(cpi->frame_coding_order); - vp9_zero(cpi->arf_buffer_idx); - vpx_memset(cpi->arf_weight, -1, sizeof(cpi->arf_weight)); - - if (cpi->rc.frames_to_key <= (FIXED_ARF_GROUP_SIZE + 8)) { - // Setup a GF group close to the keyframe. 
- cpi->rc.source_alt_ref_pending = 0; - cpi->rc.baseline_gf_interval = cpi->rc.frames_to_key; - schedule_frames(cpi, 0, (cpi->rc.baseline_gf_interval - 1), 2, 0, 0); - } else { - // Setup a fixed period ARF group. - cpi->rc.source_alt_ref_pending = 1; - cpi->rc.baseline_gf_interval = FIXED_ARF_GROUP_SIZE; - schedule_frames(cpi, 0, -(cpi->rc.baseline_gf_interval - 1), 2, 1, 0); - } + int frame_index = 1; + int target_frame_size; + int key_frame; + const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf); + int64_t total_group_bits = gf_group_bits; + double modified_err = 0.0; + double err_fraction; + int mid_boost_bits = 0; + int mid_frame_idx; + unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS]; + + key_frame = cpi->common.frame_type == KEY_FRAME || + vp9_is_upper_layer_key_frame(cpi); + + get_arf_buffer_indices(arf_buffer_indices); + + // For key frames the frame target rate is already set and it + // is also the golden frame. + if (!key_frame) { + if (rc->source_alt_ref_active) { + twopass->gf_group.update_type[0] = OVERLAY_UPDATE; + twopass->gf_group.rf_level[0] = INTER_NORMAL; + twopass->gf_group.bit_allocation[0] = 0; + twopass->gf_group.arf_update_idx[0] = arf_buffer_indices[0]; + twopass->gf_group.arf_ref_idx[0] = arf_buffer_indices[0]; + } else { + twopass->gf_group.update_type[0] = GF_UPDATE; + twopass->gf_group.rf_level[0] = GF_ARF_STD; + twopass->gf_group.bit_allocation[0] = gf_arf_bits; + twopass->gf_group.arf_update_idx[0] = arf_buffer_indices[0]; + twopass->gf_group.arf_ref_idx[0] = arf_buffer_indices[0]; + } + + // Step over the golden frame / overlay frame + if (EOF == input_stats(twopass, &frame_stats)) + return; + } + + // Deduct the boost bits for arf (or gf if it is not a key frame) + // from the group total. + if (rc->source_alt_ref_pending || !key_frame) + total_group_bits -= gf_arf_bits; + + // Store the bits to spend on the ARF if there is one. 
+ if (rc->source_alt_ref_pending) { + twopass->gf_group.update_type[frame_index] = ARF_UPDATE; + twopass->gf_group.rf_level[frame_index] = GF_ARF_STD; + twopass->gf_group.bit_allocation[frame_index] = gf_arf_bits; + twopass->gf_group.arf_src_offset[frame_index] = + (unsigned char)(rc->baseline_gf_interval - 1); + twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[0]; + twopass->gf_group.arf_ref_idx[frame_index] = + arf_buffer_indices[cpi->multi_arf_last_grp_enabled && + rc->source_alt_ref_active]; + ++frame_index; - // Replace level indicator of -1 with correct level. - for (i = 0; i < cpi->new_frame_coding_order_period; ++i) { - if (cpi->arf_weight[i] > max_level) { - max_level = cpi->arf_weight[i]; + if (cpi->multi_arf_enabled) { + // Set aside a slot for a level 1 arf. + twopass->gf_group.update_type[frame_index] = ARF_UPDATE; + twopass->gf_group.rf_level[frame_index] = GF_ARF_LOW; + twopass->gf_group.arf_src_offset[frame_index] = + (unsigned char)((rc->baseline_gf_interval >> 1) - 1); + twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[1]; + twopass->gf_group.arf_ref_idx[frame_index] = arf_buffer_indices[0]; + ++frame_index; } } - ++max_level; - for (i = 0; i < cpi->new_frame_coding_order_period; ++i) { - if (cpi->arf_weight[i] == -1) { - cpi->arf_weight[i] = max_level; + + // Define middle frame + mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1; + + // Allocate bits to the other frames in the group. 
+ for (i = 0; i < rc->baseline_gf_interval - 1; ++i) { + int arf_idx = 0; + if (EOF == input_stats(twopass, &frame_stats)) + break; + + modified_err = calculate_modified_err(twopass, oxcf, &frame_stats); + + if (group_error > 0) + err_fraction = modified_err / DOUBLE_DIVIDE_CHECK(group_error); + else + err_fraction = 0.0; + + target_frame_size = (int)((double)total_group_bits * err_fraction); + + if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) { + mid_boost_bits += (target_frame_size >> 4); + target_frame_size -= (target_frame_size >> 4); + + if (frame_index <= mid_frame_idx) + arf_idx = 1; } + twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[arf_idx]; + twopass->gf_group.arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx]; + + target_frame_size = clamp(target_frame_size, 0, + MIN(max_bits, (int)total_group_bits)); + + twopass->gf_group.update_type[frame_index] = LF_UPDATE; + twopass->gf_group.rf_level[frame_index] = INTER_NORMAL; + + twopass->gf_group.bit_allocation[frame_index] = target_frame_size; + ++frame_index; } - cpi->max_arf_level = max_level; -#if 0 - printf("\nSchedule: "); - for (i = 0; i < cpi->new_frame_coding_order_period; ++i) { - printf("%4d ", cpi->frame_coding_order[i]); - } - printf("\n"); - printf("ARFref: "); - for (i = 0; i < cpi->new_frame_coding_order_period; ++i) { - printf("%4d ", cpi->arf_buffer_idx[i]); - } - printf("\n"); - printf("Weight: "); - for (i = 0; i < cpi->new_frame_coding_order_period; ++i) { - printf("%4d ", cpi->arf_weight[i]); + + // Note: + // We need to configure the frame at the end of the sequence + 1 that will be + // the start frame for the next group. Otherwise prior to the call to + // vp9_rc_get_second_pass_params() the data will be undefined. 
+ twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[0]; + twopass->gf_group.arf_ref_idx[frame_index] = arf_buffer_indices[0]; + + if (rc->source_alt_ref_pending) { + twopass->gf_group.update_type[frame_index] = OVERLAY_UPDATE; + twopass->gf_group.rf_level[frame_index] = INTER_NORMAL; + + // Final setup for second arf and its overlay. + if (cpi->multi_arf_enabled) { + twopass->gf_group.bit_allocation[2] = + twopass->gf_group.bit_allocation[mid_frame_idx] + mid_boost_bits; + twopass->gf_group.update_type[mid_frame_idx] = OVERLAY_UPDATE; + twopass->gf_group.bit_allocation[mid_frame_idx] = 0; + } + } else { + twopass->gf_group.update_type[frame_index] = GF_UPDATE; + twopass->gf_group.rf_level[frame_index] = GF_ARF_STD; } - printf("\n"); -#endif + + // Note whether multi-arf was enabled this group for next time. + cpi->multi_arf_last_grp_enabled = cpi->multi_arf_enabled; } -#endif // Analyse and define a gf/arf group. static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { RATE_CONTROL *const rc = &cpi->rc; - VP9_CONFIG *const oxcf = &cpi->oxcf; - struct twopass_rc *const twopass = &cpi->twopass; - FIRSTPASS_STATS next_frame = { 0 }; - const FIRSTPASS_STATS *start_pos; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *const twopass = &cpi->twopass; + FIRSTPASS_STATS next_frame; + const FIRSTPASS_STATS *const start_pos = twopass->stats_in; int i; + double boost_score = 0.0; double old_boost_score = 0.0; double gf_group_err = 0.0; @@ -1427,23 +1535,29 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { double mv_in_out_accumulator = 0.0; double abs_mv_in_out_accumulator = 0.0; double mv_ratio_accumulator_thresh; - // Max bits for a single frame. 
- const int max_bits = frame_max_bits(rc, oxcf); - unsigned int allow_alt_ref = oxcf->play_alternate && oxcf->lag_in_frames; + unsigned int allow_alt_ref = is_altref_enabled(cpi); int f_boost = 0; int b_boost = 0; int flash_detected; int active_max_gf_interval; + int64_t gf_group_bits; + double gf_group_error_left; + int gf_arf_bits; - twopass->gf_group_bits = 0; + // Reset the GF group data structures unless this is a key + // frame in which case it will already have been done. + if (cpi->common.frame_type != KEY_FRAME) { + vp9_zero(twopass->gf_group); + } vp9_clear_system_state(); + vp9_zero(next_frame); - start_pos = twopass->stats_in; + gf_group_bits = 0; // Load stats for the current frame. - mod_frame_err = calculate_modified_err(cpi, this_frame); + mod_frame_err = calculate_modified_err(twopass, oxcf, this_frame); // Note the error of the frame at the start of the group. This will be // the GF frame error if we code a normal gf. @@ -1457,25 +1571,28 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Motion breakout threshold for loop below depends on image size. mv_ratio_accumulator_thresh = (cpi->common.width + cpi->common.height) / 10.0; - // Work out a maximum interval for the GF. - // If the image appears completely static we can extend beyond this. - // The value chosen depends on the active Q range. At low Q we have - // bits to spare and are better with a smaller interval and smaller boost. - // At high Q when there are few bits to spare we are better with a longer - // interval to spread the cost of the GF. - // - active_max_gf_interval = - 12 + ((int)vp9_convert_qindex_to_q(rc->last_q[INTER_FRAME]) >> 5); - - if (active_max_gf_interval > rc->max_gf_interval) + // Work out a maximum interval for the GF group. + // If the image appears almost completely static we can extend beyond this. 
+ if (cpi->multi_arf_allowed) { active_max_gf_interval = rc->max_gf_interval; + } else { + // The value chosen depends on the active Q range. At low Q we have + // bits to spare and are better with a smaller interval and smaller boost. + // At high Q when there are few bits to spare we are better with a longer + // interval to spread the cost of the GF. + active_max_gf_interval = + 12 + ((int)vp9_convert_qindex_to_q(rc->last_q[INTER_FRAME]) >> 5); + + if (active_max_gf_interval > rc->max_gf_interval) + active_max_gf_interval = rc->max_gf_interval; + } i = 0; while (i < rc->static_scene_max_gf_interval && i < rc->frames_to_key) { ++i; // Accumulate error score of frames in this gf group. - mod_frame_err = calculate_modified_err(cpi, this_frame); + mod_frame_err = calculate_modified_err(twopass, oxcf, this_frame); gf_group_err += mod_frame_err; if (EOF == input_stats(twopass, &next_frame)) @@ -1498,11 +1615,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { decay_accumulator = decay_accumulator * loop_decay_rate; // Monitor for static sections. - if ((next_frame.pcnt_inter - next_frame.pcnt_motion) < - zero_motion_accumulator) { - zero_motion_accumulator = next_frame.pcnt_inter - - next_frame.pcnt_motion; - } + zero_motion_accumulator = + MIN(zero_motion_accumulator, + get_zero_motion_factor(&cpi->common, &next_frame)); // Break clause to detect very still sections after motion. For example, // a static image after a fade or other transition. @@ -1514,12 +1629,12 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { } // Calculate a boost number for this frame. - boost_score += (decay_accumulator * - calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out)); + boost_score += decay_accumulator * calc_frame_boost(twopass, &next_frame, + this_frame_mv_in_out); // Break out conditions. if ( - // Break at cpi->max_gf_interval unless almost totally static. + // Break at active_max_gf_interval unless almost totally static. 
(i >= active_max_gf_interval && (zero_motion_accumulator < 0.995)) || ( // Don't break out with a very short interval. @@ -1550,30 +1665,20 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { break; if (i < rc->frames_to_key) { - mod_frame_err = calculate_modified_err(cpi, this_frame); + mod_frame_err = calculate_modified_err(twopass, oxcf, this_frame); gf_group_err += mod_frame_err; } } } -#if CONFIG_MULTIPLE_ARF - if (cpi->multi_arf_enabled) { - // Initialize frame coding order variables. - cpi->new_frame_coding_order_period = 0; - cpi->next_frame_in_order = 0; - cpi->arf_buffered = 0; - vp9_zero(cpi->frame_coding_order); - vp9_zero(cpi->arf_buffer_idx); - vpx_memset(cpi->arf_weight, -1, sizeof(cpi->arf_weight)); - } -#endif - // Set the interval until the next gf. if (cpi->common.frame_type == KEY_FRAME || rc->source_alt_ref_active) rc->baseline_gf_interval = i - 1; else rc->baseline_gf_interval = i; + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + // Should we use the alternate reference frame. if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) && @@ -1586,240 +1691,66 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { &b_boost); rc->source_alt_ref_pending = 1; -#if CONFIG_MULTIPLE_ARF - // Set the ARF schedule. - if (cpi->multi_arf_enabled) { - schedule_frames(cpi, 0, -(rc->baseline_gf_interval - 1), 2, 1, 0); - } -#endif + // Test to see if multi arf is appropriate. + cpi->multi_arf_enabled = + (cpi->multi_arf_allowed && (rc->baseline_gf_interval >= 6) && + (zero_motion_accumulator < 0.995)) ? 1 : 0; } else { rc->gfu_boost = (int)boost_score; rc->source_alt_ref_pending = 0; -#if CONFIG_MULTIPLE_ARF - // Set the GF schedule. 
- if (cpi->multi_arf_enabled) { - schedule_frames(cpi, 0, rc->baseline_gf_interval - 1, 2, 0, 0); - assert(cpi->new_frame_coding_order_period == - rc->baseline_gf_interval); - } -#endif - } - -#if CONFIG_MULTIPLE_ARF - if (cpi->multi_arf_enabled && (cpi->common.frame_type != KEY_FRAME)) { - int max_level = INT_MIN; - // Replace level indicator of -1 with correct level. - for (i = 0; i < cpi->frame_coding_order_period; ++i) { - if (cpi->arf_weight[i] > max_level) { - max_level = cpi->arf_weight[i]; - } - } - ++max_level; - for (i = 0; i < cpi->frame_coding_order_period; ++i) { - if (cpi->arf_weight[i] == -1) { - cpi->arf_weight[i] = max_level; - } - } - cpi->max_arf_level = max_level; - } -#if 0 - if (cpi->multi_arf_enabled) { - printf("\nSchedule: "); - for (i = 0; i < cpi->new_frame_coding_order_period; ++i) { - printf("%4d ", cpi->frame_coding_order[i]); - } - printf("\n"); - printf("ARFref: "); - for (i = 0; i < cpi->new_frame_coding_order_period; ++i) { - printf("%4d ", cpi->arf_buffer_idx[i]); - } - printf("\n"); - printf("Weight: "); - for (i = 0; i < cpi->new_frame_coding_order_period; ++i) { - printf("%4d ", cpi->arf_weight[i]); - } - printf("\n"); - } -#endif -#endif - - // Calculate the bits to be allocated to the group as a whole. - if (twopass->kf_group_bits > 0 && twopass->kf_group_error_left > 0) { - twopass->gf_group_bits = (int64_t)(twopass->kf_group_bits * - (gf_group_err / twopass->kf_group_error_left)); - } else { - twopass->gf_group_bits = 0; } - twopass->gf_group_bits = (twopass->gf_group_bits < 0) ? - 0 : (twopass->gf_group_bits > twopass->kf_group_bits) ? - twopass->kf_group_bits : twopass->gf_group_bits; - - // Clip cpi->twopass.gf_group_bits based on user supplied data rate - // variability limit, cpi->oxcf.two_pass_vbrmax_section. - if (twopass->gf_group_bits > (int64_t)max_bits * rc->baseline_gf_interval) - twopass->gf_group_bits = (int64_t)max_bits * rc->baseline_gf_interval; // Reset the file position. 
reset_fpf_position(twopass, start_pos); - // Assign bits to the arf or gf. - for (i = 0; i <= (rc->source_alt_ref_pending && - cpi->common.frame_type != KEY_FRAME); ++i) { - int allocation_chunks; - int q = rc->last_q[INTER_FRAME]; - int gf_bits; + // Calculate the bits to be allocated to the gf/arf group as a whole + gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err); + // Calculate the extra bits to be used for boosted frame(s) + { + int q = rc->last_q[INTER_FRAME]; int boost = (rc->gfu_boost * gfboost_qadjust(q)) / 100; // Set max and minimum boost and hence minimum allocation. boost = clamp(boost, 125, (rc->baseline_gf_interval + 1) * 200); - if (rc->source_alt_ref_pending && i == 0) - allocation_chunks = ((rc->baseline_gf_interval + 1) * 100) + boost; - else - allocation_chunks = (rc->baseline_gf_interval * 100) + (boost - 100); - - // Prevent overflow. - if (boost > 1023) { - int divisor = boost >> 10; - boost /= divisor; - allocation_chunks /= divisor; - } - - // Calculate the number of bits to be spent on the gf or arf based on - // the boost number. - gf_bits = (int)((double)boost * (twopass->gf_group_bits / - (double)allocation_chunks)); - - // If the frame that is to be boosted is simpler than the average for - // the gf/arf group then use an alternative calculation - // based on the error score of the frame itself. 
- if (rc->baseline_gf_interval < 1 || - mod_frame_err < gf_group_err / (double)rc->baseline_gf_interval) { - double alt_gf_grp_bits = (double)twopass->kf_group_bits * - (mod_frame_err * (double)rc->baseline_gf_interval) / - DOUBLE_DIVIDE_CHECK(twopass->kf_group_error_left); - - int alt_gf_bits = (int)((double)boost * (alt_gf_grp_bits / - (double)allocation_chunks)); - - if (gf_bits > alt_gf_bits) - gf_bits = alt_gf_bits; - } else { - // If it is harder than other frames in the group make sure it at - // least receives an allocation in keeping with its relative error - // score, otherwise it may be worse off than an "un-boosted" frame. - int alt_gf_bits = (int)((double)twopass->kf_group_bits * - mod_frame_err / - DOUBLE_DIVIDE_CHECK(twopass->kf_group_error_left)); - - if (alt_gf_bits > gf_bits) - gf_bits = alt_gf_bits; - } - - // Don't allow a negative value for gf_bits. - if (gf_bits < 0) - gf_bits = 0; - - if (i == 0) { - twopass->gf_bits = gf_bits; - } - if (i == 1 || - (!rc->source_alt_ref_pending && - cpi->common.frame_type != KEY_FRAME)) { - // Calculate the per frame bit target for this frame. - vp9_rc_set_frame_target(cpi, gf_bits); - } + // Calculate the extra bits to be used for boosted frame(s) + gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval, + boost, gf_group_bits); } - { - // Adjust KF group bits and error remaining. - twopass->kf_group_error_left -= (int64_t)gf_group_err; - - // If this is an arf update we want to remove the score for the overlay - // frame at the end which will usually be very cheap to code. - // The overlay frame has already, in effect, been coded so we want to spread - // the remaining bits among the other frames. - // For normal GFs remove the score for the GF itself unless this is - // also a key frame in which case it has already been accounted for. 
- if (rc->source_alt_ref_pending) { - twopass->gf_group_error_left = (int64_t)(gf_group_err - mod_frame_err); - } else if (cpi->common.frame_type != KEY_FRAME) { - twopass->gf_group_error_left = (int64_t)(gf_group_err - - gf_first_frame_err); - } else { - twopass->gf_group_error_left = (int64_t)gf_group_err; - } + // Adjust KF group bits and error remaining. + twopass->kf_group_error_left -= (int64_t)gf_group_err; - // This condition could fail if there are two kfs very close together - // despite MIN_GF_INTERVAL and would cause a divide by 0 in the - // calculation of alt_extra_bits. - if (rc->baseline_gf_interval >= 3) { - const int boost = rc->source_alt_ref_pending ? b_boost : rc->gfu_boost; - - if (boost >= 150) { - const int pct_extra = MIN(20, (boost - 100) / 50); - const int alt_extra_bits = (int)(( - MAX(twopass->gf_group_bits - twopass->gf_bits, 0) * - pct_extra) / 100); - twopass->gf_group_bits -= alt_extra_bits; - } - } + // If this is an arf update we want to remove the score for the overlay + // frame at the end which will usually be very cheap to code. + // The overlay frame has already, in effect, been coded so we want to spread + // the remaining bits among the other frames. + // For normal GFs remove the score for the GF itself unless this is + // also a key frame in which case it has already been accounted for. + if (rc->source_alt_ref_pending) { + gf_group_error_left = gf_group_err - mod_frame_err; + } else if (cpi->common.frame_type != KEY_FRAME) { + gf_group_error_left = gf_group_err - gf_first_frame_err; + } else { + gf_group_error_left = gf_group_err; } - if (cpi->common.frame_type != KEY_FRAME) { - FIRSTPASS_STATS sectionstats; - - zero_stats(&sectionstats); - reset_fpf_position(twopass, start_pos); + // Allocate bits to each of the frames in the GF group. 
+ allocate_gf_group_bits(cpi, gf_group_bits, gf_group_error_left, gf_arf_bits); - for (i = 0; i < rc->baseline_gf_interval; ++i) { - input_stats(twopass, &next_frame); - accumulate_stats(&sectionstats, &next_frame); - } - - avg_stats(&sectionstats); - - twopass->section_intra_rating = (int) - (sectionstats.intra_error / - DOUBLE_DIVIDE_CHECK(sectionstats.coded_error)); + // Reset the file position. + reset_fpf_position(twopass, start_pos); - reset_fpf_position(twopass, start_pos); + // Calculate a section intra ratio used in setting max loop filter. + if (cpi->common.frame_type != KEY_FRAME) { + twopass->section_intra_rating = + calculate_section_intra_ratio(start_pos, twopass->stats_in_end, + rc->baseline_gf_interval); } } -// Allocate bits to a normal frame that is neither a gf an arf or a key frame. -static void assign_std_frame_bits(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { - struct twopass_rc *twopass = &cpi->twopass; - // For a single frame. - const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf); - // Calculate modified prediction error used in bit allocation. - const double modified_err = calculate_modified_err(cpi, this_frame); - int target_frame_size; - double err_fraction; - - if (twopass->gf_group_error_left > 0) - // What portion of the remaining GF group error is used by this frame. - err_fraction = modified_err / twopass->gf_group_error_left; - else - err_fraction = 0.0; - - // How many of those bits available for allocation should we give it? - target_frame_size = (int)((double)twopass->gf_group_bits * err_fraction); - - // Clip target size to 0 - max_bits (or cpi->twopass.gf_group_bits) at - // the top end. - target_frame_size = clamp(target_frame_size, 0, - MIN(max_bits, (int)twopass->gf_group_bits)); - - // Adjust error and bits remaining. - twopass->gf_group_error_left -= (int64_t)modified_err; - - // Per frame bit target for this frame. 
- vp9_rc_set_frame_target(cpi, target_frame_size); -} - -static int test_candidate_kf(struct twopass_rc *twopass, +static int test_candidate_kf(TWO_PASS *twopass, const FIRSTPASS_STATS *last_frame, const FIRSTPASS_STATS *this_frame, const FIRSTPASS_STATS *next_frame) { @@ -1899,11 +1830,13 @@ static int test_candidate_kf(struct twopass_rc *twopass, static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { int i, j; RATE_CONTROL *const rc = &cpi->rc; - struct twopass_rc *const twopass = &cpi->twopass; + TWO_PASS *const twopass = &cpi->twopass; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; const FIRSTPASS_STATS first_frame = *this_frame; - const FIRSTPASS_STATS *start_position = twopass->stats_in; + const FIRSTPASS_STATS *const start_position = twopass->stats_in; FIRSTPASS_STATS next_frame; FIRSTPASS_STATS last_frame; + int kf_bits = 0; double decay_accumulator = 1.0; double zero_motion_accumulator = 1.0; double boost_score = 0.0; @@ -1915,11 +1848,16 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { cpi->common.frame_type = KEY_FRAME; + // Reset the GF group data structures. + vp9_zero(twopass->gf_group); + // Is this a forced key frame by interval. rc->this_key_frame_forced = rc->next_key_frame_forced; - // Clear the alt ref active flag as this can never be active on a key frame. + // Clear the alt ref active flag and last group multi arf flags as they + // can never be set for a key frame. rc->source_alt_ref_active = 0; + cpi->multi_arf_last_grp_enabled = 0; // KF is always a GF so clear frames till next gf counter. rc->frames_till_gf_update_due = 0; @@ -1929,13 +1867,14 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->kf_group_bits = 0; // Total bits available to kf group twopass->kf_group_error_left = 0; // Group modified error score. 
- kf_mod_err = calculate_modified_err(cpi, this_frame); + kf_mod_err = calculate_modified_err(twopass, oxcf, this_frame); // Find the next keyframe. i = 0; - while (twopass->stats_in < twopass->stats_in_end) { + while (twopass->stats_in < twopass->stats_in_end && + rc->frames_to_key < cpi->oxcf.key_freq) { // Accumulate kf group error. - kf_group_err += calculate_modified_err(cpi, this_frame); + kf_group_err += calculate_modified_err(twopass, oxcf, this_frame); // Load the next frame's stats. last_frame = *this_frame; @@ -1963,7 +1902,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Special check for transition or high motion followed by a // static scene. - if (detect_transition_to_still(twopass, i, cpi->key_frame_frequency - i, + if (detect_transition_to_still(twopass, i, cpi->oxcf.key_freq - i, loop_decay_rate, decay_accumulator)) break; @@ -1971,8 +1910,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { ++rc->frames_to_key; // If we don't have a real key frame within the next two - // key_frame_frequency intervals then break out of the loop. - if (rc->frames_to_key >= 2 * (int)cpi->key_frame_frequency) + // key_freq intervals then break out of the loop. + if (rc->frames_to_key >= 2 * cpi->oxcf.key_freq) break; } else { ++rc->frames_to_key; @@ -1985,7 +1924,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // This code centers the extra kf if the actual natural interval // is between 1x and 2x. if (cpi->oxcf.auto_key && - rc->frames_to_key > (int)cpi->key_frame_frequency) { + rc->frames_to_key > cpi->oxcf.key_freq) { FIRSTPASS_STATS tmp_frame = first_frame; rc->frames_to_key /= 2; @@ -1997,11 +1936,12 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Rescan to get the correct error data for the forced kf group. 
for (i = 0; i < rc->frames_to_key; ++i) { - kf_group_err += calculate_modified_err(cpi, &tmp_frame); + kf_group_err += calculate_modified_err(twopass, oxcf, &tmp_frame); input_stats(twopass, &tmp_frame); } rc->next_key_frame_forced = 1; - } else if (twopass->stats_in == twopass->stats_in_end) { + } else if (twopass->stats_in == twopass->stats_in_end || + rc->frames_to_key >= cpi->oxcf.key_freq) { rc->next_key_frame_forced = 1; } else { rc->next_key_frame_forced = 0; @@ -2010,7 +1950,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Special case for the last key frame of the file. if (twopass->stats_in >= twopass->stats_in_end) { // Accumulate kf group error. - kf_group_err += calculate_modified_err(cpi, this_frame); + kf_group_err += calculate_modified_err(twopass, oxcf, this_frame); } // Calculate the number of bits that should be assigned to the kf group. @@ -2033,25 +1973,23 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { } else { twopass->kf_group_bits = 0; } + twopass->kf_group_bits = MAX(0, twopass->kf_group_bits); + // Reset the first pass file position. reset_fpf_position(twopass, start_position); - // Determine how big to make this keyframe based on how well the subsequent - // frames use inter blocks. + // Scan through the kf group collating various stats used to deteermine + // how many bits to spend on it. decay_accumulator = 1.0; boost_score = 0.0; - - // Scan through the kf group collating various stats. for (i = 0; i < rc->frames_to_key; ++i) { if (EOF == input_stats(twopass, &next_frame)) break; // Monitor for static sections. - if ((next_frame.pcnt_inter - next_frame.pcnt_motion) < - zero_motion_accumulator) { - zero_motion_accumulator = (next_frame.pcnt_inter - - next_frame.pcnt_motion); - } + zero_motion_accumulator = + MIN(zero_motion_accumulator, + get_zero_motion_factor(&cpi->common, &next_frame)); // For the first few frames collect data to decide kf boost. 
if (i <= (rc->max_gf_interval * 2)) { @@ -2078,101 +2016,33 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { } } - { - FIRSTPASS_STATS sectionstats; + reset_fpf_position(twopass, start_position); - zero_stats(&sectionstats); - reset_fpf_position(twopass, start_position); + // Store the zero motion percentage + twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0); - for (i = 0; i < rc->frames_to_key; ++i) { - input_stats(twopass, &next_frame); - accumulate_stats(&sectionstats, &next_frame); - } + // Calculate a section intra ratio used in setting max loop filter. + twopass->section_intra_rating = + calculate_section_intra_ratio(start_position, twopass->stats_in_end, + rc->frames_to_key); - avg_stats(&sectionstats); + // Work out how many bits to allocate for the key frame itself. + rc->kf_boost = (int)boost_score; - twopass->section_intra_rating = (int) (sectionstats.intra_error / - DOUBLE_DIVIDE_CHECK(sectionstats.coded_error)); - } + if (rc->kf_boost < (rc->frames_to_key * 3)) + rc->kf_boost = (rc->frames_to_key * 3); + if (rc->kf_boost < MIN_KF_BOOST) + rc->kf_boost = MIN_KF_BOOST; - // Reset the first pass file position. - reset_fpf_position(twopass, start_position); + kf_bits = calculate_boost_bits((rc->frames_to_key - 1), + rc->kf_boost, twopass->kf_group_bits); - // Work out how many bits to allocate for the key frame itself. - if (1) { - int kf_boost = (int)boost_score; - int allocation_chunks; - - if (kf_boost < (rc->frames_to_key * 3)) - kf_boost = (rc->frames_to_key * 3); - - if (kf_boost < MIN_KF_BOOST) - kf_boost = MIN_KF_BOOST; - - // Make a note of baseline boost and the zero motion - // accumulator value for use elsewhere. 
- rc->kf_boost = kf_boost; - twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0); - - // Key frame size depends on: - // (1) the error score for the whole key frame group, - // (2) the key frames' own error if this is smaller than the - // average for the group (optional), - // (3) insuring that the frame receives at least the allocation it would - // have received based on its own error score vs the error score - // remaining. - // Special case: - // If the sequence appears almost totally static we want to spend almost - // all of the bits on the key frame. - // - // We use (cpi->rc.frames_to_key - 1) below because the key frame itself is - // taken care of by kf_boost. - if (zero_motion_accumulator >= 0.99) { - allocation_chunks = ((rc->frames_to_key - 1) * 10) + kf_boost; - } else { - allocation_chunks = ((rc->frames_to_key - 1) * 100) + kf_boost; - } - - // Prevent overflow. - if (kf_boost > 1028) { - const int divisor = kf_boost >> 10; - kf_boost /= divisor; - allocation_chunks /= divisor; - } + twopass->kf_group_bits -= kf_bits; - twopass->kf_group_bits = MAX(0, twopass->kf_group_bits); - // Calculate the number of bits to be spent on the key frame. - twopass->kf_bits = (int)((double)kf_boost * - ((double)twopass->kf_group_bits / allocation_chunks)); - - // If the key frame is actually easier than the average for the - // kf group (which does sometimes happen, e.g. a blank intro frame) - // then use an alternate calculation based on the kf error score - // which should give a smaller key frame. 
- if (kf_mod_err < kf_group_err / rc->frames_to_key) { - double alt_kf_grp_bits = ((double)twopass->bits_left * - (kf_mod_err * (double)rc->frames_to_key) / - DOUBLE_DIVIDE_CHECK(twopass->modified_error_left)); - - const int alt_kf_bits = (int)((double)kf_boost * - (alt_kf_grp_bits / (double)allocation_chunks)); - - if (twopass->kf_bits > alt_kf_bits) - twopass->kf_bits = alt_kf_bits; - } else { - // Else if it is much harder than other frames in the group make sure - // it at least receives an allocation in keeping with its relative - // error score. - const int alt_kf_bits = (int)((double)twopass->bits_left * (kf_mod_err / - DOUBLE_DIVIDE_CHECK(twopass->modified_error_left))); - - if (alt_kf_bits > twopass->kf_bits) - twopass->kf_bits = alt_kf_bits; - } - twopass->kf_group_bits -= twopass->kf_bits; - // Per frame bit target for this frame. - vp9_rc_set_frame_target(cpi, twopass->kf_bits); - } + // Save the bits to spend on the key frame. + twopass->gf_group.bit_allocation[0] = kf_bits; + twopass->gf_group.update_type[0] = KF_UPDATE; + twopass->gf_group.rf_level[0] = KF_STD; // Note the total error score of the kf group minus the key frame itself. twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err); @@ -2183,34 +2053,80 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->modified_error_left -= kf_group_err; } -void vp9_rc_get_first_pass_params(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; - if (!cpi->refresh_alt_ref_frame && - (cm->current_video_frame == 0 || - (cm->frame_flags & FRAMEFLAGS_KEY))) { - cm->frame_type = KEY_FRAME; +// For VBR...adjustment to the frame target based on error from previous frames +void vbr_rate_correction(int * this_frame_target, + const int64_t vbr_bits_off_target) { + int max_delta = (*this_frame_target * 15) / 100; + + // vbr_bits_off_target > 0 means we have extra bits to spend + if (vbr_bits_off_target > 0) { + *this_frame_target += + (vbr_bits_off_target > max_delta) ? 
max_delta + : (int)vbr_bits_off_target; } else { - cm->frame_type = INTER_FRAME; + *this_frame_target -= + (vbr_bits_off_target < -max_delta) ? max_delta + : (int)-vbr_bits_off_target; } - // Do not use periodic key frames. - cpi->rc.frames_to_key = INT_MAX; } +// Define the reference buffers that will be updated post encode. +void configure_buffer_updates(VP9_COMP *cpi) { + TWO_PASS *const twopass = &cpi->twopass; + + cpi->rc.is_src_frame_alt_ref = 0; + switch (twopass->gf_group.update_type[twopass->gf_group.index]) { + case KF_UPDATE: + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 1; + cpi->refresh_alt_ref_frame = 1; + break; + case LF_UPDATE: + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 0; + cpi->refresh_alt_ref_frame = 0; + break; + case GF_UPDATE: + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 1; + cpi->refresh_alt_ref_frame = 0; + break; + case OVERLAY_UPDATE: + cpi->refresh_last_frame = 0; + cpi->refresh_golden_frame = 1; + cpi->refresh_alt_ref_frame = 0; + cpi->rc.is_src_frame_alt_ref = 1; + break; + case ARF_UPDATE: + cpi->refresh_last_frame = 0; + cpi->refresh_golden_frame = 0; + cpi->refresh_alt_ref_frame = 1; + break; + default: + assert(0); + break; + } + if (is_spatial_svc(cpi)) { + if (cpi->svc.layer_context[cpi->svc.spatial_layer_id].gold_ref_idx < 0) + cpi->refresh_golden_frame = 0; + if (cpi->alt_ref_source == NULL) + cpi->refresh_alt_ref_frame = 0; + } +} + + void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; - struct twopass_rc *const twopass = &cpi->twopass; + TWO_PASS *const twopass = &cpi->twopass; int frames_left; FIRSTPASS_STATS this_frame; FIRSTPASS_STATS this_frame_copy; - double this_frame_intra_error; - double this_frame_coded_error; - int target; + int target_rate; LAYER_CONTEXT *lc = NULL; - int is_spatial_svc = (cpi->use_svc && cpi->svc.number_temporal_layers == 1); - if (is_spatial_svc) { + if (is_spatial_svc(cpi)) { 
lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id]; frames_left = (int)(twopass->total_stats.count - lc->current_video_frame_in_layer); @@ -2222,27 +2138,52 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { if (!twopass->stats_in) return; - if (cpi->refresh_alt_ref_frame) { + // If this is an arf frame then we dont want to read the stats file or + // advance the input pointer as we already have what we need. + if (twopass->gf_group.update_type[twopass->gf_group.index] == ARF_UPDATE) { + int target_rate; + configure_buffer_updates(cpi); + target_rate = twopass->gf_group.bit_allocation[twopass->gf_group.index]; + target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate); + rc->base_frame_target = target_rate; + + // Correction to rate target based on prior over or under shoot. + if (cpi->oxcf.rc_mode == VPX_VBR) + vbr_rate_correction(&target_rate, rc->vbr_bits_off_target); + + vp9_rc_set_frame_target(cpi, target_rate); cm->frame_type = INTER_FRAME; - vp9_rc_set_frame_target(cpi, twopass->gf_bits); + + if (is_spatial_svc(cpi)) { + if (cpi->svc.spatial_layer_id == 0) { + lc->is_key_frame = 0; + } else { + lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame; + + if (lc->is_key_frame) + cpi->ref_frame_flags &= (~VP9_LAST_FLAG); + } + } + return; } vp9_clear_system_state(); - if (is_spatial_svc && twopass->kf_intra_err_min == 0) { + if (is_spatial_svc(cpi) && twopass->kf_intra_err_min == 0) { twopass->kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs; twopass->gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs; } - if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) { + if (cpi->oxcf.rc_mode == VPX_Q) { twopass->active_worst_quality = cpi->oxcf.cq_level; } else if (cm->current_video_frame == 0 || - (is_spatial_svc && lc->current_video_frame_in_layer == 0)) { + (is_spatial_svc(cpi) && + lc->current_video_frame_in_layer == 0)) { // Special case code for first frame. 
const int section_target_bandwidth = (int)(twopass->bits_left / frames_left); - const int tmp_q = vp9_twopass_worst_quality(cpi, &twopass->total_left_stats, + const int tmp_q = get_twopass_worst_quality(cpi, &twopass->total_left_stats, section_target_bandwidth); twopass->active_worst_quality = tmp_q; rc->ni_av_qi = tmp_q; @@ -2252,38 +2193,37 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { if (EOF == input_stats(twopass, &this_frame)) return; - this_frame_intra_error = this_frame.intra_error; - this_frame_coded_error = this_frame.coded_error; + // Local copy of the current frame's first pass stats. + this_frame_copy = this_frame; // Keyframe and section processing. if (rc->frames_to_key == 0 || - (cm->frame_flags & FRAMEFLAGS_KEY)) { + (cpi->frame_flags & FRAMEFLAGS_KEY)) { // Define next KF group and assign bits to it. - this_frame_copy = this_frame; find_next_key_frame(cpi, &this_frame_copy); - // Don't place key frame in any enhancement layers in spatial svc - if (cpi->use_svc && cpi->svc.number_temporal_layers == 1 && - cpi->svc.spatial_layer_id > 0) { - cm->frame_type = INTER_FRAME; - } } else { cm->frame_type = INTER_FRAME; } - // Is this frame a GF / ARF? (Note: a key frame is always also a GF). - if (rc->frames_till_gf_update_due == 0) { - // Define next gf group and assign bits to it. - this_frame_copy = this_frame; - -#if CONFIG_MULTIPLE_ARF - if (cpi->multi_arf_enabled) { - define_fixed_arf_period(cpi); + if (is_spatial_svc(cpi)) { + if (cpi->svc.spatial_layer_id == 0) { + lc->is_key_frame = (cm->frame_type == KEY_FRAME); + if (lc->is_key_frame) + cpi->ref_frame_flags &= + (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); } else { -#endif - define_gf_group(cpi, &this_frame_copy); -#if CONFIG_MULTIPLE_ARF + cm->frame_type = INTER_FRAME; + lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame; + + if (lc->is_key_frame) { + cpi->ref_frame_flags &= (~VP9_LAST_FLAG); + } } -#endif + } + + // Define a new GF/ARF group. 
(Should always enter here for key frames). + if (rc->frames_till_gf_update_due == 0) { + define_gf_group(cpi, &this_frame_copy); if (twopass->gf_zeromotion_pct > 995) { // As long as max_thresh for encode breakout is small enough, it is ok @@ -2296,53 +2236,50 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { } rc->frames_till_gf_update_due = rc->baseline_gf_interval; - cpi->refresh_golden_frame = 1; - } else { - // Otherwise this is an ordinary frame. - // Assign bits from those allocated to the GF group. - this_frame_copy = this_frame; - assign_std_frame_bits(cpi, &this_frame_copy); + if (!is_spatial_svc(cpi)) + cpi->refresh_golden_frame = 1; } - // Keep a globally available copy of this and the next frame's iiratio. - twopass->this_iiratio = (int)(this_frame_intra_error / - DOUBLE_DIVIDE_CHECK(this_frame_coded_error)); - { - FIRSTPASS_STATS next_frame; - if (lookup_next_frame_stats(twopass, &next_frame) != EOF) { - twopass->next_iiratio = (int)(next_frame.intra_error / - DOUBLE_DIVIDE_CHECK(next_frame.coded_error)); - } - } + configure_buffer_updates(cpi); + target_rate = twopass->gf_group.bit_allocation[twopass->gf_group.index]; if (cpi->common.frame_type == KEY_FRAME) - target = vp9_rc_clamp_iframe_target_size(cpi, rc->this_frame_target); + target_rate = vp9_rc_clamp_iframe_target_size(cpi, target_rate); else - target = vp9_rc_clamp_pframe_target_size(cpi, rc->this_frame_target); - vp9_rc_set_frame_target(cpi, target); + target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate); + + rc->base_frame_target = target_rate; + + // Correction to rate target based on prior over or under shoot. + if (cpi->oxcf.rc_mode == VPX_VBR) + vbr_rate_correction(&target_rate, rc->vbr_bits_off_target); + + vp9_rc_set_frame_target(cpi, target_rate); // Update the total stats remaining structure. 
subtract_stats(&twopass->total_left_stats, &this_frame); } void vp9_twopass_postencode_update(VP9_COMP *cpi) { -#ifdef DISABLE_RC_LONG_TERM_MEM - const uint64_t bits_used = cpi->rc.this_frame_target; -#else - const uint64_t bits_used = cpi->rc.projected_frame_size; -#endif - cpi->twopass.bits_left -= bits_used; - cpi->twopass.bits_left = MAX(cpi->twopass.bits_left, 0); - // Update bits left to the kf and gf groups to account for overshoot or - // undershoot on these frames. - if (cpi->common.frame_type == KEY_FRAME) { - // For key frames kf_group_bits already had the target bits subtracted out. - // So now update to the correct value based on the actual bits used. - cpi->twopass.kf_group_bits += cpi->rc.this_frame_target - bits_used; - } else { - cpi->twopass.kf_group_bits -= bits_used; - cpi->twopass.gf_group_bits -= bits_used; - cpi->twopass.gf_group_bits = MAX(cpi->twopass.gf_group_bits, 0); + TWO_PASS *const twopass = &cpi->twopass; + RATE_CONTROL *const rc = &cpi->rc; + + // VBR correction is done through rc->vbr_bits_off_target. Based on the + // sign of this value, a limited % adjustment is made to the target rate + // of subsequent frames, to try and push it back towards 0. This method + // is designed to prevent extreme behaviour at the end of a clip + // or group of frames. + const int bits_used = rc->base_frame_target; + rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size; + + twopass->bits_left = MAX(twopass->bits_left - bits_used, 0); + + if (cpi->common.frame_type != KEY_FRAME && + !vp9_is_upper_layer_key_frame(cpi)) { + twopass->kf_group_bits -= bits_used; } - cpi->twopass.kf_group_bits = MAX(cpi->twopass.kf_group_bits, 0); + twopass->kf_group_bits = MAX(twopass->kf_group_bits, 0); + + // Increment the gf group index ready for the next frame. 
+ ++twopass->gf_group.index; } diff --git a/libvpx/vp9/encoder/vp9_firstpass.h b/libvpx/vp9/encoder/vp9_firstpass.h index 7a16c8fbf..bf8c9fd96 100644 --- a/libvpx/vp9/encoder/vp9_firstpass.h +++ b/libvpx/vp9/encoder/vp9_firstpass.h @@ -11,16 +11,39 @@ #ifndef VP9_ENCODER_VP9_FIRSTPASS_H_ #define VP9_ENCODER_VP9_FIRSTPASS_H_ +#include "vp9/encoder/vp9_lookahead.h" +#include "vp9/encoder/vp9_ratectrl.h" + #ifdef __cplusplus extern "C" { #endif +#if CONFIG_FP_MB_STATS + +#define FPMB_DCINTRA_MASK 0x01 + +#define FPMB_MOTION_ZERO_MASK 0x02 +#define FPMB_MOTION_LEFT_MASK 0x04 +#define FPMB_MOTION_RIGHT_MASK 0x08 +#define FPMB_MOTION_UP_MASK 0x10 +#define FPMB_MOTION_DOWN_MASK 0x20 + +#define FPMB_ERROR_SMALL_MASK 0x40 +#define FPMB_ERROR_LARGE_MASK 0x80 +#define FPMB_ERROR_SMALL_TH 2000 +#define FPMB_ERROR_LARGE_TH 48000 + +typedef struct { + uint8_t *mb_stats_start; + uint8_t *mb_stats_end; +} FIRSTPASS_MB_STATS; +#endif + typedef struct { double frame; double intra_error; double coded_error; double sr_coded_error; - double ssim_weighted_pred_err; double pcnt_inter; double pcnt_motion; double pcnt_second_ref; @@ -38,10 +61,27 @@ typedef struct { int64_t spatial_layer_id; } FIRSTPASS_STATS; -struct twopass_rc { +typedef enum { + KF_UPDATE = 0, + LF_UPDATE = 1, + GF_UPDATE = 2, + ARF_UPDATE = 3, + OVERLAY_UPDATE = 4, + FRAME_UPDATE_TYPES = 5 +} FRAME_UPDATE_TYPE; + +typedef struct { + unsigned char index; + RATE_FACTOR_LEVEL rf_level[(MAX_LAG_BUFFERS * 2) + 1]; + FRAME_UPDATE_TYPE update_type[(MAX_LAG_BUFFERS * 2) + 1]; + unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1]; + unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1]; + unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1]; + int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1]; +} GF_GROUP; + +typedef struct { unsigned int section_intra_rating; - unsigned int next_iiratio; - unsigned int this_iiratio; FIRSTPASS_STATS total_stats; FIRSTPASS_STATS this_frame_stats; const FIRSTPASS_STATS *stats_in; @@ -50,37 
+90,32 @@ struct twopass_rc { FIRSTPASS_STATS total_left_stats; int first_pass_done; int64_t bits_left; - int64_t clip_bits_total; - double avg_iiratio; double modified_error_min; double modified_error_max; - double modified_error_total; double modified_error_left; double kf_intra_err_min; double gf_intra_err_min; - int kf_bits; - // Remaining error from uncoded frames in a gf group. Two pass use only - int64_t gf_group_error_left; + +#if CONFIG_FP_MB_STATS + uint8_t *frame_mb_stats_buf; + uint8_t *this_frame_mb_stats; + FIRSTPASS_MB_STATS firstpass_mb_stats; +#endif // Projected total bits available for a key frame group of frames int64_t kf_group_bits; // Error score of frames still to be coded in kf group int64_t kf_group_error_left; - - // Projected Bits available for a group of frames including 1 GF or ARF - int64_t gf_group_bits; - // Bits for the golden frame or ARF - 2 pass only - int gf_bits; - int alt_extra_bits; - int sr_update_lag; int kf_zeromotion_pct; int gf_zeromotion_pct; int active_worst_quality; -}; + + GF_GROUP gf_group; +} TWO_PASS; struct VP9_COMP; @@ -91,8 +126,6 @@ void vp9_end_first_pass(struct VP9_COMP *cpi); void vp9_init_second_pass(struct VP9_COMP *cpi); void vp9_rc_get_second_pass_params(struct VP9_COMP *cpi); -int vp9_twopass_worst_quality(struct VP9_COMP *cpi, FIRSTPASS_STATS *fpstats, - int section_target_bandwitdh); // Post encode update of the rate control parameters for 2-pass void vp9_twopass_postencode_update(struct VP9_COMP *cpi); diff --git a/libvpx/vp9/encoder/vp9_lookahead.c b/libvpx/vp9/encoder/vp9_lookahead.c index cf03e0142..e7435170e 100644 --- a/libvpx/vp9/encoder/vp9_lookahead.c +++ b/libvpx/vp9/encoder/vp9_lookahead.c @@ -14,18 +14,9 @@ #include "vp9/common/vp9_common.h" +#include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_extend.h" #include "vp9/encoder/vp9_lookahead.h" -#include "vp9/encoder/vp9_onyx_int.h" - -struct lookahead_ctx { - unsigned int max_sz; /* Absolute size of the queue */ - unsigned int 
sz; /* Number of buffers currently in the queue */ - unsigned int read_idx; /* Read index */ - unsigned int write_idx; /* Write index */ - struct lookahead_entry *buf; /* Buffer list */ -}; - /* Return the buffer at the given absolute index and increment the index */ static struct lookahead_entry *pop(struct lookahead_ctx *ctx, diff --git a/libvpx/vp9/encoder/vp9_lookahead.h b/libvpx/vp9/encoder/vp9_lookahead.h index 046c533cc..678c51a1b 100644 --- a/libvpx/vp9/encoder/vp9_lookahead.h +++ b/libvpx/vp9/encoder/vp9_lookahead.h @@ -14,24 +14,38 @@ #include "vpx_scale/yv12config.h" #include "vpx/vpx_integer.h" +#if CONFIG_SPATIAL_SVC +#include "vpx/vp8cx.h" +#include "vpx/vpx_encoder.h" +#endif + #ifdef __cplusplus extern "C" { #endif #define MAX_LAG_BUFFERS 25 -// The max of past frames we want to keep in the queue. -#define MAX_PRE_FRAMES 1 - struct lookahead_entry { YV12_BUFFER_CONFIG img; int64_t ts_start; int64_t ts_end; unsigned int flags; + +#if CONFIG_SPATIAL_SVC + vpx_svc_parameters_t svc_params[VPX_SS_MAX_LAYERS]; +#endif }; +// The max of past frames we want to keep in the queue. 
+#define MAX_PRE_FRAMES 1 -struct lookahead_ctx; +struct lookahead_ctx { + unsigned int max_sz; /* Absolute size of the queue */ + unsigned int sz; /* Number of buffers currently in the queue */ + unsigned int read_idx; /* Read index */ + unsigned int write_idx; /* Write index */ + struct lookahead_entry *buf; /* Buffer list */ +}; /**\brief Initializes the lookahead stage * diff --git a/libvpx/vp9/encoder/vp9_mbgraph.c b/libvpx/vp9/encoder/vp9_mbgraph.c index 44b171fd1..6e04e2a9c 100644 --- a/libvpx/vp9/encoder/vp9_mbgraph.c +++ b/libvpx/vp9/encoder/vp9_mbgraph.c @@ -11,7 +11,6 @@ #include <limits.h> #include "vpx_mem/vpx_mem.h" -#include "vp9/encoder/vp9_rdopt.h" #include "vp9/encoder/vp9_segmentation.h" #include "vp9/encoder/vp9_mcomp.h" #include "vp9/common/vp9_blockd.h" @@ -20,15 +19,15 @@ #include "vp9/common/vp9_systemdependent.h" - static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, const MV *ref_mv, MV *dst_mv, int mb_row, int mb_col) { - MACROBLOCK *const x = &cpi->mb; + MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16]; + const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + const vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16]; const int tmp_col_min = x->mv_col_min; const int tmp_col_max = x->mv_col_max; @@ -37,9 +36,8 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, MV ref_full; // Further step/diamond searches as necessary - int step_param = cpi->sf.reduce_first_step_size + - (cpi->speed < 8 ? (cpi->speed > 5 ? 
1 : 0) : 2); - step_param = MIN(step_param, (cpi->sf.max_step_search_steps - 2)); + int step_param = mv_sf->reduce_first_step_size; + step_param = MIN(step_param, MAX_MVSEARCH_STEPS - 2); vp9_set_mv_search_range(x, ref_mv); @@ -57,8 +55,8 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, unsigned int sse; cpi->find_fractional_mv_step( x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit, - &v_fn_ptr, 0, cpi->sf.subpel_iters_per_step, NULL, NULL, &distortion, - &sse); + &v_fn_ptr, 0, mv_sf->subpel_iters_per_step, NULL, NULL, &distortion, + &sse, NULL, 0, 0); } xd->mi[0]->mbmi.mode = NEWMV; @@ -73,44 +71,40 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, x->mv_row_max = tmp_row_max; return vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].dst.buf, xd->plane[0].dst.stride, - INT_MAX); + xd->plane[0].dst.buf, xd->plane[0].dst.stride); } -static int do_16x16_motion_search(VP9_COMP *cpi, const int_mv *ref_mv, +static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv, int_mv *dst_mv, int mb_row, int mb_col) { MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; unsigned int err, tmp_err; - int_mv tmp_mv; + MV tmp_mv; // Try zero MV first // FIXME should really use something like near/nearest MV and/or MV prediction err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride, - INT_MAX); + xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride); dst_mv->as_int = 0; // Test last reference frame using the previous best mv as the // starting point (best reference) for the search - tmp_err = do_16x16_motion_iteration(cpi, &ref_mv->as_mv, &tmp_mv.as_mv, - mb_row, mb_col); + tmp_err = do_16x16_motion_iteration(cpi, ref_mv, &tmp_mv, mb_row, mb_col); if (tmp_err < err) { err = tmp_err; - dst_mv->as_int = tmp_mv.as_int; + dst_mv->as_mv = tmp_mv; } // If the current best reference mv is not centered on 0,0 then do a 0,0 // based search 
as well. - if (ref_mv->as_int) { + if (ref_mv->row != 0 || ref_mv->col != 0) { unsigned int tmp_err; - int_mv zero_ref_mv, tmp_mv; + MV zero_ref_mv = {0, 0}, tmp_mv; - zero_ref_mv.as_int = 0; - tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv.as_mv, &tmp_mv.as_mv, + tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv, mb_row, mb_col); if (tmp_err < err) { - dst_mv->as_int = tmp_mv.as_int; + dst_mv->as_mv = tmp_mv; err = tmp_err; } } @@ -126,18 +120,16 @@ static int do_16x16_zerozero_search(VP9_COMP *cpi, int_mv *dst_mv) { // Try zero MV first // FIXME should really use something like near/nearest MV and/or MV prediction err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride, - INT_MAX); + xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride); dst_mv->as_int = 0; return err; } -static int find_best_16x16_intra(VP9_COMP *cpi, - MB_PREDICTION_MODE *pbest_mode) { +static int find_best_16x16_intra(VP9_COMP *cpi, PREDICTION_MODE *pbest_mode) { MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - MB_PREDICTION_MODE best_mode = -1, mode; + PREDICTION_MODE best_mode = -1, mode; unsigned int best_err = INT_MAX; // calculate SATD for each intra prediction mode; @@ -151,7 +143,7 @@ static int find_best_16x16_intra(VP9_COMP *cpi, xd->plane[0].dst.buf, xd->plane[0].dst.stride, 0, 0, 0); err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].dst.buf, xd->plane[0].dst.stride, best_err); + xd->plane[0].dst.buf, xd->plane[0].dst.stride); // find best if (err < best_err) { @@ -173,7 +165,7 @@ static void update_mbgraph_mb_stats YV12_BUFFER_CONFIG *buf, int mb_y_offset, YV12_BUFFER_CONFIG *golden_ref, - int_mv *prev_golden_ref_mv, + const MV *prev_golden_ref_mv, YV12_BUFFER_CONFIG *alt_ref, int mb_row, int mb_col @@ -239,13 +231,12 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, int mb_col, mb_row, offset = 0; int mb_y_offset = 0, arf_y_offset = 0, 
gld_y_offset = 0; - int_mv arf_top_mv, gld_top_mv; - MODE_INFO mi_local = { { 0 } }; + MV gld_top_mv = {0, 0}; + MODE_INFO mi_local; + vp9_zero(mi_local); // Set up limit values for motion vectors to prevent them extending outside // the UMV borders. - arf_top_mv.as_int = 0; - gld_top_mv.as_int = 0; x->mv_row_min = -BORDER_MV_PIXELS_B16; x->mv_row_max = (cm->mb_rows - 1) * 8 + BORDER_MV_PIXELS_B16; xd->up_available = 0; @@ -258,15 +249,13 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, mi_local.mbmi.ref_frame[1] = NONE; for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { - int_mv arf_left_mv, gld_left_mv; + MV gld_left_mv = gld_top_mv; int mb_y_in_offset = mb_y_offset; int arf_y_in_offset = arf_y_offset; int gld_y_in_offset = gld_y_offset; // Set up limit values for motion vectors to prevent them extending outside // the UMV borders. - arf_left_mv.as_int = arf_top_mv.as_int; - gld_left_mv.as_int = gld_top_mv.as_int; x->mv_col_min = -BORDER_MV_PIXELS_B16; x->mv_col_max = (cm->mb_cols - 1) * 8 + BORDER_MV_PIXELS_B16; xd->left_available = 0; @@ -277,11 +266,9 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, update_mbgraph_mb_stats(cpi, mb_stats, buf, mb_y_in_offset, golden_ref, &gld_left_mv, alt_ref, mb_row, mb_col); - arf_left_mv.as_int = mb_stats->ref[ALTREF_FRAME].m.mv.as_int; - gld_left_mv.as_int = mb_stats->ref[GOLDEN_FRAME].m.mv.as_int; + gld_left_mv = mb_stats->ref[GOLDEN_FRAME].m.mv.as_mv; if (mb_col == 0) { - arf_top_mv.as_int = arf_left_mv.as_int; - gld_top_mv.as_int = gld_left_mv.as_int; + gld_top_mv = gld_left_mv; } xd->left_available = 1; mb_y_in_offset += 16; diff --git a/libvpx/vp9/encoder/vp9_mbgraph.h b/libvpx/vp9/encoder/vp9_mbgraph.h index bc2a7048f..c3af972bc 100644 --- a/libvpx/vp9/encoder/vp9_mbgraph.h +++ b/libvpx/vp9/encoder/vp9_mbgraph.h @@ -20,7 +20,7 @@ typedef struct { int err; union { int_mv mv; - MB_PREDICTION_MODE mode; + PREDICTION_MODE mode; } m; } ref[MAX_REF_FRAMES]; } MBGRAPH_MB_STATS; diff --git 
a/libvpx/vp9/encoder/vp9_mcomp.c b/libvpx/vp9/encoder/vp9_mcomp.c index f7a02a4a7..ae924d596 100644 --- a/libvpx/vp9/encoder/vp9_mcomp.c +++ b/libvpx/vp9/encoder/vp9_mcomp.c @@ -18,7 +18,7 @@ #include "vp9/common/vp9_common.h" -#include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_mcomp.h" // #define NEW_DIAMOND_SEARCH @@ -51,22 +51,20 @@ void vp9_set_mv_search_range(MACROBLOCK *x, const MV *mv) { x->mv_row_max = row_max; } -int vp9_init_search_range(VP9_COMP *cpi, int size) { +int vp9_init_search_range(int size) { int sr = 0; - // Minimum search size no matter what the passed in value. size = MAX(16, size); while ((size << sr) < MAX_FULL_PEL_VAL) sr++; - sr += cpi->sf.reduce_first_step_size; - sr = MIN(sr, (cpi->sf.max_step_search_steps - 2)); + sr = MIN(sr, MAX_MVSEARCH_STEPS - 2); return sr; } static INLINE int mv_cost(const MV *mv, - const int *joint_cost, int *comp_cost[2]) { + const int *joint_cost, int *const comp_cost[2]) { return joint_cost[vp9_get_mv_joint(mv)] + comp_cost[0][mv->row] + comp_cost[1][mv->col]; } @@ -90,44 +88,43 @@ static int mv_err_cost(const MV *mv, const MV *ref, return 0; } -static int mvsad_err_cost(const MV *mv, const MV *ref, - const int *mvjsadcost, int *mvsadcost[2], +static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref, int error_per_bit) { - if (mvsadcost) { + if (x->nmvsadcost) { const MV diff = { mv->row - ref->row, mv->col - ref->col }; - return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjsadcost, mvsadcost) * - error_per_bit, 8); + return ROUND_POWER_OF_TWO(mv_cost(&diff, x->nmvjointsadcost, + x->nmvsadcost) * error_per_bit, 8); } return 0; } -void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride) { +void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride) { int len, ss_count = 1; - x->ss[0].mv.col = x->ss[0].mv.row = 0; - x->ss[0].offset = 0; + cfg->ss[0].mv.col = cfg->ss[0].mv.row = 0; + cfg->ss[0].offset = 0; for (len = MAX_FIRST_STEP; 
len > 0; len /= 2) { // Generate offsets for 4 search sites per step. const MV ss_mvs[] = {{-len, 0}, {len, 0}, {0, -len}, {0, len}}; int i; for (i = 0; i < 4; ++i) { - search_site *const ss = &x->ss[ss_count++]; + search_site *const ss = &cfg->ss[ss_count++]; ss->mv = ss_mvs[i]; ss->offset = ss->mv.row * stride + ss->mv.col; } } - x->ss_count = ss_count; - x->searches_per_step = 4; + cfg->ss_count = ss_count; + cfg->searches_per_step = 4; } -void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) { +void vp9_init3smotion_compensation(search_site_config *cfg, int stride) { int len, ss_count = 1; - x->ss[0].mv.col = x->ss[0].mv.row = 0; - x->ss[0].offset = 0; + cfg->ss[0].mv.col = cfg->ss[0].mv.row = 0; + cfg->ss[0].offset = 0; for (len = MAX_FIRST_STEP; len > 0; len /= 2) { // Generate offsets for 8 search sites per step. @@ -137,14 +134,14 @@ void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) { }; int i; for (i = 0; i < 8; ++i) { - search_site *const ss = &x->ss[ss_count++]; + search_site *const ss = &cfg->ss[ss_count++]; ss->mv = ss_mvs[i]; ss->offset = ss->mv.row * stride + ss->mv.col; } } - x->ss_count = ss_count; - x->searches_per_step = 8; + cfg->ss_count = ss_count; + cfg->searches_per_step = 8; } /* @@ -170,20 +167,19 @@ static INLINE int sp(int x) { return (x & 7) << 1; } -static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c, - int offset) { - return &buf[(r >> 3) * stride + (c >> 3) - offset]; +static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { + return &buf[(r >> 3) * stride + (c >> 3)]; } -/* returns subpixel variance error function */ -#define DIST(r, c) \ - vfp->svf(pre(y, y_stride, r, c, offset), y_stride, sp(c), sp(r), z, \ - src_stride, &sse) - /* checks if (r, c) has better score than previous best */ #define CHECK_BETTER(v, r, c) \ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ - thismse = (DIST(r, c)); \ + if (second_pred == NULL) \ + thismse = vfp->svf(pre(y, 
y_stride, r, c), y_stride, sp(c), sp(r), z, \ + src_stride, &sse); \ + else \ + thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \ + z, src_stride, &sse, second_pred); \ if ((v = MVC(r, c) + thismse) < besterr) { \ besterr = v; \ br = r; \ @@ -269,106 +265,10 @@ int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x, int iters_per_step, int *mvjcost, int *mvcost[2], int *distortion, - unsigned int *sse1) { - const uint8_t *z = x->plane[0].src.buf; - const int src_stride = x->plane[0].src.stride; - const MACROBLOCKD *xd = &x->e_mbd; - unsigned int besterr = INT_MAX; - unsigned int sse; - unsigned int whichdir; - int thismse; - unsigned int halfiters = iters_per_step; - unsigned int quarteriters = iters_per_step; - unsigned int eighthiters = iters_per_step; - - const int y_stride = xd->plane[0].pre[0].stride; - const int offset = bestmv->row * y_stride + bestmv->col; - const uint8_t *y = xd->plane[0].pre[0].buf + offset; - - int rr = ref_mv->row; - int rc = ref_mv->col; - int br = bestmv->row * 8; - int bc = bestmv->col * 8; - int hstep = 4; - const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX); - const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX); - const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX); - const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX); - - int tr = br; - int tc = bc; - - // central mv - bestmv->row *= 8; - bestmv->col *= 8; - - // calculate central point error - besterr = vfp->vf(y, y_stride, z, src_stride, sse1); - *distortion = besterr; - besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); - - // 1/2 pel - FIRST_LEVEL_CHECKS; - if (halfiters > 1) { - SECOND_LEVEL_CHECKS; - } - tr = br; - tc = bc; - - // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only - if (forced_stop != 2) { - hstep >>= 1; - FIRST_LEVEL_CHECKS; - if (quarteriters > 1) { - SECOND_LEVEL_CHECKS; - } - tr = br; - tc = bc; - } - - if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) { - 
hstep >>= 1; - FIRST_LEVEL_CHECKS; - if (eighthiters > 1) { - SECOND_LEVEL_CHECKS; - } - tr = br; - tc = bc; - } - // These lines insure static analysis doesn't warn that - // tr and tc aren't used after the above point. - (void) tr; - (void) tc; - - bestmv->row = br; - bestmv->col = bc; - - if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || - (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) - return INT_MAX; - - return besterr; -} - -#undef DIST -/* returns subpixel variance error function */ -#define DIST(r, c) \ - vfp->svaf(pre(y, y_stride, r, c, offset), y_stride, sp(c), sp(r), \ - z, src_stride, &sse, second_pred) - -int vp9_find_best_sub_pixel_comp_tree(const MACROBLOCK *x, - MV *bestmv, const MV *ref_mv, - int allow_hp, - int error_per_bit, - const vp9_variance_fn_ptr_t *vfp, - int forced_stop, - int iters_per_step, - int *mvjcost, int *mvcost[2], - int *distortion, - unsigned int *sse1, - const uint8_t *second_pred, - int w, int h) { - const uint8_t *z = x->plane[0].src.buf; + unsigned int *sse1, + const uint8_t *second_pred, + int w, int h) { + const uint8_t *const z = x->plane[0].src.buf; const int src_stride = x->plane[0].src.stride; const MACROBLOCKD *xd = &x->e_mbd; unsigned int besterr = INT_MAX; @@ -379,10 +279,9 @@ int vp9_find_best_sub_pixel_comp_tree(const MACROBLOCK *x, const unsigned int quarteriters = iters_per_step; const unsigned int eighthiters = iters_per_step; - DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); const int y_stride = xd->plane[0].pre[0].stride; const int offset = bestmv->row * y_stride + bestmv->col; - const uint8_t *y = xd->plane[0].pre[0].buf + offset; + const uint8_t *const y = xd->plane[0].pre[0].buf; int rr = ref_mv->row; int rc = ref_mv->col; @@ -404,8 +303,13 @@ int vp9_find_best_sub_pixel_comp_tree(const MACROBLOCK *x, // calculate central point error // TODO(yunqingwang): central pointer error was already calculated in full- // pixel search, and can be passed in this function. 
- vp9_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride); - besterr = vfp->vf(comp_pred, w, z, src_stride, sse1); + if (second_pred != NULL) { + DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); + vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); + besterr = vfp->vf(comp_pred, w, z, src_stride, sse1); + } else { + besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1); + } *distortion = besterr; besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); @@ -459,7 +363,6 @@ int vp9_find_best_sub_pixel_comp_tree(const MACROBLOCK *x, #undef MVC #undef PRE -#undef DIST #undef CHECK_BETTER static INLINE int check_bounds(const MACROBLOCK *x, int row, int col, @@ -479,8 +382,7 @@ static INLINE int is_mv_in(const MACROBLOCK *x, const MV *mv) { {\ if (thissad < bestsad) {\ if (use_mvcost) \ - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, \ - mvjsadcost, mvsadcost, sad_per_bit);\ + thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);\ if (thissad < bestsad) {\ bestsad = thissad;\ best_site = i;\ @@ -520,9 +422,6 @@ static int vp9_pattern_search(const MACROBLOCK *x, int k = -1; const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; int best_init_s = search_param_to_steps[search_param]; - const int *const mvjsadcost = x->nmvjointsadcost; - int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; - // adjust ref_mv to make sure it is within MV range clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); br = ref_mv->row; @@ -530,9 +429,8 @@ static int vp9_pattern_search(const MACROBLOCK *x, // Work out the start point for the search bestsad = vfp->sdf(what->buf, what->stride, - get_buf_from_mv(in_what, ref_mv), in_what->stride, - 0x7fffffff) + mvsad_err_cost(ref_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); + get_buf_from_mv(in_what, ref_mv), in_what->stride) + + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); // Search all possible scales upto the search param 
around the center point // pick the scale of the point that is best as the starting scale of @@ -548,7 +446,7 @@ static int vp9_pattern_search(const MACROBLOCK *x, bc + candidates[t][i].col}; thissad = vfp->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &this_mv), - in_what->stride, bestsad); + in_what->stride); CHECK_BETTER } } else { @@ -559,7 +457,7 @@ static int vp9_pattern_search(const MACROBLOCK *x, continue; thissad = vfp->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &this_mv), - in_what->stride, bestsad); + in_what->stride); CHECK_BETTER } } @@ -591,7 +489,7 @@ static int vp9_pattern_search(const MACROBLOCK *x, bc + candidates[s][i].col}; thissad = vfp->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &this_mv), - in_what->stride, bestsad); + in_what->stride); CHECK_BETTER } } else { @@ -602,7 +500,7 @@ static int vp9_pattern_search(const MACROBLOCK *x, continue; thissad = vfp->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &this_mv), - in_what->stride, bestsad); + in_what->stride); CHECK_BETTER } } @@ -629,7 +527,7 @@ static int vp9_pattern_search(const MACROBLOCK *x, bc + candidates[s][next_chkpts_indices[i]].col}; thissad = vfp->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &this_mv), - in_what->stride, bestsad); + in_what->stride); CHECK_BETTER } } else { @@ -640,7 +538,7 @@ static int vp9_pattern_search(const MACROBLOCK *x, continue; thissad = vfp->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &this_mv), - in_what->stride, bestsad); + in_what->stride); CHECK_BETTER } } @@ -667,7 +565,7 @@ static int vp9_pattern_search(const MACROBLOCK *x, bc + neighbors[i].col}; thissad = vfp->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &this_mv), - in_what->stride, bestsad); + in_what->stride); CHECK_BETTER } } else { @@ -678,7 +576,7 @@ static int vp9_pattern_search(const MACROBLOCK *x, continue; thissad = vfp->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &this_mv), - in_what->stride, bestsad); + 
in_what->stride); CHECK_BETTER } } @@ -877,201 +775,84 @@ int vp9_fast_dia_search(const MACROBLOCK *x, #undef CHECK_BETTER -int vp9_full_range_search_c(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, +int vp9_full_range_search_c(const MACROBLOCK *x, + const search_site_config *cfg, + MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, const vp9_variance_fn_ptr_t *fn_ptr, - int *mvjcost, int *mvcost[2], const MV *center_mv) { const MACROBLOCKD *const xd = &x->e_mbd; - const uint8_t *what = x->plane[0].src.buf; - const int what_stride = x->plane[0].src.stride; - const uint8_t *in_what; - const int in_what_stride = xd->plane[0].pre[0].stride; - - unsigned int bestsad = INT_MAX; - int ref_row, ref_col; - - unsigned int thissad; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + const int range = 64; const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; + unsigned int best_sad = INT_MAX; + int r, c, i; + int start_col, end_col, start_row, end_row; - const int *mvjsadcost = x->nmvjointsadcost; - int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; - - int tr, tc; - int best_tr = 0; - int best_tc = 0; - int range = 64; - - int start_col, end_col; - int start_row, end_row; - int i; + // The cfg and search_param parameters are not used in this search variant + (void)cfg; + (void)search_param; clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); - ref_row = ref_mv->row; - ref_col = ref_mv->col; + *best_mv = *ref_mv; *num00 = 11; - best_mv->row = ref_row; - best_mv->col = ref_col; - - // Work out the start point for the search - in_what = xd->plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; - - // Check the starting position - bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) - + mvsad_err_cost(best_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); - - start_row = MAX(-range, x->mv_row_min - ref_row); - 
start_col = MAX(-range, x->mv_col_min - ref_col); - end_row = MIN(range, x->mv_row_max - ref_row); - end_col = MIN(range, x->mv_col_max - ref_col); - - for (tr = start_row; tr <= end_row; ++tr) { - for (tc = start_col; tc <= end_col; tc += 4) { - if ((tc + 3) <= end_col) { - unsigned int sad_array[4]; - unsigned char const *addr_ref[4]; - for (i = 0; i < 4; ++i) - addr_ref[i] = in_what + tr * in_what_stride + tc + i; - - fn_ptr->sdx4df(what, what_stride, addr_ref, in_what_stride, sad_array); - + best_sad = fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, ref_mv), in_what->stride) + + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); + start_row = MAX(-range, x->mv_row_min - ref_mv->row); + start_col = MAX(-range, x->mv_col_min - ref_mv->col); + end_row = MIN(range, x->mv_row_max - ref_mv->row); + end_col = MIN(range, x->mv_col_max - ref_mv->col); + + for (r = start_row; r <= end_row; ++r) { + for (c = start_col; c <= end_col; c += 4) { + if (c + 3 <= end_col) { + unsigned int sads[4]; + const uint8_t *addrs[4]; for (i = 0; i < 4; ++i) { - if (sad_array[i] < bestsad) { - const MV this_mv = {ref_row + tr, ref_col + tc + i}; - thissad = sad_array[i] + - mvsad_err_cost(&this_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); - if (thissad < bestsad) { - bestsad = thissad; - best_tr = tr; - best_tc = tc + i; - } - } + const MV mv = {ref_mv->row + r, ref_mv->col + c + i}; + addrs[i] = get_buf_from_mv(in_what, &mv); } - } else { - for (i = 0; i < end_col - tc; ++i) { - const uint8_t *check_here = in_what + tr * in_what_stride + tc + i; - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, - bestsad); - if (thissad < bestsad) { - const MV this_mv = {ref_row + tr, ref_col + tc + i}; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); + fn_ptr->sdx4df(what->buf, what->stride, addrs, in_what->stride, sads); - if (thissad < bestsad) { - bestsad = thissad; - best_tr = tr; - best_tc = tc + i; + for (i 
= 0; i < 4; ++i) { + if (sads[i] < best_sad) { + const MV mv = {ref_mv->row + r, ref_mv->col + c + i}; + const unsigned int sad = sads[i] + + mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; } } } - } - } - } - best_mv->row += best_tr; - best_mv->col += best_tc; - return bestsad; -} - -int vp9_diamond_search_sad_c(const MACROBLOCK *x, - MV *ref_mv, MV *best_mv, - int search_param, int sad_per_bit, int *num00, - const vp9_variance_fn_ptr_t *fn_ptr, - int *mvjcost, int *mvcost[2], - const MV *center_mv) { - const MACROBLOCKD *const xd = &x->e_mbd; - const struct buf_2d *const what = &x->plane[0].src; - const struct buf_2d *const in_what = &xd->plane[0].pre[0]; - // search_param determines the length of the initial step and hence the number - // of iterations - // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = - // (MAX_FIRST_STEP/4) pel... etc. - const search_site *const ss = &x->ss[search_param * x->searches_per_step]; - const int tot_steps = (x->ss_count / x->searches_per_step) - search_param; - const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; - const int *mvjsadcost = x->nmvjointsadcost; - int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; - const uint8_t *best_address; - int best_sad = INT_MAX; - int best_site = 0; - int last_site = 0; - int i, j, step; - - clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); - best_address = get_buf_from_mv(in_what, ref_mv); - *num00 = 0; - *best_mv = *ref_mv; - - // Check the starting position - best_sad = fn_ptr->sdf(what->buf, what->stride, - in_what->buf, in_what->stride, 0x7fffffff) + - mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost, sad_per_bit); - - i = 1; - - for (step = 0; step < tot_steps; step++) { - for (j = 0; j < x->searches_per_step; j++) { - const MV mv = {best_mv->row + ss[i].mv.row, - best_mv->col + ss[i].mv.col}; - if (is_mv_in(x, &mv)) { - int sad = 
fn_ptr->sdf(what->buf, what->stride, - best_address + ss[i].offset, in_what->stride, - best_sad); - if (sad < best_sad) { - sad += mvsad_err_cost(&mv, &fcenter_mv, mvjsadcost, mvsadcost, - sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - best_site = i; - } - } - } - - i++; - } - - if (best_site != last_site) { - best_mv->row += ss[best_site].mv.row; - best_mv->col += ss[best_site].mv.col; - best_address += ss[best_site].offset; - last_site = best_site; -#if defined(NEW_DIAMOND_SEARCH) - while (1) { - const MV this_mv = {best_mv->row + ss[best_site].mv.row, - best_mv->col + ss[best_site].mv.col}; - if (is_mv_in(x, &this_mv)) { - int sad = fn_ptr->sdf(what->buf, what->stride, - best_address + ss[best_site].offset, - in_what->stride, best_sad); + } else { + for (i = 0; i < end_col - c; ++i) { + const MV mv = {ref_mv->row + r, ref_mv->col + c + i}; + unsigned int sad = fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), in_what->stride); if (sad < best_sad) { - sad += mvsad_err_cost(&this_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); + sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); if (sad < best_sad) { best_sad = sad; - best_mv->row += ss[best_site].mv.row; - best_mv->col += ss[best_site].mv.col; - best_address += ss[best_site].offset; - continue; + *best_mv = mv; } } } - break; - }; -#endif - } else if (best_address == in_what->buf) { - (*num00)++; + } } } + return best_sad; } -int vp9_diamond_search_sadx4(const MACROBLOCK *x, +int vp9_diamond_search_sad_c(const MACROBLOCK *x, + const search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, const vp9_variance_fn_ptr_t *fn_ptr, - int *mvjcost, int *mvcost[2], const MV *center_mv) { int i, j, step; @@ -1094,14 +875,10 @@ int vp9_diamond_search_sadx4(const MACROBLOCK *x, // 0 = initial step (MAX_FIRST_STEP) pel // 1 = (MAX_FIRST_STEP/2) pel, // 2 = (MAX_FIRST_STEP/4) pel... 
- const search_site *ss = &x->ss[search_param * x->searches_per_step]; - const int tot_steps = (x->ss_count / x->searches_per_step) - search_param; + const search_site *ss = &cfg->ss[search_param * cfg->searches_per_step]; + const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param; const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; - - const int *mvjsadcost = x->nmvjointsadcost; - int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; - clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); ref_row = ref_mv->row; ref_col = ref_mv->col; @@ -1114,9 +891,8 @@ int vp9_diamond_search_sadx4(const MACROBLOCK *x, best_address = in_what; // Check the starting position - bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) - + mvsad_err_cost(best_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); + bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride) + + mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit); i = 1; @@ -1136,7 +912,7 @@ int vp9_diamond_search_sadx4(const MACROBLOCK *x, if (all_in) { unsigned int sad_array[4]; - for (j = 0; j < x->searches_per_step; j += 4) { + for (j = 0; j < cfg->searches_per_step; j += 4) { unsigned char const *block_offset[4]; for (t = 0; t < 4; t++) @@ -1149,9 +925,8 @@ int vp9_diamond_search_sadx4(const MACROBLOCK *x, if (sad_array[t] < bestsad) { const MV this_mv = {best_mv->row + ss[i].mv.row, best_mv->col + ss[i].mv.col}; - sad_array[t] += mvsad_err_cost(&this_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); - + sad_array[t] += mvsad_err_cost(x, &this_mv, &fcenter_mv, + sad_per_bit); if (sad_array[t] < bestsad) { bestsad = sad_array[t]; best_site = i; @@ -1160,7 +935,7 @@ int vp9_diamond_search_sadx4(const MACROBLOCK *x, } } } else { - for (j = 0; j < x->searches_per_step; j++) { + for (j = 0; j < cfg->searches_per_step; j++) { // Trap illegal vectors const MV this_mv = {best_mv->row + ss[i].mv.row, best_mv->col + 
ss[i].mv.col}; @@ -1168,12 +943,10 @@ int vp9_diamond_search_sadx4(const MACROBLOCK *x, if (is_mv_in(x, &this_mv)) { const uint8_t *const check_here = ss[i].offset + best_address; unsigned int thissad = fn_ptr->sdf(what, what_stride, check_here, - in_what_stride, bestsad); + in_what_stride); if (thissad < bestsad) { - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); - + thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); if (thissad < bestsad) { bestsad = thissad; best_site = i; @@ -1195,10 +968,9 @@ int vp9_diamond_search_sadx4(const MACROBLOCK *x, if (is_mv_in(x, &this_mv)) { const uint8_t *const check_here = ss[best_site].offset + best_address; unsigned int thissad = fn_ptr->sdf(what, what_stride, check_here, - in_what_stride, bestsad); + in_what_stride); if (thissad < bestsad) { - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); + thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); if (thissad < bestsad) { bestsad = thissad; best_mv->row += ss[best_site].mv.row; @@ -1229,10 +1001,9 @@ int vp9_full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x, const MV *ref_mv, MV *dst_mv) { MV temp_mv; int thissme, n, num00 = 0; - int bestsme = cpi->diamond_search_sad(x, mvp_full, &temp_mv, + int bestsme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv, step_param, sadpb, &n, - fn_ptr, x->nmvjointcost, - x->mvcost, ref_mv); + fn_ptr, ref_mv); if (bestsme < INT_MAX) bestsme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); *dst_mv = temp_mv; @@ -1248,10 +1019,9 @@ int vp9_full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x, if (num00) { num00--; } else { - thissme = cpi->diamond_search_sad(x, mvp_full, &temp_mv, + thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv, step_param + n, sadpb, &num00, - fn_ptr, x->nmvjointcost, x->mvcost, - ref_mv); + fn_ptr, ref_mv); if (thissme < INT_MAX) thissme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, 
fn_ptr, 1); @@ -1271,8 +1041,7 @@ int vp9_full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x, const int search_range = 8; MV best_mv = *dst_mv; thissme = cpi->refining_search_sad(x, &best_mv, sadpb, search_range, - fn_ptr, x->nmvjointcost, x->mvcost, - ref_mv); + fn_ptr, ref_mv); if (thissme < INT_MAX) thissme = vp9_get_mvpred_var(x, &best_mv, ref_mv, fn_ptr, 1); if (thissme < bestsme) { @@ -1286,7 +1055,6 @@ int vp9_full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x, int vp9_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv, int sad_per_bit, int distance, const vp9_variance_fn_ptr_t *fn_ptr, - int *mvjcost, int *mvcost[2], const MV *center_mv, MV *best_mv) { int r, c; const MACROBLOCKD *const xd = &x->e_mbd; @@ -1296,22 +1064,18 @@ int vp9_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv, const int row_max = MIN(ref_mv->row + distance, x->mv_row_max); const int col_min = MAX(ref_mv->col - distance, x->mv_col_min); const int col_max = MIN(ref_mv->col + distance, x->mv_col_max); - const int *mvjsadcost = x->nmvjointsadcost; - int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; int best_sad = fn_ptr->sdf(what->buf, what->stride, - get_buf_from_mv(in_what, ref_mv), in_what->stride, 0x7fffffff) + - mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, sad_per_bit); + get_buf_from_mv(in_what, ref_mv), in_what->stride) + + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); *best_mv = *ref_mv; for (r = row_min; r < row_max; ++r) { for (c = col_min; c < col_max; ++c) { const MV mv = {r, c}; const int sad = fn_ptr->sdf(what->buf, what->stride, - get_buf_from_mv(in_what, &mv), in_what->stride, best_sad) + - mvsad_err_cost(&mv, &fcenter_mv, mvjsadcost, mvsadcost, - sad_per_bit); - + get_buf_from_mv(in_what, &mv), in_what->stride) + + mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); if (sad < best_sad) { best_sad = sad; *best_mv = mv; @@ -1324,281 +1088,171 @@ int 
vp9_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv, int vp9_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv, int sad_per_bit, int distance, const vp9_variance_fn_ptr_t *fn_ptr, - int *mvjcost, int *mvcost[2], const MV *center_mv, MV *best_mv) { + int r; const MACROBLOCKD *const xd = &x->e_mbd; - const uint8_t *const what = x->plane[0].src.buf; - const int what_stride = x->plane[0].src.stride; - const uint8_t *const in_what = xd->plane[0].pre[0].buf; - const int in_what_stride = xd->plane[0].pre[0].stride; - MV this_mv; - unsigned int bestsad = INT_MAX; - int r, c; - unsigned int thissad; - int ref_row = ref_mv->row; - int ref_col = ref_mv->col; - - // Apply further limits to prevent us looking using vectors that stretch - // beyond the UMV border - const int row_min = MAX(ref_row - distance, x->mv_row_min); - const int row_max = MIN(ref_row + distance, x->mv_row_max); - const int col_min = MAX(ref_col - distance, x->mv_col_min); - const int col_max = MIN(ref_col + distance, x->mv_col_max); - unsigned int sad_array[3]; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + const int row_min = MAX(ref_mv->row - distance, x->mv_row_min); + const int row_max = MIN(ref_mv->row + distance, x->mv_row_max); + const int col_min = MAX(ref_mv->col - distance, x->mv_col_min); + const int col_max = MIN(ref_mv->col + distance, x->mv_col_max); const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; - const int *mvjsadcost = x->nmvjointsadcost; - int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; - - // Work out the mid point for the search - const uint8_t *bestaddress = &in_what[ref_row * in_what_stride + ref_col]; - - best_mv->row = ref_row; - best_mv->col = ref_col; - - // Baseline value at the centre - bestsad = fn_ptr->sdf(what, what_stride, - bestaddress, in_what_stride, 0x7fffffff) - + mvsad_err_cost(best_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); - - for (r = 
row_min; r < row_max; r++) { - const uint8_t *check_here = &in_what[r * in_what_stride + col_min]; - this_mv.row = r; - c = col_min; - - while ((c + 2) < col_max && fn_ptr->sdx3f != NULL) { - int i; + unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, ref_mv), in_what->stride) + + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); + *best_mv = *ref_mv; - fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array); + for (r = row_min; r < row_max; ++r) { + int c = col_min; + const uint8_t *check_here = &in_what->buf[r * in_what->stride + c]; - for (i = 0; i < 3; i++) { - thissad = sad_array[i]; + if (fn_ptr->sdx3f != NULL) { + while ((c + 2) < col_max) { + int i; + unsigned int sads[3]; - if (thissad < bestsad) { - this_mv.col = c; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); + fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride, + sads); - if (thissad < bestsad) { - bestsad = thissad; - best_mv->row = r; - best_mv->col = c; + for (i = 0; i < 3; ++i) { + unsigned int sad = sads[i]; + if (sad < best_sad) { + const MV mv = {r, c}; + sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } } + ++check_here; + ++c; } - check_here++; - c++; } } while (c < col_max) { - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, - bestsad); - - if (thissad < bestsad) { - this_mv.col = c; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); - - if (thissad < bestsad) { - bestsad = thissad; - best_mv->row = r; - best_mv->col = c; + unsigned int sad = fn_ptr->sdf(what->buf, what->stride, + check_here, in_what->stride); + if (sad < best_sad) { + const MV mv = {r, c}; + sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; } } - - check_here++; - c++; + ++check_here; + ++c; } } - return bestsad; + + 
return best_sad; } int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv, int sad_per_bit, int distance, const vp9_variance_fn_ptr_t *fn_ptr, - int *mvjcost, int *mvcost[2], const MV *center_mv, MV *best_mv) { + int r; const MACROBLOCKD *const xd = &x->e_mbd; - const uint8_t *const what = x->plane[0].src.buf; - const int what_stride = x->plane[0].src.stride; - const uint8_t *const in_what = xd->plane[0].pre[0].buf; - const int in_what_stride = xd->plane[0].pre[0].stride; - MV this_mv; - unsigned int bestsad = INT_MAX; - int r, c; - int ref_row = ref_mv->row; - int ref_col = ref_mv->col; - - // Apply further limits to prevent us looking using vectors that stretch - // beyond the UMV border - const int row_min = MAX(ref_row - distance, x->mv_row_min); - const int row_max = MIN(ref_row + distance, x->mv_row_max); - const int col_min = MAX(ref_col - distance, x->mv_col_min); - const int col_max = MIN(ref_col + distance, x->mv_col_max); - DECLARE_ALIGNED_ARRAY(16, uint32_t, sad_array8, 8); - unsigned int sad_array[3]; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + const int row_min = MAX(ref_mv->row - distance, x->mv_row_min); + const int row_max = MIN(ref_mv->row + distance, x->mv_row_max); + const int col_min = MAX(ref_mv->col - distance, x->mv_col_min); + const int col_max = MIN(ref_mv->col + distance, x->mv_col_max); const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; + unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, ref_mv), in_what->stride) + + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); + *best_mv = *ref_mv; - const int *mvjsadcost = x->nmvjointsadcost; - int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; - - // Work out the mid point for the search - const uint8_t *bestaddress = &in_what[ref_row * in_what_stride + ref_col]; - - best_mv->row = ref_row; - best_mv->col = ref_col; - - // Baseline value at the center - 
bestsad = fn_ptr->sdf(what, what_stride, - bestaddress, in_what_stride, 0x7fffffff) - + mvsad_err_cost(best_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); - - for (r = row_min; r < row_max; r++) { - const uint8_t *check_here = &in_what[r * in_what_stride + col_min]; - this_mv.row = r; - c = col_min; - - while ((c + 7) < col_max) { - int i; - - fn_ptr->sdx8f(what, what_stride, check_here, in_what_stride, sad_array8); + for (r = row_min; r < row_max; ++r) { + int c = col_min; + const uint8_t *check_here = &in_what->buf[r * in_what->stride + c]; - for (i = 0; i < 8; i++) { - unsigned int thissad = (unsigned int)sad_array8[i]; + if (fn_ptr->sdx8f != NULL) { + while ((c + 7) < col_max) { + int i; + unsigned int sads[8]; - if (thissad < bestsad) { - this_mv.col = c; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); + fn_ptr->sdx8f(what->buf, what->stride, check_here, in_what->stride, + sads); - if (thissad < bestsad) { - bestsad = thissad; - best_mv->row = r; - best_mv->col = c; + for (i = 0; i < 8; ++i) { + unsigned int sad = sads[i]; + if (sad < best_sad) { + const MV mv = {r, c}; + sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } } + ++check_here; + ++c; } - - check_here++; - c++; } } - while ((c + 2) < col_max && fn_ptr->sdx3f != NULL) { - int i; - - fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array); + if (fn_ptr->sdx3f != NULL) { + while ((c + 2) < col_max) { + int i; + unsigned int sads[3]; - for (i = 0; i < 3; i++) { - unsigned int thissad = sad_array[i]; + fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride, + sads); - if (thissad < bestsad) { - this_mv.col = c; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); - - if (thissad < bestsad) { - bestsad = thissad; - best_mv->row = r; - best_mv->col = c; + for (i = 0; i < 3; ++i) { + unsigned int sad = sads[i]; + if (sad < 
best_sad) { + const MV mv = {r, c}; + sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } } + ++check_here; + ++c; } - - check_here++; - c++; } } while (c < col_max) { - unsigned int thissad = fn_ptr->sdf(what, what_stride, - check_here, in_what_stride, bestsad); - - if (thissad < bestsad) { - this_mv.col = c; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); - - if (thissad < bestsad) { - bestsad = thissad; - best_mv->row = r; - best_mv->col = c; - } - } - - check_here++; - c++; - } - } - return bestsad; -} - -int vp9_refining_search_sad_c(const MACROBLOCK *x, - MV *ref_mv, int error_per_bit, - int search_range, - const vp9_variance_fn_ptr_t *fn_ptr, - int *mvjcost, int *mvcost[2], - const MV *center_mv) { - const MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}}; - const MACROBLOCKD *const xd = &x->e_mbd; - const struct buf_2d *const what = &x->plane[0].src; - const struct buf_2d *const in_what = &xd->plane[0].pre[0]; - const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; - const int *mvjsadcost = x->nmvjointsadcost; - int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; - - unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride, - get_buf_from_mv(in_what, ref_mv), - in_what->stride, 0x7fffffff) + - mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit); - int i, j; - - for (i = 0; i < search_range; i++) { - int best_site = -1; - - for (j = 0; j < 4; j++) { - const MV mv = {ref_mv->row + neighbors[j].row, - ref_mv->col + neighbors[j].col}; - if (is_mv_in(x, &mv)) { - unsigned int sad = fn_ptr->sdf(what->buf, what->stride, - get_buf_from_mv(in_what, &mv), in_what->stride, best_sad); + unsigned int sad = fn_ptr->sdf(what->buf, what->stride, + check_here, in_what->stride); + if (sad < best_sad) { + const MV mv = {r, c}; + sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); if (sad < best_sad) { - sad += 
mvsad_err_cost(&mv, &fcenter_mv, mvjsadcost, mvsadcost, - error_per_bit); - if (sad < best_sad) { - best_sad = sad; - best_site = j; - } + best_sad = sad; + *best_mv = mv; } } - } - - if (best_site == -1) { - break; - } else { - ref_mv->row += neighbors[best_site].row; - ref_mv->col += neighbors[best_site].col; + ++check_here; + ++c; } } + return best_sad; } -int vp9_refining_search_sadx4(const MACROBLOCK *x, +int vp9_refining_search_sad_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, const vp9_variance_fn_ptr_t *fn_ptr, - int *mvjcost, int *mvcost[2], const MV *center_mv) { const MACROBLOCKD *const xd = &x->e_mbd; const MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}}; const struct buf_2d *const what = &x->plane[0].src; const struct buf_2d *const in_what = &xd->plane[0].pre[0]; const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; - const int *mvjsadcost = x->nmvjointsadcost; - int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; const uint8_t *best_address = get_buf_from_mv(in_what, ref_mv); unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride, best_address, - in_what->stride, 0x7fffffff) + - mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit); + in_what->stride) + + mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit); int i, j; for (i = 0; i < search_range; i++) { @@ -1623,9 +1277,7 @@ int vp9_refining_search_sadx4(const MACROBLOCK *x, if (sads[j] < best_sad) { const MV mv = {ref_mv->row + neighbors[j].row, ref_mv->col + neighbors[j].col}; - sads[j] += mvsad_err_cost(&mv, &fcenter_mv, - mvjsadcost, mvsadcost, error_per_bit); - + sads[j] += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit); if (sads[j] < best_sad) { best_sad = sads[j]; best_site = j; @@ -1640,11 +1292,9 @@ int vp9_refining_search_sadx4(const MACROBLOCK *x, if (is_mv_in(x, &mv)) { unsigned int sad = fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv), - in_what->stride, best_sad); + in_what->stride); if 
(sad < best_sad) { - sad += mvsad_err_cost(&mv, &fcenter_mv, - mvjsadcost, mvsadcost, error_per_bit); - + sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit); if (sad < best_sad) { best_sad = sad; best_site = j; @@ -1672,21 +1322,17 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, const vp9_variance_fn_ptr_t *fn_ptr, - int *mvjcost, int *mvcost[2], const MV *center_mv, - const uint8_t *second_pred, int w, int h) { + const uint8_t *second_pred) { const MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0}, {-1, -1}, {1, -1}, {-1, 1}, {1, 1}}; const MACROBLOCKD *const xd = &x->e_mbd; const struct buf_2d *const what = &x->plane[0].src; const struct buf_2d *const in_what = &xd->plane[0].pre[0]; const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; - const int *mvjsadcost = x->nmvjointsadcost; - int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; unsigned int best_sad = fn_ptr->sdaf(what->buf, what->stride, - get_buf_from_mv(in_what, ref_mv), in_what->stride, - second_pred, 0x7fffffff) + - mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit); + get_buf_from_mv(in_what, ref_mv), in_what->stride, second_pred) + + mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit); int i, j; for (i = 0; i < search_range; ++i) { @@ -1698,11 +1344,9 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, if (is_mv_in(x, &mv)) { unsigned int sad = fn_ptr->sdaf(what->buf, what->stride, - get_buf_from_mv(in_what, &mv), in_what->stride, - second_pred, best_sad); + get_buf_from_mv(in_what, &mv), in_what->stride, second_pred); if (sad < best_sad) { - sad += mvsad_err_cost(&mv, &fcenter_mv, - mvjsadcost, mvsadcost, error_per_bit); + sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit); if (sad < best_sad) { best_sad = sad; best_site = j; @@ -1720,3 +1364,49 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, } return best_sad; } + +int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, + 
BLOCK_SIZE bsize, MV *mvp_full, + int step_param, int error_per_bit, + const MV *ref_mv, MV *tmp_mv, + int var_max, int rd) { + const SPEED_FEATURES *const sf = &cpi->sf; + const SEARCH_METHODS method = sf->mv.search_method; + vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize]; + int var = 0; + + switch (method) { + case FAST_DIAMOND: + var = vp9_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0, + fn_ptr, 1, ref_mv, tmp_mv); + break; + case FAST_HEX: + var = vp9_fast_hex_search(x, mvp_full, step_param, error_per_bit, 0, + fn_ptr, 1, ref_mv, tmp_mv); + break; + case HEX: + var = vp9_hex_search(x, mvp_full, step_param, error_per_bit, 1, + fn_ptr, 1, ref_mv, tmp_mv); + break; + case SQUARE: + var = vp9_square_search(x, mvp_full, step_param, error_per_bit, 1, + fn_ptr, 1, ref_mv, tmp_mv); + break; + case BIGDIA: + var = vp9_bigdia_search(x, mvp_full, step_param, error_per_bit, 1, + fn_ptr, 1, ref_mv, tmp_mv); + break; + case NSTEP: + var = vp9_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit, + MAX_MVSEARCH_STEPS - 1 - step_param, + 1, fn_ptr, ref_mv, tmp_mv); + break; + default: + assert(!"Invalid search method."); + } + + if (method != NSTEP && rd && var < var_max) + var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, fn_ptr, 1); + + return var; +} diff --git a/libvpx/vp9/encoder/vp9_mcomp.h b/libvpx/vp9/encoder/vp9_mcomp.h index f7b7c5e49..298fbb6c9 100644 --- a/libvpx/vp9/encoder/vp9_mcomp.h +++ b/libvpx/vp9/encoder/vp9_mcomp.h @@ -31,6 +31,20 @@ extern "C" { // for Block_16x16 #define BORDER_MV_PIXELS_B16 (16 + VP9_INTERP_EXTEND) +// motion search site +typedef struct search_site { + MV mv; + int offset; +} search_site; + +typedef struct search_site_config { + search_site ss[8 * MAX_MVSEARCH_STEPS + 1]; + int ss_count; + int searches_per_step; +} search_site_config; + +void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride); +void vp9_init3smotion_compensation(search_site_config *cfg, int stride); void 
vp9_set_mv_search_range(MACROBLOCK *x, const MV *mv); int vp9_mv_bit_cost(const MV *mv, const MV *ref, @@ -46,11 +60,11 @@ int vp9_get_mvpred_av_var(const MACROBLOCK *x, const uint8_t *second_pred, const vp9_variance_fn_ptr_t *vfp, int use_mvcost); -void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride); -void vp9_init3smotion_compensation(MACROBLOCK *x, int stride); struct VP9_COMP; -int vp9_init_search_range(struct VP9_COMP *cpi, int size); +struct SPEED_FEATURES; + +int vp9_init_search_range(int size); // Runs sequence of diamond searches in smaller steps for RD int vp9_full_pixel_diamond(const struct VP9_COMP *cpi, MACROBLOCK *x, @@ -84,57 +98,46 @@ typedef int (fractional_mv_step_fp) ( const vp9_variance_fn_ptr_t *vfp, int forced_stop, // 0 - full, 1 - qtr only, 2 - half only int iters_per_step, - int *mvjcost, - int *mvcost[2], - int *distortion, - unsigned int *sse); - -extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree; - -typedef int (fractional_mv_step_comp_fp) ( - const MACROBLOCK *x, - MV *bestmv, const MV *ref_mv, - int allow_hp, - int error_per_bit, - const vp9_variance_fn_ptr_t *vfp, - int forced_stop, // 0 - full, 1 - qtr only, 2 - half only - int iters_per_step, int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1, const uint8_t *second_pred, int w, int h); -extern fractional_mv_step_comp_fp vp9_find_best_sub_pixel_comp_tree; +extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree; typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x, const MV *ref_mv, int sad_per_bit, int distance, const vp9_variance_fn_ptr_t *fn_ptr, - int *mvjcost, int *mvcost[2], const MV *center_mv, MV *best_mv); typedef int (*vp9_refining_search_fn_t)(const MACROBLOCK *x, MV *ref_mv, int sad_per_bit, int distance, const vp9_variance_fn_ptr_t *fn_ptr, - int *mvjcost, int *mvcost[2], const MV *center_mv); typedef int (*vp9_diamond_search_fn_t)(const MACROBLOCK *x, + const search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int 
sad_per_bit, int *num00, const vp9_variance_fn_ptr_t *fn_ptr, - int *mvjcost, int *mvcost[2], const MV *center_mv); int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, const vp9_variance_fn_ptr_t *fn_ptr, - int *mvjcost, int *mvcost[2], - const MV *center_mv, const uint8_t *second_pred, - int w, int h); + const MV *center_mv, const uint8_t *second_pred); + +struct VP9_COMP; + +int vp9_full_pixel_search(struct VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, MV *mvp_full, + int step_param, int error_per_bit, + const MV *ref_mv, MV *tmp_mv, + int var_max, int rd); #ifdef __cplusplus } // extern "C" #endif diff --git a/libvpx/vp9/encoder/vp9_picklpf.c b/libvpx/vp9/encoder/vp9_picklpf.c index 3ac85228b..d36548996 100644 --- a/libvpx/vp9/encoder/vp9_picklpf.c +++ b/libvpx/vp9/encoder/vp9_picklpf.c @@ -19,13 +19,17 @@ #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_quant_common.h" -#include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_picklpf.h" #include "vp9/encoder/vp9_quantize.h" -static int get_max_filter_level(VP9_COMP *cpi) { - return cpi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4 - : MAX_LOOP_FILTER; +static int get_max_filter_level(const VP9_COMP *cpi) { + if (cpi->oxcf.pass == 2) { + return cpi->twopass.section_intra_rating > 8 ? 
MAX_LOOP_FILTER * 3 / 4 + : MAX_LOOP_FILTER; + } else { + return MAX_LOOP_FILTER; + } } @@ -34,7 +38,8 @@ static int try_filter_frame(const YV12_BUFFER_CONFIG *sd, VP9_COMP *const cpi, VP9_COMMON *const cm = &cpi->common; int filt_err; - vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_level, 1, partial_frame); + vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->mb.e_mbd, filt_level, 1, + partial_frame); filt_err = vp9_get_y_sse(sd, cm->frame_to_show); // Re-instate the unfiltered frame @@ -43,15 +48,15 @@ static int try_filter_frame(const YV12_BUFFER_CONFIG *sd, VP9_COMP *const cpi, return filt_err; } -static void search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, - int partial_frame) { - VP9_COMMON *const cm = &cpi->common; - struct loopfilter *const lf = &cm->lf; +static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, + int partial_frame) { + const VP9_COMMON *const cm = &cpi->common; + const struct loopfilter *const lf = &cm->lf; const int min_filter_level = 0; const int max_filter_level = get_max_filter_level(cpi); - int best_err; - int filt_best; int filt_direction = 0; + int best_err, filt_best; + // Start the search at the previous frame filter level unless it is now out of // range. int filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level); @@ -77,8 +82,8 @@ static void search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, // Bias against raising loop filter in favor of lowering it. 
int bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; - if (cpi->twopass.section_intra_rating < 20) - bias = bias * cpi->twopass.section_intra_rating / 20; + if ((cpi->oxcf.pass == 2) && (cpi->twopass.section_intra_rating < 20)) + bias = (bias * cpi->twopass.section_intra_rating) / 20; // yx, bias less for large block size if (cm->tx_mode != ONLY_4X4) @@ -128,7 +133,7 @@ static void search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, } } - lf->filter_level = filt_best; + return filt_best; } void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, @@ -139,7 +144,9 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, lf->sharpness_level = cm->frame_type == KEY_FRAME ? 0 : cpi->oxcf.sharpness; - if (method == LPF_PICK_FROM_Q) { + if (method == LPF_PICK_MINIMAL_LPF && lf->filter_level) { + lf->filter_level = 0; + } else if (method >= LPF_PICK_FROM_Q) { const int min_filter_level = 0; const int max_filter_level = get_max_filter_level(cpi); const int q = vp9_ac_quant(cm->base_qindex, 0); @@ -150,6 +157,7 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, filt_guess -= 4; lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level); } else { - search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE); + lf->filter_level = search_filter_level(sd, cpi, + method == LPF_PICK_FROM_SUBIMAGE); } } diff --git a/libvpx/vp9/encoder/vp9_picklpf.h b/libvpx/vp9/encoder/vp9_picklpf.h index 7d08ddb5f..33c490f69 100644 --- a/libvpx/vp9/encoder/vp9_picklpf.h +++ b/libvpx/vp9/encoder/vp9_picklpf.h @@ -16,7 +16,7 @@ extern "C" { #endif -#include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/encoder/vp9_encoder.h" struct yv12_buffer_config; struct VP9_COMP; diff --git a/libvpx/vp9/encoder/vp9_pickmode.c b/libvpx/vp9/encoder/vp9_pickmode.c index f3fe99cdb..6115f5a0f 100644 --- a/libvpx/vp9/encoder/vp9_pickmode.c +++ b/libvpx/vp9/encoder/vp9_pickmode.c @@ -22,31 +22,115 @@ #include 
"vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" -#include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_pickmode.h" #include "vp9/encoder/vp9_ratectrl.h" -#include "vp9/encoder/vp9_rdopt.h" +#include "vp9/encoder/vp9_rd.h" + +static int mv_refs_rt(const VP9_COMMON *cm, const MACROBLOCKD *xd, + const TileInfo *const tile, + MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, + int mi_row, int mi_col) { + const int *ref_sign_bias = cm->ref_frame_sign_bias; + int i, refmv_count = 0; + + const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type]; + + int different_ref_found = 0; + int context_counter = 0; + int const_motion = 0; + + // Blank the reference vector list + vpx_memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES); + + // The nearest 2 blocks are treated differently + // if the size < 8x8 we get the mv from the bmi substructure, + // and we also need to keep a mode count. + for (i = 0; i < 2; ++i) { + const POSITION *const mv_ref = &mv_ref_search[i]; + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { + const MODE_INFO *const candidate_mi = xd->mi[mv_ref->col + mv_ref->row * + xd->mi_stride]; + const MB_MODE_INFO *const candidate = &candidate_mi->mbmi; + // Keep counts for entropy encoding. + context_counter += mode_2_counter[candidate->mode]; + different_ref_found = 1; + + if (candidate->ref_frame[0] == ref_frame) + ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, -1)); + } + } + + const_motion = 1; + + // Check the rest of the neighbors in much the same way + // as before except we don't need to keep track of sub blocks or + // mode counts. 
+ for (; i < MVREF_NEIGHBOURS && !refmv_count; ++i) { + const POSITION *const mv_ref = &mv_ref_search[i]; + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { + const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row * + xd->mi_stride]->mbmi; + different_ref_found = 1; + + if (candidate->ref_frame[0] == ref_frame) + ADD_MV_REF_LIST(candidate->mv[0]); + } + } -static void full_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x, - const TileInfo *const tile, - BLOCK_SIZE bsize, int mi_row, int mi_col, - int_mv *tmp_mv, int *rate_mv) { + // Since we couldn't find 2 mvs from the same reference frame + // go back through the neighbors and find motion vectors from + // different reference frames. + if (different_ref_found && !refmv_count) { + for (i = 0; i < MVREF_NEIGHBOURS; ++i) { + const POSITION *mv_ref = &mv_ref_search[i]; + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { + const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row + * xd->mi_stride]->mbmi; + + // If the candidate is INTRA we don't want to consider its mv. 
+ IF_DIFF_REF_FRAME_ADD_MV(candidate); + } + } + } + + Done: + + mi->mbmi.mode_context[ref_frame] = counter_to_context[context_counter]; + + // Clamp vectors + for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) + clamp_mv_ref(&mv_ref_list[i].as_mv, xd); + + return const_motion; +} + +static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int_mv *tmp_mv, int *rate_mv, + int64_t best_rd_sofar) { MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}}; - int step_param; - int sadpb = x->sadperbit16; + struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}}; + const int step_param = cpi->sf.mv.fullpel_search_step_param; + const int sadpb = x->sadperbit16; MV mvp_full; - int ref = mbmi->ref_frame[0]; + const int ref = mbmi->ref_frame[0]; const MV ref_mv = mbmi->ref_mvs[ref][0].as_mv; - int i; - - int tmp_col_min = x->mv_col_min; - int tmp_col_max = x->mv_col_max; - int tmp_row_min = x->mv_row_min; - int tmp_row_max = x->mv_row_max; - + int dis; + int rate_mode; + const int tmp_col_min = x->mv_col_min; + const int tmp_col_max = x->mv_col_max; + const int tmp_row_min = x->mv_row_min; + const int tmp_row_max = x->mv_row_max; + int rv = 0; const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref); + if (cpi->common.show_frame && + (x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[LAST_FRAME]) + return rv; + if (scaled_ref_frame) { int i; // Swap out the reference frame for a version that's been scaled to @@ -54,152 +138,217 @@ static void full_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x, // motion search code to be used without additional modifications. for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0]; - vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL); } - vp9_set_mv_search_range(x, &ref_mv); - // TODO(jingning) exploiting adaptive motion search control in non-RD - // mode decision too. 
- step_param = 6; - - for (i = LAST_FRAME; i <= LAST_FRAME && cpi->common.show_frame; ++i) { - if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) { - tmp_mv->as_int = INVALID_MV; - - if (scaled_ref_frame) { - int i; - for (i = 0; i < MAX_MB_PLANE; i++) - xd->plane[i].pre[0] = backup_yv12[i]; - } - return; - } - } assert(x->mv_best_ref_index[ref] <= 2); if (x->mv_best_ref_index[ref] < 2) mvp_full = mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_mv; else - mvp_full = x->pred_mv[ref].as_mv; + mvp_full = x->pred_mv[ref]; mvp_full.col >>= 3; mvp_full.row >>= 3; - if (cpi->sf.search_method == FAST_DIAMOND) { - // NOTE: this returns SAD - vp9_fast_dia_search(x, &mvp_full, step_param, sadpb, 0, - &cpi->fn_ptr[bsize], 1, - &ref_mv, &tmp_mv->as_mv); - } else if (cpi->sf.search_method == FAST_HEX) { - // NOTE: this returns SAD - vp9_fast_hex_search(x, &mvp_full, step_param, sadpb, 0, - &cpi->fn_ptr[bsize], 1, - &ref_mv, &tmp_mv->as_mv); - } else if (cpi->sf.search_method == HEX) { - // NOTE: this returns SAD - vp9_hex_search(x, &mvp_full, step_param, sadpb, 1, - &cpi->fn_ptr[bsize], 1, - &ref_mv, &tmp_mv->as_mv); - } else if (cpi->sf.search_method == SQUARE) { - // NOTE: this returns SAD - vp9_square_search(x, &mvp_full, step_param, sadpb, 1, - &cpi->fn_ptr[bsize], 1, - &ref_mv, &tmp_mv->as_mv); - } else if (cpi->sf.search_method == BIGDIA) { - // NOTE: this returns SAD - vp9_bigdia_search(x, &mvp_full, step_param, sadpb, 1, - &cpi->fn_ptr[bsize], 1, - &ref_mv, &tmp_mv->as_mv); - } else { - int further_steps = (cpi->sf.max_step_search_steps - 1) - step_param; - // NOTE: this returns variance - vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, - sadpb, further_steps, 1, - &cpi->fn_ptr[bsize], - &ref_mv, &tmp_mv->as_mv); - } + vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, &ref_mv, + &tmp_mv->as_mv, INT_MAX, 0); + x->mv_col_min = tmp_col_min; x->mv_col_max = tmp_col_max; x->mv_row_min = tmp_row_min; x->mv_row_max = tmp_row_max; - if (scaled_ref_frame) 
{ - int i; - for (i = 0; i < MAX_MB_PLANE; i++) - xd->plane[i].pre[0] = backup_yv12[i]; - } - // calculate the bit cost on motion vector mvp_full.row = tmp_mv->as_mv.row * 8; mvp_full.col = tmp_mv->as_mv.col * 8; + *rate_mv = vp9_mv_bit_cost(&mvp_full, &ref_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); -} - -static void sub_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x, - const TileInfo *const tile, - BLOCK_SIZE bsize, int mi_row, int mi_col, - MV *tmp_mv) { - MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}}; - int ref = mbmi->ref_frame[0]; - MV ref_mv = mbmi->ref_mvs[ref][0].as_mv; - int dis; - - const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, - ref); - if (scaled_ref_frame) { - int i; - // Swap out the reference frame for a version that's been scaled to - // match the resolution of the current frame, allowing the existing - // motion search code to be used without additional modifications. 
- for (i = 0; i < MAX_MB_PLANE; i++) - backup_yv12[i] = xd->plane[i].pre[0]; - vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL); + rate_mode = cpi->inter_mode_cost[mbmi->mode_context[ref]] + [INTER_OFFSET(NEWMV)]; + rv = !(RDCOST(x->rdmult, x->rddiv, (*rate_mv + rate_mode), 0) > + best_rd_sofar); + + if (rv) { + cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv, + cpi->common.allow_high_precision_mv, + x->errorperbit, + &cpi->fn_ptr[bsize], + cpi->sf.mv.subpel_force_stop, + cpi->sf.mv.subpel_iters_per_step, + x->nmvjointcost, x->mvcost, + &dis, &x->pred_sse[ref], NULL, 0, 0); + x->pred_mv[ref] = tmp_mv->as_mv; } - cpi->find_fractional_mv_step(x, tmp_mv, &ref_mv, - cpi->common.allow_high_precision_mv, - x->errorperbit, - &cpi->fn_ptr[bsize], - cpi->sf.subpel_force_stop, - cpi->sf.subpel_iters_per_step, - x->nmvjointcost, x->mvcost, - &dis, &x->pred_sse[ref]); - if (scaled_ref_frame) { int i; for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; } - - x->pred_mv[ref].as_mv = *tmp_mv; + return rv; } + static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, - int *out_rate_sum, int64_t *out_dist_sum) { + int *out_rate_sum, int64_t *out_dist_sum, + unsigned int *var_y, unsigned int *sse_y) { // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. 
unsigned int sse; int rate; int64_t dist; - struct macroblock_plane *const p = &x->plane[0]; struct macroblockd_plane *const pd = &xd->plane[0]; + const uint32_t dc_quant = pd->dequant[0]; + const uint32_t ac_quant = pd->dequant[1]; + unsigned int var = cpi->fn_ptr[bsize].vf(p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride, &sse); + *var_y = var; + *sse_y = sse; + + if (sse < dc_quant * dc_quant >> 6) + x->skip_txfm[0] = 1; + else if (var < ac_quant * ac_quant >> 6) + x->skip_txfm[0] = 2; + else + x->skip_txfm[0] = 0; + + if (cpi->common.tx_mode == TX_MODE_SELECT) { + if (sse > (var << 2)) + xd->mi[0]->mbmi.tx_size = MIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); + else + xd->mi[0]->mbmi.tx_size = TX_8X8; + } else { + xd->mi[0]->mbmi.tx_size = MIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); + } - int var = cpi->fn_ptr[bsize].vf(p->src.buf, p->src.stride, - pd->dst.buf, pd->dst.stride, &sse); - - vp9_model_rd_from_var_lapndz(sse + var, 1 << num_pels_log2_lookup[bsize], - pd->dequant[1] >> 3, &rate, &dist); - *out_rate_sum = rate; + vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize], + dc_quant >> 3, &rate, &dist); + *out_rate_sum = rate >> 1; *out_dist_sum = dist << 3; + + vp9_model_rd_from_var_lapndz(var, 1 << num_pels_log2_lookup[bsize], + ac_quant >> 3, &rate, &dist); + *out_rate_sum += rate; + *out_dist_sum += dist << 4; +} + +static int get_pred_buffer(PRED_BUFFER *p, int len) { + int i; + + for (i = 0; i < len; i++) { + if (!p[i].in_use) { + p[i].in_use = 1; + return i; + } + } + return -1; +} + +static void free_pred_buffer(PRED_BUFFER *p) { + p->in_use = 0; +} + +static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, int mi_col, + MV_REFERENCE_FRAME ref_frame, + PREDICTION_MODE this_mode, + unsigned int var_y, unsigned int sse_y, + struct buf_2d yv12_mb[][MAX_MB_PLANE], + int *rate, int64_t *dist) { + MACROBLOCKD *xd = 
&x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + + const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]); + unsigned int var = var_y, sse = sse_y; + // Skipping threshold for ac. + unsigned int thresh_ac; + // Skipping threshold for dc. + unsigned int thresh_dc; + if (x->encode_breakout > 0) { + // Set a maximum for threshold to avoid big PSNR loss in low bit rate + // case. Use extreme low threshold for static frames to limit + // skipping. + const unsigned int max_thresh = 36000; + // The encode_breakout input + const unsigned int min_thresh = + MIN(((unsigned int)x->encode_breakout << 4), max_thresh); + + // Calculate threshold according to dequant value. + thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9; + thresh_ac = clamp(thresh_ac, min_thresh, max_thresh); + + // Adjust ac threshold according to partition size. + thresh_ac >>= + 8 - (b_width_log2(bsize) + b_height_log2(bsize)); + + thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6); + } else { + thresh_ac = 0; + thresh_dc = 0; + } + + // Y skipping condition checking for ac and dc. 
+ if (var <= thresh_ac && (sse - var) <= thresh_dc) { + unsigned int sse_u, sse_v; + unsigned int var_u, var_v; + + // Skip UV prediction unless breakout is zero (lossless) to save + // computation with low impact on the result + if (x->encode_breakout == 0) { + xd->plane[1].pre[0] = yv12_mb[ref_frame][1]; + xd->plane[2].pre[0] = yv12_mb[ref_frame][2]; + vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, bsize); + } + + var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf, + x->plane[1].src.stride, + xd->plane[1].dst.buf, + xd->plane[1].dst.stride, &sse_u); + + // U skipping condition checking + if ((var_u * 4 <= thresh_ac) && (sse_u - var_u <= thresh_dc)) { + var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf, + x->plane[2].src.stride, + xd->plane[2].dst.buf, + xd->plane[2].dst.stride, &sse_v); + + // V skipping condition checking + if ((var_v * 4 <= thresh_ac) && (sse_v - var_v <= thresh_dc)) { + x->skip = 1; + + // The cost of skip bit needs to be added. + *rate = cpi->inter_mode_cost[mbmi->mode_context[ref_frame]] + [INTER_OFFSET(this_mode)]; + + // More on this part of rate + // rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); + + // Scaling factor for SSE from spatial domain to frequency + // domain is 16. Adjust distortion accordingly. + // TODO(yunqingwang): In this function, only y-plane dist is + // calculated. + *dist = (sse << 4); // + ((sse_u + sse_v) << 4); + + // *disable_skip = 1; + } + } + } } +static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][4] = { + {THR_NEARESTMV, THR_NEARMV, THR_ZEROMV, THR_NEWMV}, + {THR_NEARESTG, THR_NEARG, THR_ZEROG, THR_NEWG}, + {THR_NEARESTA, THR_NEARA, THR_ZEROA, THR_NEWA}, +}; + // TODO(jingning) placeholder for inter-frame non-RD mode decision. // this needs various further optimizations. to be continued.. 
int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, @@ -207,13 +356,16 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, int *returnrate, int64_t *returndistortion, - BLOCK_SIZE bsize) { + BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx) { MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; struct macroblock_plane *const p = &x->plane[0]; struct macroblockd_plane *const pd = &xd->plane[0]; - MB_PREDICTION_MODE this_mode, best_mode = ZEROMV; + PREDICTION_MODE this_mode, best_mode = ZEROMV; MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME; + TX_SIZE best_tx_size = MIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); INTERP_FILTER best_pred_filter = EIGHTTAP; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; @@ -221,9 +373,12 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, VP9_ALT_FLAG }; int64_t best_rd = INT64_MAX; int64_t this_rd = INT64_MAX; - + int skip_txfm = 0; int rate = INT_MAX; int64_t dist = INT64_MAX; + // var_y and sse_y are saved to be used in skipping checking + unsigned int var_y = UINT_MAX; + unsigned int sse_y = UINT_MAX; VP9_COMMON *cm = &cpi->common; int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q); @@ -233,17 +388,46 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, const int64_t intra_mode_cost = 50; unsigned char segment_id = mbmi->segment_id; - const int *const rd_threshes = cpi->rd_threshes[segment_id][bsize]; - const int *const rd_thresh_freq_fact = cpi->rd_thresh_freq_fact[bsize]; - // Mode index conversion form THR_MODES to MB_PREDICTION_MODE for a ref frame. - int mode_idx[MB_MODE_COUNT] = {0}; - INTERP_FILTER filter_ref = SWITCHABLE; + const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize]; + const int *const rd_thresh_freq_fact = cpi->rd.thresh_freq_fact[bsize]; + // Mode index conversion form THR_MODES to PREDICTION_MODE for a ref frame. 
+ INTERP_FILTER filter_ref = cm->interp_filter; + int bsl = mi_width_log2(bsize); + const int pred_filter_search = cm->interp_filter == SWITCHABLE ? + (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_video_frame)) & 0x1 : 0; + int const_motion[MAX_REF_FRAMES] = { 0 }; + int bh = num_4x4_blocks_high_lookup[bsize] << 2; + int bw = num_4x4_blocks_wide_lookup[bsize] << 2; + int pixels_in_block = bh * bw; + // For speed 6, the result of interp filter is reused later in actual encoding + // process. + // tmp[3] points to dst buffer, and the other 3 point to allocated buffers. + PRED_BUFFER tmp[4]; + DECLARE_ALIGNED_ARRAY(16, uint8_t, pred_buf, 3 * 64 * 64); + struct buf_2d orig_dst = pd->dst; + PRED_BUFFER *best_pred = NULL; + PRED_BUFFER *this_mode_pred = NULL; + int i; - x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; + // CTX is used by the temporal denoiser which is currently being developed. + // TODO(jbb): when temporal denoiser is finished and in the default build + // remove the following line; + (void) ctx; + if (cpi->sf.reuse_inter_pred_sby) { + for (i = 0; i < 3; i++) { + tmp[i].data = &pred_buf[pixels_in_block * i]; + tmp[i].stride = bw; + tmp[i].in_use = 0; + } + tmp[3].data = pd->dst.buf; + tmp[3].stride = pd->dst.stride; + tmp[3].in_use = 0; + } + x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; x->skip = 0; - if (!x->in_active_map) - x->skip = 1; + // initialize mode decisions *returnrate = INT_MAX; *returndistortion = INT64_MAX; @@ -252,31 +436,46 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, mbmi->ref_frame[0] = NONE; mbmi->ref_frame[1] = NONE; mbmi->tx_size = MIN(max_txsize_lookup[bsize], - tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); - mbmi->interp_filter = cpi->common.interp_filter == SWITCHABLE ? - EIGHTTAP : cpi->common.interp_filter; - mbmi->skip = 0; + tx_mode_to_biggest_tx_size[cm->tx_mode]); + mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? 
+ EIGHTTAP : cm->interp_filter; mbmi->segment_id = segment_id; - for (ref_frame = LAST_FRAME; ref_frame <= LAST_FRAME ; ++ref_frame) { + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { x->pred_mv_sad[ref_frame] = INT_MAX; - if (cpi->ref_frame_flags & flag_list[ref_frame]) { - vp9_setup_buffer_inter(cpi, x, tile, - ref_frame, bsize, mi_row, mi_col, - frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); - } frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; frame_mv[ZEROMV][ref_frame].as_int = 0; - } - if (xd->up_available) - filter_ref = xd->mi[-xd->mi_stride]->mbmi.interp_filter; - else if (xd->left_available) - filter_ref = xd->mi[-1]->mbmi.interp_filter; + if (xd->up_available) + filter_ref = xd->mi[-xd->mi_stride]->mbmi.interp_filter; + else if (xd->left_available) + filter_ref = xd->mi[-1]->mbmi.interp_filter; - for (ref_frame = LAST_FRAME; ref_frame <= LAST_FRAME ; ++ref_frame) { - if (!(cpi->ref_frame_flags & flag_list[ref_frame])) + if (cpi->ref_frame_flags & flag_list[ref_frame]) { + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); + int_mv *const candidates = mbmi->ref_mvs[ref_frame]; + const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; + vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, + sf, sf); + + if (!cm->error_resilient_mode) + vp9_find_mv_refs(cm, xd, tile, xd->mi[0], ref_frame, + candidates, mi_row, mi_col); + else + const_motion[ref_frame] = mv_refs_rt(cm, xd, tile, xd->mi[0], + ref_frame, candidates, + mi_row, mi_col); + + vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates, + &frame_mv[NEARESTMV][ref_frame], + &frame_mv[NEARMV][ref_frame]); + + if (!vp9_is_scaled(sf) && bsize >= BLOCK_8X8) + vp9_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, + ref_frame, bsize); + } else { continue; + } // Select prediction reference frames. 
xd->plane[0].pre[0] = yv12_mb[ref_frame][0]; @@ -286,49 +485,35 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, mbmi->ref_frame[0] = ref_frame; - // Set conversion index for LAST_FRAME. - if (ref_frame == LAST_FRAME) { - mode_idx[NEARESTMV] = THR_NEARESTMV; // LAST_FRAME, NEARESTMV - mode_idx[NEARMV] = THR_NEARMV; // LAST_FRAME, NEARMV - mode_idx[ZEROMV] = THR_ZEROMV; // LAST_FRAME, ZEROMV - mode_idx[NEWMV] = THR_NEWMV; // LAST_FRAME, NEWMV - } - for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { int rate_mv = 0; + int mode_rd_thresh; - if (cpi->sf.disable_inter_mode_mask[bsize] & - (1 << INTER_OFFSET(this_mode))) + if (const_motion[ref_frame] && + (this_mode == NEARMV || this_mode == ZEROMV)) continue; - if (best_rd < ((int64_t)rd_threshes[mode_idx[this_mode]] * - rd_thresh_freq_fact[this_mode] >> 5) || - rd_threshes[mode_idx[this_mode]] == INT_MAX) + if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode))) + continue; + + mode_rd_thresh = rd_threshes[mode_idx[ref_frame - LAST_FRAME] + [this_mode - NEARESTMV]]; + if (rd_less_than_thresh(best_rd, mode_rd_thresh, + rd_thresh_freq_fact[this_mode])) continue; if (this_mode == NEWMV) { - int rate_mode = 0; if (this_rd < (int64_t)(1 << num_pels_log2_lookup[bsize])) continue; - - full_pixel_motion_search(cpi, x, tile, bsize, mi_row, mi_col, - &frame_mv[NEWMV][ref_frame], &rate_mv); - - if (frame_mv[NEWMV][ref_frame].as_int == INVALID_MV) + if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, + &frame_mv[NEWMV][ref_frame], + &rate_mv, best_rd)) continue; - - rate_mode = x->inter_mode_cost[mbmi->mode_context[ref_frame]] - [INTER_OFFSET(this_mode)]; - if (RDCOST(x->rdmult, x->rddiv, rate_mv + rate_mode, 0) > best_rd) - continue; - - sub_pixel_motion_search(cpi, x, tile, bsize, mi_row, mi_col, - &frame_mv[NEWMV][ref_frame].as_mv); } - if (this_mode != NEARESTMV) - if (frame_mv[this_mode][ref_frame].as_int == - frame_mv[NEARESTMV][ref_frame].as_int) + if (this_mode != NEARESTMV && + 
frame_mv[this_mode][ref_frame].as_int == + frame_mv[NEARESTMV][ref_frame].as_int) continue; mbmi->mode = this_mode; @@ -337,103 +522,218 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, // Search for the best prediction filter type, when the resulting // motion vector is at sub-pixel accuracy level for luma component, i.e., // the last three bits are all zeros. + if (cpi->sf.reuse_inter_pred_sby) { + if (this_mode == NEARESTMV) { + this_mode_pred = &tmp[3]; + } else { + this_mode_pred = &tmp[get_pred_buffer(tmp, 3)]; + pd->dst.buf = this_mode_pred->data; + pd->dst.stride = bw; + } + } + if ((this_mode == NEWMV || filter_ref == SWITCHABLE) && + pred_filter_search && ((mbmi->mv[0].as_mv.row & 0x07) != 0 || (mbmi->mv[0].as_mv.col & 0x07) != 0)) { - int64_t tmp_rdcost1 = INT64_MAX; - int64_t tmp_rdcost2 = INT64_MAX; - int64_t tmp_rdcost3 = INT64_MAX; int pf_rate[3]; int64_t pf_dist[3]; - - mbmi->interp_filter = EIGHTTAP; - vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); - model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[EIGHTTAP], - &pf_dist[EIGHTTAP]); - tmp_rdcost1 = RDCOST(x->rdmult, x->rddiv, - vp9_get_switchable_rate(x) + pf_rate[EIGHTTAP], - pf_dist[EIGHTTAP]); - - mbmi->interp_filter = EIGHTTAP_SHARP; - vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); - model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[EIGHTTAP_SHARP], - &pf_dist[EIGHTTAP_SHARP]); - tmp_rdcost2 = RDCOST(x->rdmult, x->rddiv, - vp9_get_switchable_rate(x) + pf_rate[EIGHTTAP_SHARP], - pf_dist[EIGHTTAP_SHARP]); - - mbmi->interp_filter = EIGHTTAP_SMOOTH; - vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); - model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[EIGHTTAP_SMOOTH], - &pf_dist[EIGHTTAP_SMOOTH]); - tmp_rdcost3 = RDCOST(x->rdmult, x->rddiv, - vp9_get_switchable_rate(x) + pf_rate[EIGHTTAP_SMOOTH], - pf_dist[EIGHTTAP_SMOOTH]); - - if (tmp_rdcost2 < tmp_rdcost1) { - if (tmp_rdcost2 < tmp_rdcost3) - mbmi->interp_filter = EIGHTTAP_SHARP; - else - mbmi->interp_filter = 
EIGHTTAP_SMOOTH; - } else { - if (tmp_rdcost1 < tmp_rdcost3) - mbmi->interp_filter = EIGHTTAP; - else - mbmi->interp_filter = EIGHTTAP_SMOOTH; + unsigned int pf_var[3]; + unsigned int pf_sse[3]; + TX_SIZE pf_tx_size[3]; + int64_t best_cost = INT64_MAX; + INTERP_FILTER best_filter = SWITCHABLE, filter; + PRED_BUFFER *current_pred = this_mode_pred; + + for (filter = EIGHTTAP; filter <= EIGHTTAP_SHARP; ++filter) { + int64_t cost; + mbmi->interp_filter = filter; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); + model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter], + &pf_dist[filter], &pf_var[filter], &pf_sse[filter]); + cost = RDCOST(x->rdmult, x->rddiv, + vp9_get_switchable_rate(cpi) + pf_rate[filter], + pf_dist[filter]); + pf_tx_size[filter] = mbmi->tx_size; + if (cost < best_cost) { + best_filter = filter; + best_cost = cost; + skip_txfm = x->skip_txfm[0]; + + if (cpi->sf.reuse_inter_pred_sby) { + if (this_mode_pred != current_pred) { + free_pred_buffer(this_mode_pred); + this_mode_pred = current_pred; + } + + if (filter < EIGHTTAP_SHARP) { + current_pred = &tmp[get_pred_buffer(tmp, 3)]; + pd->dst.buf = current_pred->data; + pd->dst.stride = bw; + } + } + } } + if (cpi->sf.reuse_inter_pred_sby && this_mode_pred != current_pred) + free_pred_buffer(current_pred); + + mbmi->interp_filter = best_filter; + mbmi->tx_size = pf_tx_size[mbmi->interp_filter]; rate = pf_rate[mbmi->interp_filter]; dist = pf_dist[mbmi->interp_filter]; + var_y = pf_var[mbmi->interp_filter]; + sse_y = pf_sse[mbmi->interp_filter]; + x->skip_txfm[0] = skip_txfm; } else { mbmi->interp_filter = (filter_ref == SWITCHABLE) ? 
EIGHTTAP: filter_ref; vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); - model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist); + model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y); } rate += rate_mv; - rate += x->inter_mode_cost[mbmi->mode_context[ref_frame]] + rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]] [INTER_OFFSET(this_mode)]; this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist); - if (this_rd < best_rd) { + // Skipping checking: test to see if this block can be reconstructed by + // prediction only. + if (cpi->allow_encode_breakout) { + encode_breakout_test(cpi, x, bsize, mi_row, mi_col, ref_frame, + this_mode, var_y, sse_y, yv12_mb, &rate, &dist); + if (x->skip) { + rate += rate_mv; + this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist); + } + } + +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0) { + vp9_denoiser_update_frame_stats(&cpi->denoiser, mbmi, sse_y, + this_mode, ctx); + } +#endif + + if (this_rd < best_rd || x->skip) { best_rd = this_rd; *returnrate = rate; *returndistortion = dist; best_mode = this_mode; best_pred_filter = mbmi->interp_filter; + best_tx_size = mbmi->tx_size; best_ref_frame = ref_frame; + skip_txfm = x->skip_txfm[0]; + + if (cpi->sf.reuse_inter_pred_sby) { + if (best_pred != NULL) + free_pred_buffer(best_pred); + + best_pred = this_mode_pred; + } + } else { + if (cpi->sf.reuse_inter_pred_sby) + free_pred_buffer(this_mode_pred); } + + if (x->skip) + break; } + // If the current reference frame is valid and we found a usable mode, + // we are done. + if (best_rd < INT64_MAX) + break; + } + + // If best prediction is not in dst buf, then copy the prediction block from + // temp buf to dst buf. 
+ if (cpi->sf.reuse_inter_pred_sby && best_pred->data != orig_dst.buf) { + uint8_t *copy_from, *copy_to; + + pd->dst = orig_dst; + copy_to = pd->dst.buf; + + copy_from = best_pred->data; + + vp9_convolve_copy(copy_from, bw, copy_to, pd->dst.stride, NULL, 0, NULL, 0, + bw, bh); } - mbmi->mode = best_mode; + mbmi->mode = best_mode; mbmi->interp_filter = best_pred_filter; - mbmi->ref_frame[0] = best_ref_frame; - mbmi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int; + mbmi->tx_size = best_tx_size; + mbmi->ref_frame[0] = best_ref_frame; + mbmi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int; xd->mi[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int; + x->skip_txfm[0] = skip_txfm; // Perform intra prediction search, if the best SAD is above a certain // threshold. - if (best_rd > inter_mode_thresh) { + if (!x->skip && best_rd > inter_mode_thresh && + bsize <= cpi->sf.max_intra_bsize) { + int i, j; + const int width = num_4x4_blocks_wide_lookup[bsize]; + const int height = num_4x4_blocks_high_lookup[bsize]; + + int rate2 = 0; + int64_t dist2 = 0; + const int dst_stride = cpi->sf.reuse_inter_pred_sby ? 
bw : pd->dst.stride; + const int src_stride = p->src.stride; + int block_idx = 0; + + TX_SIZE tmp_tx_size = MIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); + const BLOCK_SIZE bsize_tx = txsize_to_bsize[tmp_tx_size]; + const int step = 1 << tmp_tx_size; + + if (cpi->sf.reuse_inter_pred_sby) { + pd->dst.buf = tmp[0].data; + pd->dst.stride = bw; + } + for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) { - vp9_predict_intra_block(xd, 0, b_width_log2(bsize), - mbmi->tx_size, this_mode, - &p->src.buf[0], p->src.stride, - &pd->dst.buf[0], pd->dst.stride, 0, 0, 0); + uint8_t *const src_buf_base = p->src.buf; + uint8_t *const dst_buf_base = pd->dst.buf; + for (j = 0; j < height; j += step) { + for (i = 0; i < width; i += step) { + p->src.buf = &src_buf_base[4 * (j * src_stride + i)]; + pd->dst.buf = &dst_buf_base[4 * (j * dst_stride + i)]; + // Use source buffer as an approximation for the fully reconstructed + // buffer + vp9_predict_intra_block(xd, block_idx, b_width_log2(bsize), + tmp_tx_size, this_mode, + p->src.buf, src_stride, + pd->dst.buf, dst_stride, + i, j, 0); + model_rd_for_sb_y(cpi, bsize_tx, x, xd, &rate, &dist, &var_y, &sse_y); + rate2 += rate; + dist2 += dist; + ++block_idx; + } + } + p->src.buf = src_buf_base; + pd->dst.buf = dst_buf_base; + + rate = rate2; + dist = dist2; - model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist); - rate += x->mbmode_cost[this_mode]; + rate += cpi->mbmode_cost[this_mode]; rate += intra_cost_penalty; this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist); + if (cpi->sf.reuse_inter_pred_sby) + pd->dst = orig_dst; + if (this_rd + intra_mode_cost < best_rd) { best_rd = this_rd; *returnrate = rate; *returndistortion = dist; mbmi->mode = this_mode; + mbmi->tx_size = tmp_tx_size; mbmi->ref_frame[0] = INTRA_FRAME; mbmi->uv_mode = this_mode; mbmi->mv[0].as_int = INVALID_MV; + } else { + x->skip_txfm[0] = skip_txfm; } } } diff --git a/libvpx/vp9/encoder/vp9_pickmode.h 
b/libvpx/vp9/encoder/vp9_pickmode.h index 05ff18762..49c6feb88 100644 --- a/libvpx/vp9/encoder/vp9_pickmode.h +++ b/libvpx/vp9/encoder/vp9_pickmode.h @@ -11,18 +11,25 @@ #ifndef VP9_ENCODER_VP9_PICKMODE_H_ #define VP9_ENCODER_VP9_PICKMODE_H_ -#include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/encoder/vp9_encoder.h" #ifdef __cplusplus extern "C" { #endif +typedef struct { + uint8_t *data; + int stride; + int in_use; +} PRED_BUFFER; + int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, const struct TileInfo *const tile, int mi_row, int mi_col, int *returnrate, int64_t *returndistortion, - BLOCK_SIZE bsize); + BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx); #ifdef __cplusplus } // extern "C" diff --git a/libvpx/vp9/encoder/vp9_quantize.c b/libvpx/vp9/encoder/vp9_quantize.c index c092ee41f..eababdbca 100644 --- a/libvpx/vp9/encoder/vp9_quantize.c +++ b/libvpx/vp9/encoder/vp9_quantize.c @@ -15,9 +15,136 @@ #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_seg_common.h" -#include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_quantize.h" -#include "vp9/encoder/vp9_rdopt.h" +#include "vp9/encoder/vp9_rd.h" + +void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr) { + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp, eob = -1; + + if (!skip_block) { + tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = (tmp * quant) >> 16; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr; + if (tmp) + eob = 0; + } + *eob_ptr = eob + 1; +} + +void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant, + int16_t *qcoeff_ptr, int16_t 
*dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr) { + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp, eob = -1; + + if (!skip_block) { + + tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = (tmp * quant) >> 15; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2; + if (tmp) + eob = 0; + } + *eob_ptr = eob + 1; +} + +void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t count, + int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t *dequant_ptr, + int zbin_oq_value, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int i, eob = -1; + // TODO(jingning) Decide the need of these arguments after the + // quantization process is completed. + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)zbin_oq_value; + (void)iscan; + + vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t)); + vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t)); + + if (!skip_block) { + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = (tmp * quant_ptr[rc != 0]) >> 16; + + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + + if (tmp) + eob = i; + } + } + *eob_ptr = eob + 1; +} + +// TODO(jingning) Refactor this file and combine functions with similar +// operations. 
+void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t *dequant_ptr, + int zbin_oq_value, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int i, eob = -1; + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)zbin_oq_value; + (void)iscan; + + vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(int16_t)); + vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(int16_t)); + + if (!skip_block) { + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + int tmp = 0; + int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) { + abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); + tmp = (abs_coeff * quant_ptr[rc != 0]) >> 15; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + } + + if (tmp) + eob = i; + } + } + *eob_ptr = eob + 1; +} void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count, int skip_block, @@ -32,6 +159,7 @@ void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count, zbin_ptr[1] + zbin_oq_value }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + (void)iscan; vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t)); vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t)); @@ -87,6 +215,7 @@ void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int idx = 0; int idx_arr[1024]; int i, eob = -1; + (void)iscan; vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(int16_t)); vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(int16_t)); @@ -161,10 +290,16 @@ void vp9_init_quantizer(VP9_COMP *cpi) { const int qrounding_factor = q == 0 ? 
64 : 48; for (i = 0; i < 2; ++i) { + int qrounding_factor_fp = i == 0 ? 48 : 42; + if (q == 0) + qrounding_factor_fp = 64; + // y quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q) : vp9_ac_quant(q, 0); invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], quant); + quants->y_quant_fp[q][i] = (1 << 16) / quant; + quants->y_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7; quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); quants->y_round[q][i] = (qrounding_factor * quant) >> 7; cm->y_dequant[q][i] = quant; @@ -174,41 +309,29 @@ void vp9_init_quantizer(VP9_COMP *cpi) { : vp9_ac_quant(q, cm->uv_ac_delta_q); invert_quant(&quants->uv_quant[q][i], &quants->uv_quant_shift[q][i], quant); + quants->uv_quant_fp[q][i] = (1 << 16) / quant; + quants->uv_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7; quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); quants->uv_round[q][i] = (qrounding_factor * quant) >> 7; cm->uv_dequant[q][i] = quant; - -#if CONFIG_ALPHA - // alpha - quant = i == 0 ? 
vp9_dc_quant(q, cm->a_dc_delta_q) - : vp9_ac_quant(q, cm->a_ac_delta_q); - invert_quant(&quants->a_quant[q][i], &quants->a_quant_shift[q][i], quant); - quants->a_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); - quants->a_round[q][i] = (qrounding_factor * quant) >> 7; - cm->a_dequant[q][i] = quant; -#endif } for (i = 2; i < 8; i++) { quants->y_quant[q][i] = quants->y_quant[q][1]; + quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1]; + quants->y_round_fp[q][i] = quants->y_round_fp[q][1]; quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1]; quants->y_zbin[q][i] = quants->y_zbin[q][1]; quants->y_round[q][i] = quants->y_round[q][1]; cm->y_dequant[q][i] = cm->y_dequant[q][1]; quants->uv_quant[q][i] = quants->uv_quant[q][1]; + quants->uv_quant_fp[q][i] = quants->uv_quant_fp[q][1]; + quants->uv_round_fp[q][i] = quants->uv_round_fp[q][1]; quants->uv_quant_shift[q][i] = quants->uv_quant_shift[q][1]; quants->uv_zbin[q][i] = quants->uv_zbin[q][1]; quants->uv_round[q][i] = quants->uv_round[q][1]; cm->uv_dequant[q][i] = cm->uv_dequant[q][1]; - -#if CONFIG_ALPHA - quants->a_quant[q][i] = quants->a_quant[q][1]; - quants->a_quant_shift[q][i] = quants->a_quant_shift[q][1]; - quants->a_zbin[q][i] = quants->a_zbin[q][1]; - quants->a_round[q][i] = quants->a_round[q][1]; - cm->a_dequant[q][i] = cm->a_dequant[q][1]; -#endif } } } @@ -220,36 +343,39 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) { const int segment_id = xd->mi[0]->mbmi.segment_id; const int qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex); const int rdmult = vp9_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q); - const int zbin = cpi->zbin_mode_boost + x->act_zbin_adj; + const int zbin = cpi->zbin_mode_boost; int i; // Y x->plane[0].quant = quants->y_quant[qindex]; + x->plane[0].quant_fp = quants->y_quant_fp[qindex]; + x->plane[0].round_fp = quants->y_round_fp[qindex]; x->plane[0].quant_shift = quants->y_quant_shift[qindex]; x->plane[0].zbin = quants->y_zbin[qindex]; 
x->plane[0].round = quants->y_round[qindex]; + x->plane[0].quant_thred[0] = cm->y_dequant[qindex][0] * + cm->y_dequant[qindex][0]; + x->plane[0].quant_thred[1] = cm->y_dequant[qindex][1] * + cm->y_dequant[qindex][1]; x->plane[0].zbin_extra = (int16_t)((cm->y_dequant[qindex][1] * zbin) >> 7); xd->plane[0].dequant = cm->y_dequant[qindex]; // UV for (i = 1; i < 3; i++) { x->plane[i].quant = quants->uv_quant[qindex]; + x->plane[i].quant_fp = quants->uv_quant_fp[qindex]; + x->plane[i].round_fp = quants->uv_round_fp[qindex]; x->plane[i].quant_shift = quants->uv_quant_shift[qindex]; x->plane[i].zbin = quants->uv_zbin[qindex]; x->plane[i].round = quants->uv_round[qindex]; + x->plane[i].quant_thred[0] = cm->y_dequant[qindex][0] * + cm->y_dequant[qindex][0]; + x->plane[i].quant_thred[1] = cm->y_dequant[qindex][1] * + cm->y_dequant[qindex][1]; x->plane[i].zbin_extra = (int16_t)((cm->uv_dequant[qindex][1] * zbin) >> 7); xd->plane[i].dequant = cm->uv_dequant[qindex]; } -#if CONFIG_ALPHA - x->plane[3].quant = cpi->a_quant[qindex]; - x->plane[3].quant_shift = cpi->a_quant_shift[qindex]; - x->plane[3].zbin = cpi->a_zbin[qindex]; - x->plane[3].round = cpi->a_round[qindex]; - x->plane[3].zbin_extra = (int16_t)((cm->a_dequant[qindex][1] * zbin) >> 7); - xd->plane[3].dequant = cm->a_dequant[qindex]; -#endif - x->skip_block = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP); x->q_index = qindex; @@ -262,9 +388,9 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) { void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) { const int qindex = x->q_index; const int y_zbin_extra = (cpi->common.y_dequant[qindex][1] * - (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7; + cpi->zbin_mode_boost) >> 7; const int uv_zbin_extra = (cpi->common.uv_dequant[qindex][1] * - (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7; + cpi->zbin_mode_boost) >> 7; x->plane[0].zbin_extra = (int16_t)y_zbin_extra; x->plane[1].zbin_extra = (int16_t)uv_zbin_extra; @@ -284,3 +410,30 @@ void 
vp9_set_quantizer(VP9_COMMON *cm, int q) { cm->uv_dc_delta_q = 0; cm->uv_ac_delta_q = 0; } + +// Table that converts 0-63 Q-range values passed in outside to the Qindex +// range used internally. +static const int quantizer_to_qindex[] = { + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 128, 132, 136, 140, 144, 148, 152, 156, + 160, 164, 168, 172, 176, 180, 184, 188, + 192, 196, 200, 204, 208, 212, 216, 220, + 224, 228, 232, 236, 240, 244, 249, 255, +}; + +int vp9_quantizer_to_qindex(int quantizer) { + return quantizer_to_qindex[quantizer]; +} + +int vp9_qindex_to_quantizer(int qindex) { + int quantizer; + + for (quantizer = 0; quantizer < 64; ++quantizer) + if (quantizer_to_qindex[quantizer] >= qindex) + return quantizer; + + return 63; +} diff --git a/libvpx/vp9/encoder/vp9_quantize.h b/libvpx/vp9/encoder/vp9_quantize.h index 7d231dfd3..262529b05 100644 --- a/libvpx/vp9/encoder/vp9_quantize.h +++ b/libvpx/vp9/encoder/vp9_quantize.h @@ -11,6 +11,7 @@ #ifndef VP9_ENCODER_VP9_QUANTIZE_H_ #define VP9_ENCODER_VP9_QUANTIZE_H_ +#include "./vpx_config.h" #include "vp9/encoder/vp9_block.h" #ifdef __cplusplus @@ -23,19 +24,27 @@ typedef struct { DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]); + // TODO(jingning): in progress of re-working the quantization. will decide + // if we want to deprecate the current use of y_quant. 
+ DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, uv_quant_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_round_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, uv_round_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, uv_zbin[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]); - -#if CONFIG_ALPHA - DECLARE_ALIGNED(16, int16_t, a_quant[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, int16_t, a_quant_shift[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, int16_t, a_zbin[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, int16_t, a_round[QINDEX_RANGE][8]); -#endif } QUANTS; +void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr); +void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr); void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, const int16_t *scan, const int16_t *iscan); @@ -52,6 +61,10 @@ void vp9_init_quantizer(struct VP9_COMP *cpi); void vp9_set_quantizer(struct VP9Common *cm, int q); +int vp9_quantizer_to_qindex(int quantizer); + +int vp9_qindex_to_quantizer(int qindex); + #ifdef __cplusplus } // extern "C" #endif diff --git a/libvpx/vp9/encoder/vp9_ratectrl.c b/libvpx/vp9/encoder/vp9_ratectrl.c index 342081644..290567ef1 100644 --- a/libvpx/vp9/encoder/vp9_ratectrl.c +++ b/libvpx/vp9/encoder/vp9_ratectrl.c @@ -27,6 +27,11 @@ #include "vp9/encoder/vp9_encodemv.h" #include "vp9/encoder/vp9_ratectrl.h" +// Max rate target for 1080P and below encodes under normal circumstances +// (1920 * 1080 / (16 * 16)) * MAX_MB_RATE bits per MB +#define MAX_MB_RATE 250 
+#define MAXRATE_1080P 2025000 + #define DEFAULT_KF_BOOST 2000 #define DEFAULT_GF_BOOST 2000 @@ -35,14 +40,15 @@ #define MIN_BPB_FACTOR 0.005 #define MAX_BPB_FACTOR 50 +#define FRAME_OVERHEAD_BITS 200 + // Tables relating active max Q to active min Q static int kf_low_motion_minq[QINDEX_RANGE]; static int kf_high_motion_minq[QINDEX_RANGE]; -static int gf_low_motion_minq[QINDEX_RANGE]; -static int gf_high_motion_minq[QINDEX_RANGE]; +static int arfgf_low_motion_minq[QINDEX_RANGE]; +static int arfgf_high_motion_minq[QINDEX_RANGE]; static int inter_minq[QINDEX_RANGE]; -static int afq_low_motion_minq[QINDEX_RANGE]; -static int afq_high_motion_minq[QINDEX_RANGE]; +static int rtc_minq[QINDEX_RANGE]; static int gf_high = 2000; static int gf_low = 400; static int kf_high = 5000; @@ -74,14 +80,12 @@ void vp9_rc_init_minq_luts() { for (i = 0; i < QINDEX_RANGE; i++) { const double maxq = vp9_convert_qindex_to_q(i); - - kf_low_motion_minq[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.15); + kf_low_motion_minq[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.125); kf_high_motion_minq[i] = get_minq_index(maxq, 0.000002, -0.0012, 0.50); - gf_low_motion_minq[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.32); - gf_high_motion_minq[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.50); - afq_low_motion_minq[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.33); - afq_high_motion_minq[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55); - inter_minq[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.75); + arfgf_low_motion_minq[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30); + arfgf_high_motion_minq[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.50); + inter_minq[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.90); + rtc_minq[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70); } } @@ -100,7 +104,7 @@ int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex, // q based adjustment to baseline enumerator enumerator += (int)(enumerator * q) >> 12; - return (int)(0.5 + 
(enumerator * correction_factor / q)); + return (int)(enumerator * correction_factor / q); } static int estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs, @@ -112,7 +116,7 @@ static int estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs, int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) { const RATE_CONTROL *rc = &cpi->rc; const int min_frame_target = MAX(rc->min_frame_bandwidth, - rc->av_per_frame_bandwidth >> 5); + rc->avg_frame_bandwidth >> 5); if (target < min_frame_target) target = min_frame_target; if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) { @@ -130,10 +134,10 @@ int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) { int vp9_rc_clamp_iframe_target_size(const VP9_COMP *const cpi, int target) { const RATE_CONTROL *rc = &cpi->rc; - const VP9_CONFIG *oxcf = &cpi->oxcf; + const VP9EncoderConfig *oxcf = &cpi->oxcf; if (oxcf->rc_max_intra_bitrate_pct) { - const int max_rate = rc->av_per_frame_bandwidth * - oxcf->rc_max_intra_bitrate_pct / 100; + const int max_rate = rc->avg_frame_bandwidth * + oxcf->rc_max_intra_bitrate_pct / 100; target = MIN(target, max_rate); } if (target > rc->max_frame_bandwidth) @@ -155,7 +159,7 @@ static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) { lrc->bits_off_target += bits_off_for_this_layer; // Clip buffer level to maximum buffer size for the layer. - lrc->bits_off_target = MIN(lrc->bits_off_target, lc->maximum_buffer_size); + lrc->bits_off_target = MIN(lrc->bits_off_target, lrc->maximum_buffer_size); lrc->buffer_level = lrc->bits_off_target; } } @@ -163,52 +167,50 @@ static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) { // Update the buffer level: leaky bucket model. 
static void update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) { const VP9_COMMON *const cm = &cpi->common; - const VP9_CONFIG *oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; // Non-viewable frames are a special case and are treated as pure overhead. if (!cm->show_frame) { rc->bits_off_target -= encoded_frame_size; } else { - rc->bits_off_target += rc->av_per_frame_bandwidth - encoded_frame_size; + rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size; } // Clip the buffer level to the maximum specified buffer size. - rc->bits_off_target = MIN(rc->bits_off_target, oxcf->maximum_buffer_size); + rc->bits_off_target = MIN(rc->bits_off_target, rc->maximum_buffer_size); rc->buffer_level = rc->bits_off_target; - if (cpi->use_svc && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + if (cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR) { update_layer_buffer_level(&cpi->svc, encoded_frame_size); } } -void vp9_rc_init(const VP9_CONFIG *oxcf, int pass, RATE_CONTROL *rc) { - if (pass == 0 && oxcf->end_usage == USAGE_STREAM_FROM_SERVER) { - rc->avg_frame_qindex[0] = oxcf->worst_allowed_q; - rc->avg_frame_qindex[1] = oxcf->worst_allowed_q; - rc->avg_frame_qindex[2] = oxcf->worst_allowed_q; +void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { + int i; + + if (pass == 0 && oxcf->rc_mode == VPX_CBR) { + rc->avg_frame_qindex[KEY_FRAME] = oxcf->worst_allowed_q; + rc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q; } else { - rc->avg_frame_qindex[0] = (oxcf->worst_allowed_q + - oxcf->best_allowed_q) / 2; - rc->avg_frame_qindex[1] = (oxcf->worst_allowed_q + - oxcf->best_allowed_q) / 2; - rc->avg_frame_qindex[2] = (oxcf->worst_allowed_q + - oxcf->best_allowed_q) / 2; + rc->avg_frame_qindex[KEY_FRAME] = (oxcf->worst_allowed_q + + oxcf->best_allowed_q) / 2; + rc->avg_frame_qindex[INTER_FRAME] = (oxcf->worst_allowed_q + + oxcf->best_allowed_q) / 2; } - rc->last_q[0] = oxcf->best_allowed_q; - rc->last_q[1] = oxcf->best_allowed_q; - 
rc->last_q[2] = oxcf->best_allowed_q; + rc->last_q[KEY_FRAME] = oxcf->best_allowed_q; + rc->last_q[INTER_FRAME] = oxcf->best_allowed_q; - rc->buffer_level = oxcf->starting_buffer_level; - rc->bits_off_target = oxcf->starting_buffer_level; + rc->buffer_level = rc->starting_buffer_level; + rc->bits_off_target = rc->starting_buffer_level; - rc->rolling_target_bits = rc->av_per_frame_bandwidth; - rc->rolling_actual_bits = rc->av_per_frame_bandwidth; - rc->long_rolling_target_bits = rc->av_per_frame_bandwidth; - rc->long_rolling_actual_bits = rc->av_per_frame_bandwidth; + rc->rolling_target_bits = rc->avg_frame_bandwidth; + rc->rolling_actual_bits = rc->avg_frame_bandwidth; + rc->long_rolling_target_bits = rc->avg_frame_bandwidth; + rc->long_rolling_actual_bits = rc->avg_frame_bandwidth; rc->total_actual_bits = 0; + rc->total_target_bits = 0; rc->total_target_vs_actual = 0; rc->baseline_gf_interval = DEFAULT_GF_INTERVAL; @@ -227,13 +229,13 @@ void vp9_rc_init(const VP9_CONFIG *oxcf, int pass, RATE_CONTROL *rc) { rc->tot_q = 0.0; rc->avg_q = vp9_convert_qindex_to_q(oxcf->worst_allowed_q); - rc->rate_correction_factor = 1.0; - rc->key_frame_rate_correction_factor = 1.0; - rc->gf_rate_correction_factor = 1.0; + for (i = 0; i < RATE_FACTOR_LEVELS; ++i) { + rc->rate_correction_factors[i] = 1.0; + } } int vp9_rc_drop_frame(VP9_COMP *cpi) { - const VP9_CONFIG *oxcf = &cpi->oxcf; + const VP9EncoderConfig *oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; if (!oxcf->drop_frames_water_mark) { @@ -246,7 +248,7 @@ int vp9_rc_drop_frame(VP9_COMP *cpi) { // If buffer is below drop_mark, for now just drop every other frame // (starting with the next frame) until it increases back over drop_mark. 
int drop_mark = (int)(oxcf->drop_frames_water_mark * - oxcf->optimal_buffer_level / 100); + rc->optimal_buffer_level / 100); if ((rc->buffer_level > drop_mark) && (rc->decimation_factor > 0)) { --rc->decimation_factor; @@ -271,28 +273,40 @@ int vp9_rc_drop_frame(VP9_COMP *cpi) { } static double get_rate_correction_factor(const VP9_COMP *cpi) { + const RATE_CONTROL *const rc = &cpi->rc; + if (cpi->common.frame_type == KEY_FRAME) { - return cpi->rc.key_frame_rate_correction_factor; + return rc->rate_correction_factors[KF_STD]; + } else if (cpi->oxcf.pass == 2) { + RATE_FACTOR_LEVEL rf_lvl = + cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index]; + return rc->rate_correction_factors[rf_lvl]; } else { if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) && - !cpi->rc.is_src_frame_alt_ref && - !(cpi->use_svc && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) - return cpi->rc.gf_rate_correction_factor; + !rc->is_src_frame_alt_ref && + !(cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR)) + return rc->rate_correction_factors[GF_ARF_STD]; else - return cpi->rc.rate_correction_factor; + return rc->rate_correction_factors[INTER_NORMAL]; } } static void set_rate_correction_factor(VP9_COMP *cpi, double factor) { + RATE_CONTROL *const rc = &cpi->rc; + if (cpi->common.frame_type == KEY_FRAME) { - cpi->rc.key_frame_rate_correction_factor = factor; + rc->rate_correction_factors[KF_STD] = factor; + } else if (cpi->oxcf.pass == 2) { + RATE_FACTOR_LEVEL rf_lvl = + cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index]; + rc->rate_correction_factors[rf_lvl] = factor; } else { if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) && - !cpi->rc.is_src_frame_alt_ref && - !(cpi->use_svc && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) - cpi->rc.gf_rate_correction_factor = factor; + !rc->is_src_frame_alt_ref && + !(cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR)) + rc->rate_correction_factors[GF_ARF_STD] = factor; else - cpi->rc.rate_correction_factor = factor; 
+ rc->rate_correction_factors[INTER_NORMAL] = factor; } } @@ -304,6 +318,10 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) { int projected_size_based_on_q = 0; + // Do not update the rate factors for arf overlay frames. + if (cpi->rc.is_src_frame_alt_ref) + return; + // Clear down mmx registers to allow floating point in what follows vp9_clear_system_state(); @@ -367,8 +385,8 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame, // Calculate required scaling factor based on target frame size and size of // frame produced using previous Q. - target_bits_per_mb = - ((uint64_t)target_bits_per_frame << BPER_MB_NORMBITS) / cm->MBs; + target_bits_per_mb = + ((uint64_t)target_bits_per_frame << BPER_MB_NORMBITS) / cm->MBs; i = active_best_quality; @@ -406,6 +424,16 @@ static int get_active_quality(int q, int gfu_boost, int low, int high, } } +static int get_kf_active_quality(const RATE_CONTROL *const rc, int q) { + return get_active_quality(q, rc->kf_boost, kf_low, kf_high, + kf_low_motion_minq, kf_high_motion_minq); +} + +static int get_gf_active_quality(const RATE_CONTROL *const rc, int q) { + return get_active_quality(q, rc->gfu_boost, gf_low, gf_high, + arfgf_low_motion_minq, arfgf_high_motion_minq); +} + static int calc_active_worst_quality_one_pass_vbr(const VP9_COMP *cpi) { const RATE_CONTROL *const rc = &cpi->rc; const unsigned int curr_frame = cpi->common.current_video_frame; @@ -424,7 +452,6 @@ static int calc_active_worst_quality_one_pass_vbr(const VP9_COMP *cpi) { : rc->last_q[INTER_FRAME] * 2; } } - return MIN(active_worst_quality, rc->worst_quality); } @@ -436,10 +463,9 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) { // ambient Q (at buffer = optimal level) to worst_quality level // (at buffer = critical level). 
const VP9_COMMON *const cm = &cpi->common; - const VP9_CONFIG *oxcf = &cpi->oxcf; const RATE_CONTROL *rc = &cpi->rc; // Buffer level below which we push active_worst to worst_quality. - int64_t critical_level = oxcf->optimal_buffer_level >> 2; + int64_t critical_level = rc->optimal_buffer_level >> 2; int64_t buff_lvl_step = 0; int adjustment = 0; int active_worst_quality; @@ -451,26 +477,26 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) { else active_worst_quality = MIN(rc->worst_quality, rc->avg_frame_qindex[KEY_FRAME] * 3 / 2); - if (rc->buffer_level > oxcf->optimal_buffer_level) { + if (rc->buffer_level > rc->optimal_buffer_level) { // Adjust down. // Maximum limit for down adjustment, ~30%. int max_adjustment_down = active_worst_quality / 3; if (max_adjustment_down) { - buff_lvl_step = ((oxcf->maximum_buffer_size - - oxcf->optimal_buffer_level) / max_adjustment_down); + buff_lvl_step = ((rc->maximum_buffer_size - + rc->optimal_buffer_level) / max_adjustment_down); if (buff_lvl_step) - adjustment = (int)((rc->buffer_level - oxcf->optimal_buffer_level) / + adjustment = (int)((rc->buffer_level - rc->optimal_buffer_level) / buff_lvl_step); active_worst_quality -= adjustment; } } else if (rc->buffer_level > critical_level) { // Adjust up from ambient Q. 
if (critical_level) { - buff_lvl_step = (oxcf->optimal_buffer_level - critical_level); + buff_lvl_step = (rc->optimal_buffer_level - critical_level); if (buff_lvl_step) { adjustment = (int)((rc->worst_quality - rc->avg_frame_qindex[INTER_FRAME]) * - (oxcf->optimal_buffer_level - rc->buffer_level) / + (rc->optimal_buffer_level - rc->buffer_level) / buff_lvl_step); } active_worst_quality = rc->avg_frame_qindex[INTER_FRAME] + adjustment; @@ -507,11 +533,8 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi, double q_adj_factor = 1.0; double q_val; - active_best_quality = get_active_quality(rc->avg_frame_qindex[KEY_FRAME], - rc->kf_boost, - kf_low, kf_high, - kf_low_motion_minq, - kf_high_motion_minq); + active_best_quality = + get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME]); // Allow somewhat lower kf minq with small image formats. if ((cm->width * cm->height) <= (352 * 288)) { @@ -536,21 +559,19 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi, } else { q = active_worst_quality; } - active_best_quality = get_active_quality( - q, rc->gfu_boost, gf_low, gf_high, - gf_low_motion_minq, gf_high_motion_minq); + active_best_quality = get_gf_active_quality(rc, q); } else { // Use the lower of active_worst_quality and recent/average Q. 
if (cm->current_video_frame > 1) { if (rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) - active_best_quality = inter_minq[rc->avg_frame_qindex[INTER_FRAME]]; + active_best_quality = rtc_minq[rc->avg_frame_qindex[INTER_FRAME]]; else - active_best_quality = inter_minq[active_worst_quality]; + active_best_quality = rtc_minq[active_worst_quality]; } else { if (rc->avg_frame_qindex[KEY_FRAME] < active_worst_quality) - active_best_quality = inter_minq[rc->avg_frame_qindex[KEY_FRAME]]; + active_best_quality = rtc_minq[rc->avg_frame_qindex[KEY_FRAME]]; else - active_best_quality = inter_minq[active_worst_quality]; + active_best_quality = rtc_minq[active_worst_quality]; } } @@ -565,11 +586,18 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi, #if LIMIT_QRANGE_FOR_ALTREF_AND_KEY // Limit Q range for the adaptive loop. - if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced) { - if (!(cm->current_video_frame == 0)) - *top_index = (active_worst_quality + active_best_quality * 3) / 4; + if (cm->frame_type == KEY_FRAME && + !rc->this_key_frame_forced && + !(cm->current_video_frame == 0)) { + int qdelta = 0; + vp9_clear_system_state(); + qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, + active_worst_quality, 2.0); + *top_index = active_worst_quality + qdelta; + *top_index = (*top_index > *bottom_index) ? 
*top_index : *bottom_index; } #endif + // Special case code to try and match quality with forced key frames if (cm->frame_type == KEY_FRAME && rc->this_key_frame_forced) { q = rc->last_boosted_qindex; @@ -592,20 +620,35 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi, return q; } +static int get_active_cq_level(const RATE_CONTROL *rc, + const VP9EncoderConfig *const oxcf) { + static const double cq_adjust_threshold = 0.5; + int active_cq_level = oxcf->cq_level; + if (oxcf->rc_mode == VPX_CQ && + rc->total_target_bits > 0) { + const double x = (double)rc->total_actual_bits / rc->total_target_bits; + if (x < cq_adjust_threshold) { + active_cq_level = (int)(active_cq_level * x / cq_adjust_threshold); + } + } + return active_cq_level; +} + static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, int *bottom_index, int *top_index) { const VP9_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; - const VP9_CONFIG *const oxcf = &cpi->oxcf; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const int cq_level = get_active_cq_level(rc, oxcf); int active_best_quality; int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi); int q; if (frame_is_intra_only(cm)) { active_best_quality = rc->best_quality; -#if !CONFIG_MULTIPLE_ARF - // Handle the special case for key frames forced when we have75 reached + + // Handle the special case for key frames forced when we have reached // the maximum key frame interval. Here force the Q to a range // based on the ambient Q to reduce the risk of popping. 
if (rc->this_key_frame_forced) { @@ -614,16 +657,13 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, int delta_qindex = vp9_compute_qdelta(rc, last_boosted_q, last_boosted_q * 0.75); active_best_quality = MAX(qindex + delta_qindex, rc->best_quality); - } else if (cm->current_video_frame > 0) { + } else { // not first frame of one pass and kf_boost is set double q_adj_factor = 1.0; double q_val; - active_best_quality = get_active_quality(rc->avg_frame_qindex[KEY_FRAME], - rc->kf_boost, - kf_low, kf_high, - kf_low_motion_minq, - kf_high_motion_minq); + active_best_quality = + get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME]); // Allow somewhat lower kf minq with small image formats. if ((cm->width * cm->height) <= (352 * 288)) { @@ -636,13 +676,6 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, active_best_quality += vp9_compute_qdelta(rc, q_val, q_val * q_adj_factor); } -#else - double current_q; - // Force the KF quantizer to be 30% of the active_worst_quality. 
- current_q = vp9_convert_qindex_to_q(active_worst_quality); - active_best_quality = active_worst_quality - + vp9_compute_qdelta(rc, current_q, current_q * 0.3); -#endif } else if (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { // Use the lower of active_worst_quality and recent @@ -655,45 +688,27 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, q = rc->avg_frame_qindex[KEY_FRAME]; } // For constrained quality dont allow Q less than the cq level - if (oxcf->end_usage == USAGE_CONSTRAINED_QUALITY) { - if (q < cpi->cq_target_quality) - q = cpi->cq_target_quality; - if (rc->frames_since_key > 1) { - active_best_quality = get_active_quality(q, rc->gfu_boost, - gf_low, gf_high, - afq_low_motion_minq, - afq_high_motion_minq); - } else { - active_best_quality = get_active_quality(q, rc->gfu_boost, - gf_low, gf_high, - gf_low_motion_minq, - gf_high_motion_minq); - } + if (oxcf->rc_mode == VPX_CQ) { + if (q < cq_level) + q = cq_level; + + active_best_quality = get_gf_active_quality(rc, q); + // Constrained quality use slightly lower active best. 
active_best_quality = active_best_quality * 15 / 16; - } else if (oxcf->end_usage == USAGE_CONSTANT_QUALITY) { + } else if (oxcf->rc_mode == VPX_Q) { if (!cpi->refresh_alt_ref_frame) { - active_best_quality = cpi->cq_target_quality; + active_best_quality = cq_level; } else { - if (rc->frames_since_key > 1) { - active_best_quality = get_active_quality( - q, rc->gfu_boost, gf_low, gf_high, - afq_low_motion_minq, afq_high_motion_minq); - } else { - active_best_quality = get_active_quality( - q, rc->gfu_boost, gf_low, gf_high, - gf_low_motion_minq, gf_high_motion_minq); - } + active_best_quality = get_gf_active_quality(rc, q); } } else { - active_best_quality = get_active_quality( - q, rc->gfu_boost, gf_low, gf_high, - gf_low_motion_minq, gf_high_motion_minq); + active_best_quality = get_gf_active_quality(rc, q); } } else { - if (oxcf->end_usage == USAGE_CONSTANT_QUALITY) { - active_best_quality = cpi->cq_target_quality; + if (oxcf->rc_mode == VPX_Q) { + active_best_quality = cq_level; } else { // Use the lower of active_worst_quality and recent/average Q. if (cm->current_video_frame > 1) @@ -702,15 +717,9 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, active_best_quality = inter_minq[rc->avg_frame_qindex[KEY_FRAME]]; // For the constrained quality mode we don't want // q to fall below the cq level. - if ((oxcf->end_usage == USAGE_CONSTRAINED_QUALITY) && - (active_best_quality < cpi->cq_target_quality)) { - // If we are strongly undershooting the target rate in the last - // frames then use the user passed in cq value not the auto - // cq value. 
- if (rc->rolling_actual_bits < rc->min_frame_bandwidth) - active_best_quality = oxcf->cq_level; - else - active_best_quality = cpi->cq_target_quality; + if ((oxcf->rc_mode == VPX_CQ) && + (active_best_quality < cq_level)) { + active_best_quality = cq_level; } } } @@ -725,16 +734,27 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, *bottom_index = active_best_quality; #if LIMIT_QRANGE_FOR_ALTREF_AND_KEY - // Limit Q range for the adaptive loop. - if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced) { - if (!(cm->current_video_frame == 0)) - *top_index = (active_worst_quality + active_best_quality * 3) / 4; - } else if (!rc->is_src_frame_alt_ref && - (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { - *top_index = (active_worst_quality + active_best_quality) / 2; + { + int qdelta = 0; + vp9_clear_system_state(); + + // Limit Q range for the adaptive loop. + if (cm->frame_type == KEY_FRAME && + !rc->this_key_frame_forced && + !(cm->current_video_frame == 0)) { + qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, + active_worst_quality, 2.0); + } else if (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, + active_worst_quality, 1.75); + } + *top_index = active_worst_quality + qdelta; + *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index; } #endif - if (oxcf->end_usage == USAGE_CONSTANT_QUALITY) { + + if (oxcf->rc_mode == VPX_Q) { q = active_best_quality; // Special case code to try and match quality with forced key frames } else if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced) { @@ -750,23 +770,7 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, q = *top_index; } } -#if CONFIG_MULTIPLE_ARF - // Force the quantizer determined by the coding order pattern. 
- if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME) && - cpi->oxcf.end_usage != USAGE_CONSTANT_QUALITY) { - double new_q; - double current_q = vp9_convert_qindex_to_q(active_worst_quality); - int level = cpi->this_frame_weight; - assert(level >= 0); - new_q = current_q * (1.0 - (0.2 * (cpi->max_arf_level - level))); - q = active_worst_quality + - vp9_compute_qdelta(rc, current_q, new_q); - - *bottom_index = q; - *top_index = q; - printf("frame:%d q:%d\n", cm->current_video_frame, q); - } -#endif + assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality); assert(*bottom_index <= rc->worst_quality && @@ -780,13 +784,13 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *top_index) { const VP9_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; - const VP9_CONFIG *const oxcf = &cpi->oxcf; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const int cq_level = get_active_cq_level(rc, oxcf); int active_best_quality; int active_worst_quality = cpi->twopass.active_worst_quality; int q; - if (frame_is_intra_only(cm)) { -#if !CONFIG_MULTIPLE_ARF + if (frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi)) { // Handle the special case for key frames forced when we have75 reached // the maximum key frame interval. Here force the Q to a range // based on the ambient Q to reduce the risk of popping. @@ -801,11 +805,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, double q_adj_factor = 1.0; double q_val; // Baseline value derived from cpi->active_worst_quality and kf boost. - active_best_quality = get_active_quality(active_worst_quality, - rc->kf_boost, - kf_low, kf_high, - kf_low_motion_minq, - kf_high_motion_minq); + active_best_quality = get_kf_active_quality(rc, active_worst_quality); // Allow somewhat lower kf minq with small image formats. 
if ((cm->width * cm->height) <= (352 * 288)) { @@ -821,13 +821,6 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, active_best_quality += vp9_compute_qdelta(rc, q_val, q_val * q_adj_factor); } -#else - double current_q; - // Force the KF quantizer to be 30% of the active_worst_quality. - current_q = vp9_convert_qindex_to_q(active_worst_quality); - active_best_quality = active_worst_quality - + vp9_compute_qdelta(rc, current_q, current_q * 0.3); -#endif } else if (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { // Use the lower of active_worst_quality and recent @@ -840,59 +833,35 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, q = active_worst_quality; } // For constrained quality dont allow Q less than the cq level - if (oxcf->end_usage == USAGE_CONSTRAINED_QUALITY) { - if (q < cpi->cq_target_quality) - q = cpi->cq_target_quality; - if (rc->frames_since_key > 1) { - active_best_quality = get_active_quality(q, rc->gfu_boost, - gf_low, gf_high, - afq_low_motion_minq, - afq_high_motion_minq); - } else { - active_best_quality = get_active_quality(q, rc->gfu_boost, - gf_low, gf_high, - gf_low_motion_minq, - gf_high_motion_minq); - } + if (oxcf->rc_mode == VPX_CQ) { + if (q < cq_level) + q = cq_level; + + active_best_quality = get_gf_active_quality(rc, q); + // Constrained quality use slightly lower active best. 
active_best_quality = active_best_quality * 15 / 16; - } else if (oxcf->end_usage == USAGE_CONSTANT_QUALITY) { + } else if (oxcf->rc_mode == VPX_Q) { if (!cpi->refresh_alt_ref_frame) { - active_best_quality = cpi->cq_target_quality; + active_best_quality = cq_level; } else { - if (rc->frames_since_key > 1) { - active_best_quality = get_active_quality( - q, rc->gfu_boost, gf_low, gf_high, - afq_low_motion_minq, afq_high_motion_minq); - } else { - active_best_quality = get_active_quality( - q, rc->gfu_boost, gf_low, gf_high, - gf_low_motion_minq, gf_high_motion_minq); - } + active_best_quality = get_gf_active_quality(rc, q); } } else { - active_best_quality = get_active_quality( - q, rc->gfu_boost, gf_low, gf_high, - gf_low_motion_minq, gf_high_motion_minq); + active_best_quality = get_gf_active_quality(rc, q); } } else { - if (oxcf->end_usage == USAGE_CONSTANT_QUALITY) { - active_best_quality = cpi->cq_target_quality; + if (oxcf->rc_mode == VPX_Q) { + active_best_quality = cq_level; } else { active_best_quality = inter_minq[active_worst_quality]; // For the constrained quality mode we don't want // q to fall below the cq level. - if ((oxcf->end_usage == USAGE_CONSTRAINED_QUALITY) && - (active_best_quality < cpi->cq_target_quality)) { - // If we are strongly undershooting the target rate in the last - // frames then use the user passed in cq value not the auto - // cq value. - if (rc->rolling_actual_bits < rc->min_frame_bandwidth) - active_best_quality = oxcf->cq_level; - else - active_best_quality = cpi->cq_target_quality; + if ((oxcf->rc_mode == VPX_CQ) && + (active_best_quality < cq_level)) { + active_best_quality = cq_level; } } } @@ -907,17 +876,26 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, *bottom_index = active_best_quality; #if LIMIT_QRANGE_FOR_ALTREF_AND_KEY - // Limit Q range for the adaptive loop. 
- if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced) { - *top_index = (active_worst_quality + active_best_quality * 3) / 4; - } else if (!rc->is_src_frame_alt_ref && - (oxcf->end_usage != USAGE_STREAM_FROM_SERVER) && - (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { - *top_index = (active_worst_quality + active_best_quality) / 2; + vp9_clear_system_state(); + { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + const double rate_factor_deltas[RATE_FACTOR_LEVELS] = { + 1.00, // INTER_NORMAL + 1.00, // INTER_HIGH + 1.50, // GF_ARF_LOW + 1.75, // GF_ARF_STD + 2.00, // KF_STD + }; + const double rate_factor = + rate_factor_deltas[gf_group->rf_level[gf_group->index]]; + int qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, + active_worst_quality, rate_factor); + *top_index = active_worst_quality + qdelta; + *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index; } #endif - if (oxcf->end_usage == USAGE_CONSTANT_QUALITY) { + if (oxcf->rc_mode == VPX_Q) { q = active_best_quality; // Special case code to try and match quality with forced key frames. } else if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced) { @@ -933,23 +911,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, q = *top_index; } } -#if CONFIG_MULTIPLE_ARF - // Force the quantizer determined by the coding order pattern. 
- if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME) && - cpi->oxcf.end_usage != USAGE_CONSTANT_QUALITY) { - double new_q; - double current_q = vp9_convert_qindex_to_q(active_worst_quality); - int level = cpi->this_frame_weight; - assert(level >= 0); - new_q = current_q * (1.0 - (0.2 * (cpi->max_arf_level - level))); - q = active_worst_quality + - vp9_compute_qdelta(rc, current_q, new_q); - - *bottom_index = q; - *top_index = q; - printf("frame:%d q:%d\n", cm->current_video_frame, q); - } -#endif + assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality); assert(*bottom_index <= rc->worst_quality && @@ -961,20 +923,15 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int vp9_rc_pick_q_and_bounds(const VP9_COMP *cpi, int *bottom_index, int *top_index) { int q; - if (cpi->pass == 0) { - if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) + if (cpi->oxcf.pass == 0) { + if (cpi->oxcf.rc_mode == VPX_CBR) q = rc_pick_q_and_bounds_one_pass_cbr(cpi, bottom_index, top_index); else q = rc_pick_q_and_bounds_one_pass_vbr(cpi, bottom_index, top_index); } else { q = rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index); } - - // Q of 0 is disabled because we force tx size to be - // 16x16... 
if (cpi->sf.use_nonrd_pick_mode) { - if (q == 0) - q++; if (cpi->sf.force_frame_boost == 1) q -= cpi->sf.max_delta_qindex; @@ -987,31 +944,19 @@ int vp9_rc_pick_q_and_bounds(const VP9_COMP *cpi, } void vp9_rc_compute_frame_size_bounds(const VP9_COMP *cpi, - int this_frame_target, + int frame_target, int *frame_under_shoot_limit, int *frame_over_shoot_limit) { - // Set-up bounds on acceptable frame size: - if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) { + if (cpi->oxcf.rc_mode == VPX_Q) { *frame_under_shoot_limit = 0; *frame_over_shoot_limit = INT_MAX; } else { - int recode_tolerance = - (cpi->sf.recode_tolerance * this_frame_target) / 100; - - *frame_over_shoot_limit = this_frame_target + recode_tolerance; - *frame_under_shoot_limit = this_frame_target - recode_tolerance; - // For very small rate targets where the fractional adjustment // may be tiny make sure there is at least a minimum range. - *frame_over_shoot_limit += 200; - *frame_under_shoot_limit -= 200; - if (*frame_under_shoot_limit < 0) - *frame_under_shoot_limit = 0; - - // Clip to maximum allowed rate for a frame. - if (*frame_over_shoot_limit > cpi->rc.max_frame_bandwidth) { - *frame_over_shoot_limit = cpi->rc.max_frame_bandwidth; - } + const int tolerance = (cpi->sf.recode_tolerance * frame_target) / 100; + *frame_under_shoot_limit = MAX(frame_target - tolerance - 200, 0); + *frame_over_shoot_limit = MIN(frame_target + tolerance + 200, + cpi->rc.max_frame_bandwidth); } } @@ -1020,6 +965,7 @@ void vp9_rc_set_frame_target(VP9_COMP *cpi, int target) { RATE_CONTROL *const rc = &cpi->rc; rc->this_frame_target = target; + // Target rate per SB64 (including partial SB64s. 
rc->sb64_target_rate = ((int64_t)rc->this_frame_target * 64 * 64) / (cm->width * cm->height); @@ -1030,11 +976,8 @@ static void update_alt_ref_frame_stats(VP9_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; rc->frames_since_golden = 0; -#if CONFIG_MULTIPLE_ARF - if (!cpi->multi_arf_enabled) -#endif - // Clear the alternate reference update pending flag. - rc->source_alt_ref_pending = 0; + // Mark the alt ref as done (setting to 0 means no further alt refs pending). + rc->source_alt_ref_pending = 0; // Set the alternate reference frame active flag rc->source_alt_ref_active = 1; @@ -1048,8 +991,13 @@ static void update_golden_frame_stats(VP9_COMP *cpi) { // this frame refreshes means next frames don't unless specified by user rc->frames_since_golden = 0; - if (!rc->source_alt_ref_pending) + if (cpi->oxcf.pass == 2) { + if (!rc->source_alt_ref_pending && + cpi->twopass.gf_group.rf_level[0] == GF_ARF_STD) + rc->source_alt_ref_active = 0; + } else if (!rc->source_alt_ref_pending) { rc->source_alt_ref_active = 0; + } // Decrement count down till next gf if (rc->frames_till_gf_update_due > 0) @@ -1065,41 +1013,39 @@ static void update_golden_frame_stats(VP9_COMP *cpi) { } void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { - VP9_COMMON *const cm = &cpi->common; - const VP9_CONFIG *const oxcf = &cpi->oxcf; + const VP9_COMMON *const cm = &cpi->common; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; + const int qindex = cm->base_qindex; - cm->last_frame_type = cm->frame_type; // Update rate control heuristics rc->projected_frame_size = (int)(bytes_used << 3); // Post encode loop adjustment of Q prediction. vp9_rc_update_rate_correction_factors( cpi, (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF || - oxcf->end_usage == USAGE_STREAM_FROM_SERVER) ? 2 : 0); + oxcf->rc_mode == VPX_CBR) ? 2 : 0); // Keep a record of last Q and ambient average Q. 
if (cm->frame_type == KEY_FRAME) { - rc->last_q[KEY_FRAME] = cm->base_qindex; - rc->avg_frame_qindex[KEY_FRAME] = ROUND_POWER_OF_TWO( - 3 * rc->avg_frame_qindex[KEY_FRAME] + cm->base_qindex, 2); - } else if (!rc->is_src_frame_alt_ref && - (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) && - !(cpi->use_svc && oxcf->end_usage == USAGE_STREAM_FROM_SERVER)) { - rc->last_q[2] = cm->base_qindex; - rc->avg_frame_qindex[2] = ROUND_POWER_OF_TWO( - 3 * rc->avg_frame_qindex[2] + cm->base_qindex, 2); + rc->last_q[KEY_FRAME] = qindex; + rc->avg_frame_qindex[KEY_FRAME] = + ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2); } else { - rc->last_q[INTER_FRAME] = cm->base_qindex; - rc->avg_frame_qindex[INTER_FRAME] = ROUND_POWER_OF_TWO( - 3 * rc->avg_frame_qindex[INTER_FRAME] + cm->base_qindex, 2); - rc->ni_frames++; - rc->tot_q += vp9_convert_qindex_to_q(cm->base_qindex); - rc->avg_q = rc->tot_q / (double)rc->ni_frames; - - // Calculate the average Q for normal inter frames (not key or GFU frames). - rc->ni_tot_qi += cm->base_qindex; - rc->ni_av_qi = rc->ni_tot_qi / rc->ni_frames; + if (rc->is_src_frame_alt_ref || + !(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) || + (cpi->use_svc && oxcf->rc_mode == VPX_CBR)) { + rc->last_q[INTER_FRAME] = qindex; + rc->avg_frame_qindex[INTER_FRAME] = + ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2); + rc->ni_frames++; + rc->tot_q += vp9_convert_qindex_to_q(qindex); + rc->avg_q = rc->tot_q / rc->ni_frames; + // Calculate the average Q for normal inter frames (not key or GFU + // frames). + rc->ni_tot_qi += qindex; + rc->ni_av_qi = rc->ni_tot_qi / rc->ni_frames; + } } // Keep record of last boosted (KF/KF/ARF) Q value. @@ -1107,11 +1053,11 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { // If all mbs in this group are skipped only update if the Q value is // better than that already stored. 
// This is used to help set quality in forced key frames to reduce popping - if ((cm->base_qindex < rc->last_boosted_qindex) || + if ((qindex < rc->last_boosted_qindex) || ((cpi->static_mb_pct < 100) && ((cm->frame_type == KEY_FRAME) || cpi->refresh_alt_ref_frame || (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) { - rc->last_boosted_qindex = cm->base_qindex; + rc->last_boosted_qindex = qindex; } update_buffer_level(cpi, rc->projected_frame_size); @@ -1131,11 +1077,11 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { // Actual bits spent rc->total_actual_bits += rc->projected_frame_size; - rc->total_target_bits += (cm->show_frame ? rc->av_per_frame_bandwidth : 0); + rc->total_target_bits += cm->show_frame ? rc->avg_frame_bandwidth : 0; rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits; - if (oxcf->play_alternate && cpi->refresh_alt_ref_frame && + if (is_altref_enabled(cpi) && cpi->refresh_alt_ref_frame && (cm->frame_type != KEY_FRAME)) // Update the alternate reference frame stats as appropriate. update_alt_ref_frame_stats(cpi); @@ -1159,10 +1105,6 @@ void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { cpi->rc.frames_to_key--; } -static int test_for_kf_one_pass(VP9_COMP *cpi) { - // Placeholder function for auto key frame - return 0; -} // Use this macro to turn on/off use of alt-refs in one-pass mode. #define USE_ALTREF_FOR_ONE_PASS 1 @@ -1173,12 +1115,12 @@ static int calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { #if USE_ALTREF_FOR_ONE_PASS target = (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) ? 
- (rc->av_per_frame_bandwidth * rc->baseline_gf_interval * af_ratio) / + (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio) / (rc->baseline_gf_interval + af_ratio - 1) : - (rc->av_per_frame_bandwidth * rc->baseline_gf_interval) / + (rc->avg_frame_bandwidth * rc->baseline_gf_interval) / (rc->baseline_gf_interval + af_ratio - 1); #else - target = rc->av_per_frame_bandwidth; + target = rc->avg_frame_bandwidth; #endif return vp9_rc_clamp_pframe_target_size(cpi, target); } @@ -1186,7 +1128,7 @@ static int calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { static int calc_iframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { static const int kf_ratio = 25; const RATE_CONTROL *rc = &cpi->rc; - int target = rc->av_per_frame_bandwidth * kf_ratio; + const int target = rc->avg_frame_bandwidth * kf_ratio; return vp9_rc_clamp_iframe_target_size(cpi, target); } @@ -1194,15 +1136,16 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; int target; + // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic. 
if (!cpi->refresh_alt_ref_frame && (cm->current_video_frame == 0 || - (cm->frame_flags & FRAMEFLAGS_KEY) || + (cpi->frame_flags & FRAMEFLAGS_KEY) || rc->frames_to_key == 0 || - (cpi->oxcf.auto_key && test_for_kf_one_pass(cpi)))) { + (cpi->oxcf.auto_key && 0))) { cm->frame_type = KEY_FRAME; rc->this_key_frame_forced = cm->current_video_frame != 0 && rc->frames_to_key == 0; - rc->frames_to_key = cpi->key_frame_frequency; + rc->frames_to_key = cpi->oxcf.key_freq; rc->kf_boost = DEFAULT_KF_BOOST; rc->source_alt_ref_active = 0; } else { @@ -1226,17 +1169,16 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { } static int calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { - const VP9_CONFIG *oxcf = &cpi->oxcf; + const VP9EncoderConfig *oxcf = &cpi->oxcf; const RATE_CONTROL *rc = &cpi->rc; const SVC *const svc = &cpi->svc; - const int64_t diff = oxcf->optimal_buffer_level - rc->buffer_level; - const int64_t one_pct_bits = 1 + oxcf->optimal_buffer_level / 100; - int min_frame_target = MAX(rc->av_per_frame_bandwidth >> 4, - FRAME_OVERHEAD_BITS); - int target = rc->av_per_frame_bandwidth; + const int64_t diff = rc->optimal_buffer_level - rc->buffer_level; + const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100; + int min_frame_target = MAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS); + int target = rc->avg_frame_bandwidth; if (svc->number_temporal_layers > 1 && - oxcf->end_usage == USAGE_STREAM_FROM_SERVER) { - // Note that for layers, av_per_frame_bandwidth is the cumulative + oxcf->rc_mode == VPX_CBR) { + // Note that for layers, avg_frame_bandwidth is the cumulative // per-frame-bandwidth. For the target size of this frame, use the // layer average frame size (i.e., non-cumulative per-frame-bw). 
int current_temporal_layer = svc->temporal_layer_id; @@ -1258,19 +1200,27 @@ static int calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { static int calc_iframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { const RATE_CONTROL *rc = &cpi->rc; + const VP9EncoderConfig *oxcf = &cpi->oxcf; + const SVC *const svc = &cpi->svc; int target; - if (cpi->common.current_video_frame == 0) { - target = ((cpi->oxcf.starting_buffer_level / 2) > INT_MAX) - ? INT_MAX : (int)(cpi->oxcf.starting_buffer_level / 2); + target = ((rc->starting_buffer_level / 2) > INT_MAX) + ? INT_MAX : (int)(rc->starting_buffer_level / 2); } else { - const int initial_boost = 32; - int kf_boost = MAX(initial_boost, (int)(2 * cpi->output_framerate - 16)); - if (rc->frames_since_key < cpi->output_framerate / 2) { + int kf_boost = 32; + double framerate = oxcf->framerate; + if (svc->number_temporal_layers > 1 && + oxcf->rc_mode == VPX_CBR) { + // Use the layer framerate for temporal layers CBR mode. + const LAYER_CONTEXT *lc = &svc->layer_context[svc->temporal_layer_id]; + framerate = lc->framerate; + } + kf_boost = MAX(kf_boost, (int)(2 * framerate - 16)); + if (rc->frames_since_key < framerate / 2) { kf_boost = (int)(kf_boost * rc->frames_since_key / - (cpi->output_framerate / 2)); + (framerate / 2)); } - target = ((16 + kf_boost) * rc->av_per_frame_bandwidth) >> 4; + target = ((16 + kf_boost) * rc->avg_frame_bandwidth) >> 4; } return vp9_rc_clamp_iframe_target_size(cpi, target); } @@ -1278,19 +1228,39 @@ static int calc_iframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { void vp9_rc_get_svc_params(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; - int target = rc->av_per_frame_bandwidth; + int target = rc->avg_frame_bandwidth; if ((cm->current_video_frame == 0) || - (cm->frame_flags & FRAMEFLAGS_KEY) || + (cpi->frame_flags & FRAMEFLAGS_KEY) || (cpi->oxcf.auto_key && (rc->frames_since_key % - cpi->key_frame_frequency == 0))) { + cpi->oxcf.key_freq == 
0))) { cm->frame_type = KEY_FRAME; rc->source_alt_ref_active = 0; - if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + + if (is_spatial_svc(cpi)) { + cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame = 1; + cpi->ref_frame_flags &= + (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); + } + + if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR) { target = calc_iframe_target_size_one_pass_cbr(cpi); } } else { cm->frame_type = INTER_FRAME; - if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + + if (is_spatial_svc(cpi)) { + LAYER_CONTEXT *lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id]; + if (cpi->svc.spatial_layer_id == 0) { + lc->is_key_frame = 0; + } else { + lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame; + if (lc->is_key_frame) + cpi->ref_frame_flags &= (~VP9_LAST_FLAG); + } + cpi->ref_frame_flags &= (~VP9_ALT_FLAG); + } + + if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR) { target = calc_pframe_target_size_one_pass_cbr(cpi); } } @@ -1303,14 +1273,15 @@ void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; int target; + // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic. 
if ((cm->current_video_frame == 0 || - (cm->frame_flags & FRAMEFLAGS_KEY) || + (cpi->frame_flags & FRAMEFLAGS_KEY) || rc->frames_to_key == 0 || - (cpi->oxcf.auto_key && test_for_kf_one_pass(cpi)))) { + (cpi->oxcf.auto_key && 0))) { cm->frame_type = KEY_FRAME; rc->this_key_frame_forced = cm->current_video_frame != 0 && rc->frames_to_key == 0; - rc->frames_to_key = cpi->key_frame_frequency; + rc->frames_to_key = cpi->oxcf.key_freq; rc->kf_boost = DEFAULT_KF_BOOST; rc->source_alt_ref_active = 0; target = calc_iframe_target_size_one_pass_cbr(cpi); @@ -1366,3 +1337,50 @@ int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type, return target_index - qindex; } + +void vp9_rc_set_gf_max_interval(const VP9_COMP *const cpi, + RATE_CONTROL *const rc) { + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + // Set Maximum gf/arf interval + rc->max_gf_interval = 16; + + // Extended interval for genuinely static scenes + rc->static_scene_max_gf_interval = oxcf->key_freq >> 1; + if (rc->static_scene_max_gf_interval > (MAX_LAG_BUFFERS * 2)) + rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2; + + if (is_altref_enabled(cpi)) { + if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1) + rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1; + } + + if (rc->max_gf_interval > rc->static_scene_max_gf_interval) + rc->max_gf_interval = rc->static_scene_max_gf_interval; +} + +void vp9_rc_update_framerate(VP9_COMP *cpi) { + const VP9_COMMON *const cm = &cpi->common; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + RATE_CONTROL *const rc = &cpi->rc; + int vbr_max_bits; + + rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / oxcf->framerate); + rc->min_frame_bandwidth = (int)(rc->avg_frame_bandwidth * + oxcf->two_pass_vbrmin_section / 100); + + rc->min_frame_bandwidth = MAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS); + + // A maximum bitrate for a frame is defined. 
+ // The baseline for this aligns with HW implementations that + // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits + // per 16x16 MB (averaged over a frame). However this limit is extended if + // a very high rate is given on the command line or the the rate cannnot + // be acheived because of a user specificed max q (e.g. when the user + // specifies lossless encode. + vbr_max_bits = (int)(((int64_t)rc->avg_frame_bandwidth * + oxcf->two_pass_vbrmax_section) / 100); + rc->max_frame_bandwidth = MAX(MAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), + vbr_max_bits); + + vp9_rc_set_gf_max_interval(cpi, rc); +} diff --git a/libvpx/vp9/encoder/vp9_ratectrl.h b/libvpx/vp9/encoder/vp9_ratectrl.h index 7693c2b13..456daf48d 100644 --- a/libvpx/vp9/encoder/vp9_ratectrl.h +++ b/libvpx/vp9/encoder/vp9_ratectrl.h @@ -20,26 +20,33 @@ extern "C" { #endif -#define FRAME_OVERHEAD_BITS 200 - // Bits Per MB at different Q (Multiplied by 512) #define BPER_MB_NORMBITS 9 +typedef enum { + INTER_NORMAL = 0, + INTER_HIGH = 1, + GF_ARF_LOW = 2, + GF_ARF_STD = 3, + KF_STD = 4, + RATE_FACTOR_LEVELS = 5 +} RATE_FACTOR_LEVEL; + typedef struct { // Rate targetting variables - int this_frame_target; + int base_frame_target; // A baseline frame target before adjustment + // for previous under or over shoot. + int this_frame_target; // Actual frame target after rc adjustment. 
int projected_frame_size; int sb64_target_rate; - int last_q[3]; // Separate values for Intra/Inter/ARF-GF + int last_q[FRAME_TYPES]; // Separate values for Intra/Inter int last_boosted_qindex; // Last boosted GF/KF/ARF q int gfu_boost; int last_boost; int kf_boost; - double rate_correction_factor; - double key_frame_rate_correction_factor; - double gf_rate_correction_factor; + double rate_correction_factors[RATE_FACTOR_LEVELS]; int frames_since_golden; int frames_till_gf_update_due; @@ -54,19 +61,20 @@ typedef struct { int source_alt_ref_active; int is_src_frame_alt_ref; - int av_per_frame_bandwidth; // Average frame size target for clip - int min_frame_bandwidth; // Minimum allocation used for any frame - int max_frame_bandwidth; // Maximum burst rate allowed for a frame. + int avg_frame_bandwidth; // Average frame size target for clip + int min_frame_bandwidth; // Minimum allocation used for any frame + int max_frame_bandwidth; // Maximum burst rate allowed for a frame. int ni_av_qi; int ni_tot_qi; int ni_frames; - int avg_frame_qindex[3]; // 0 - KEY, 1 - INTER, 2 - ARF/GF + int avg_frame_qindex[FRAME_TYPES]; double tot_q; double avg_q; int64_t buffer_level; int64_t bits_off_target; + int64_t vbr_bits_off_target; int decimation_factor; int decimation_count; @@ -83,13 +91,18 @@ typedef struct { int worst_quality; int best_quality; + + int64_t starting_buffer_level; + int64_t optimal_buffer_level; + int64_t maximum_buffer_size; // int active_best_quality; } RATE_CONTROL; struct VP9_COMP; -struct VP9_CONFIG; +struct VP9EncoderConfig; -void vp9_rc_init(const struct VP9_CONFIG *oxcf, int pass, RATE_CONTROL *rc); +void vp9_rc_init(const struct VP9EncoderConfig *oxcf, int pass, + RATE_CONTROL *rc); double vp9_convert_qindex_to_q(int qindex); @@ -125,8 +138,7 @@ void vp9_rc_get_svc_params(struct VP9_COMP *cpi); // Post encode update of the rate control parameters based // on bytes used -void vp9_rc_postencode_update(struct VP9_COMP *cpi, - uint64_t bytes_used); +void 
vp9_rc_postencode_update(struct VP9_COMP *cpi, uint64_t bytes_used); // Post encode update of the rate control parameters for dropped frames void vp9_rc_postencode_update_drop_frame(struct VP9_COMP *cpi); @@ -175,6 +187,11 @@ int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget); int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type, int qindex, double rate_target_ratio); +void vp9_rc_update_framerate(struct VP9_COMP *cpi); + +void vp9_rc_set_gf_max_interval(const struct VP9_COMP *const cpi, + RATE_CONTROL *const rc); + #ifdef __cplusplus } // extern "C" #endif diff --git a/libvpx/vp9/encoder/vp9_rd.c b/libvpx/vp9/encoder/vp9_rd.c new file mode 100644 index 000000000..4fc3e9e08 --- /dev/null +++ b/libvpx/vp9/encoder/vp9_rd.c @@ -0,0 +1,578 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include <math.h> +#include <stdio.h> + +#include "./vp9_rtcd.h" + +#include "vpx_mem/vpx_mem.h" + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_mvref_common.h" +#include "vp9/common/vp9_pred_common.h" +#include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_seg_common.h" +#include "vp9/common/vp9_systemdependent.h" + +#include "vp9/encoder/vp9_cost.h" +#include "vp9/encoder/vp9_encodemb.h" +#include "vp9/encoder/vp9_encodemv.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_mcomp.h" +#include "vp9/encoder/vp9_quantize.h" +#include "vp9/encoder/vp9_ratectrl.h" +#include "vp9/encoder/vp9_rd.h" +#include "vp9/encoder/vp9_tokenize.h" +#include "vp9/encoder/vp9_variance.h" + +#define RD_THRESH_POW 1.25 +#define RD_MULT_EPB_RATIO 64 + +// Factor to weigh the rate for switchable interp filters. +#define SWITCHABLE_INTERP_RATE_FACTOR 1 + +// The baseline rd thresholds for breaking out of the rd loop for +// certain modes are assumed to be based on 8x8 blocks. +// This table is used to correct for block size. +// The factors here are << 2 (2 = x0.5, 32 = x8 etc). 
+static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES] = { + 2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32 +}; + +static void fill_mode_costs(VP9_COMP *cpi) { + const FRAME_CONTEXT *const fc = &cpi->common.fc; + int i, j; + + for (i = 0; i < INTRA_MODES; ++i) + for (j = 0; j < INTRA_MODES; ++j) + vp9_cost_tokens(cpi->y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j], + vp9_intra_mode_tree); + + vp9_cost_tokens(cpi->mbmode_cost, fc->y_mode_prob[1], vp9_intra_mode_tree); + vp9_cost_tokens(cpi->intra_uv_mode_cost[KEY_FRAME], + vp9_kf_uv_mode_prob[TM_PRED], vp9_intra_mode_tree); + vp9_cost_tokens(cpi->intra_uv_mode_cost[INTER_FRAME], + fc->uv_mode_prob[TM_PRED], vp9_intra_mode_tree); + + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) + vp9_cost_tokens(cpi->switchable_interp_costs[i], + fc->switchable_interp_prob[i], vp9_switchable_interp_tree); +} + +static void fill_token_costs(vp9_coeff_cost *c, + vp9_coeff_probs_model (*p)[PLANE_TYPES]) { + int i, j, k, l; + TX_SIZE t; + for (t = TX_4X4; t <= TX_32X32; ++t) + for (i = 0; i < PLANE_TYPES; ++i) + for (j = 0; j < REF_TYPES; ++j) + for (k = 0; k < COEF_BANDS; ++k) + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { + vp9_prob probs[ENTROPY_NODES]; + vp9_model_to_full_probs(p[t][i][j][k][l], probs); + vp9_cost_tokens((int *)c[t][i][j][k][0][l], probs, + vp9_coef_tree); + vp9_cost_tokens_skip((int *)c[t][i][j][k][1][l], probs, + vp9_coef_tree); + assert(c[t][i][j][k][0][l][EOB_TOKEN] == + c[t][i][j][k][1][l][EOB_TOKEN]); + } +} + +// Values are now correlated to quantizer. +static int sad_per_bit16lut[QINDEX_RANGE]; +static int sad_per_bit4lut[QINDEX_RANGE]; + +void vp9_init_me_luts() { + int i; + + // Initialize the sad lut tables using a formulaic calculation for now. + // This is to make it easier to resolve the impact of experimental changes + // to the quantizer tables. 
+ for (i = 0; i < QINDEX_RANGE; ++i) { + const double q = vp9_convert_qindex_to_q(i); + sad_per_bit16lut[i] = (int)(0.0418 * q + 2.4107); + sad_per_bit4lut[i] = (int)(0.063 * q + 2.742); + } +} + +static const int rd_boost_factor[16] = { + 64, 32, 32, 32, 24, 16, 12, 12, + 8, 8, 4, 4, 2, 2, 1, 0 +}; +static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { +128, 144, 128, 128, 144 +}; + +int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) { + const int q = vp9_dc_quant(qindex, 0); + int rdmult = 88 * q * q / 24; + + if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index]; + const int boost_index = MIN(15, (cpi->rc.gfu_boost / 100)); + + rdmult = (rdmult * rd_frame_type_factor[frame_type]) >> 7; + rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7); + } + return rdmult; +} + +static int compute_rd_thresh_factor(int qindex) { + // TODO(debargha): Adjust the function below. + const int q = (int)(pow(vp9_dc_quant(qindex, 0) / 4.0, RD_THRESH_POW) * 5.12); + return MAX(q, 8); +} + +void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) { + cpi->mb.sadperbit16 = sad_per_bit16lut[qindex]; + cpi->mb.sadperbit4 = sad_per_bit4lut[qindex]; +} + +static void set_block_thresholds(const VP9_COMMON *cm, RD_OPT *rd) { + int i, bsize, segment_id; + + for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) { + const int qindex = + clamp(vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex) + + cm->y_dc_delta_q, + 0, MAXQ); + const int q = compute_rd_thresh_factor(qindex); + + for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) { + // Threshold here seems unnecessarily harsh but fine given actual + // range of values used for cpi->sf.thresh_mult[]. 
+ const int t = q * rd_thresh_block_size_factor[bsize]; + const int thresh_max = INT_MAX / t; + + if (bsize >= BLOCK_8X8) { + for (i = 0; i < MAX_MODES; ++i) + rd->threshes[segment_id][bsize][i] = + rd->thresh_mult[i] < thresh_max + ? rd->thresh_mult[i] * t / 4 + : INT_MAX; + } else { + for (i = 0; i < MAX_REFS; ++i) + rd->threshes[segment_id][bsize][i] = + rd->thresh_mult_sub8x8[i] < thresh_max + ? rd->thresh_mult_sub8x8[i] * t / 4 + : INT_MAX; + } + } + } +} + +void vp9_initialize_rd_consts(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; + RD_OPT *const rd = &cpi->rd; + int i; + + vp9_clear_system_state(); + + rd->RDDIV = RDDIV_BITS; // In bits (to multiply D by 128). + rd->RDMULT = vp9_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q); + + x->errorperbit = rd->RDMULT / RD_MULT_EPB_RATIO; + x->errorperbit += (x->errorperbit == 0); + + x->select_tx_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL && + cm->frame_type != KEY_FRAME) ? 0 : 1; + + set_block_thresholds(cm, rd); + + if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME) { + fill_token_costs(x->token_costs, cm->fc.coef_probs); + + for (i = 0; i < PARTITION_CONTEXTS; ++i) + vp9_cost_tokens(cpi->partition_cost[i], get_partition_probs(cm, i), + vp9_partition_tree); + } + + if (!cpi->sf.use_nonrd_pick_mode || (cm->current_video_frame & 0x07) == 1 || + cm->frame_type == KEY_FRAME) { + fill_mode_costs(cpi); + + if (!frame_is_intra_only(cm)) { + vp9_build_nmv_cost_table(x->nmvjointcost, + cm->allow_high_precision_mv ? x->nmvcost_hp + : x->nmvcost, + &cm->fc.nmvc, cm->allow_high_precision_mv); + + for (i = 0; i < INTER_MODE_CONTEXTS; ++i) + vp9_cost_tokens((int *)cpi->inter_mode_cost[i], + cm->fc.inter_mode_probs[i], vp9_inter_mode_tree); + } + } +} + +static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) { + // NOTE: The tables below must be of the same size. 
+ + // The functions described below are sampled at the four most significant + // bits of x^2 + 8 / 256. + + // Normalized rate: + // This table models the rate for a Laplacian source with given variance + // when quantized with a uniform quantizer with given stepsize. The + // closed form expression is: + // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)], + // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance), + // and H(x) is the binary entropy function. + static const int rate_tab_q10[] = { + 65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, + 4553, 4389, 4255, 4142, 4044, 3958, 3881, 3811, + 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186, + 3133, 3037, 2952, 2877, 2809, 2747, 2690, 2638, + 2589, 2501, 2423, 2353, 2290, 2232, 2179, 2130, + 2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651, + 1608, 1530, 1460, 1398, 1342, 1290, 1243, 1199, + 1159, 1086, 1021, 963, 911, 864, 821, 781, + 745, 680, 623, 574, 530, 490, 455, 424, + 395, 345, 304, 269, 239, 213, 190, 171, + 154, 126, 104, 87, 73, 61, 52, 44, + 38, 28, 21, 16, 12, 10, 8, 6, + 5, 3, 2, 1, 1, 1, 0, 0, + }; + // Normalized distortion: + // This table models the normalized distortion for a Laplacian source + // with given variance when quantized with a uniform quantizer + // with given stepsize. The closed form expression is: + // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2)) + // where x = qpstep / sqrt(variance). + // Note the actual distortion is Dn * variance. 
+ static const int dist_tab_q10[] = { + 0, 0, 1, 1, 1, 2, 2, 2, + 3, 3, 4, 5, 5, 6, 7, 7, + 8, 9, 11, 12, 13, 15, 16, 17, + 18, 21, 24, 26, 29, 31, 34, 36, + 39, 44, 49, 54, 59, 64, 69, 73, + 78, 88, 97, 106, 115, 124, 133, 142, + 151, 167, 184, 200, 215, 231, 245, 260, + 274, 301, 327, 351, 375, 397, 418, 439, + 458, 495, 528, 559, 587, 613, 637, 659, + 680, 717, 749, 777, 801, 823, 842, 859, + 874, 899, 919, 936, 949, 960, 969, 977, + 983, 994, 1001, 1006, 1010, 1013, 1015, 1017, + 1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024, + }; + static const int xsq_iq_q10[] = { + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 40, 48, 56, 64, 72, 80, 88, + 96, 112, 128, 144, 160, 176, 192, 208, + 224, 256, 288, 320, 352, 384, 416, 448, + 480, 544, 608, 672, 736, 800, 864, 928, + 992, 1120, 1248, 1376, 1504, 1632, 1760, 1888, + 2016, 2272, 2528, 2784, 3040, 3296, 3552, 3808, + 4064, 4576, 5088, 5600, 6112, 6624, 7136, 7648, + 8160, 9184, 10208, 11232, 12256, 13280, 14304, 15328, + 16352, 18400, 20448, 22496, 24544, 26592, 28640, 30688, + 32736, 36832, 40928, 45024, 49120, 53216, 57312, 61408, + 65504, 73696, 81888, 90080, 98272, 106464, 114656, 122848, + 131040, 147424, 163808, 180192, 196576, 212960, 229344, 245728, + }; + const int tmp = (xsq_q10 >> 2) + 8; + const int k = get_msb(tmp) - 3; + const int xq = (k << 3) + ((tmp >> k) & 0x7); + const int one_q10 = 1 << 10; + const int a_q10 = ((xsq_q10 - xsq_iq_q10[xq]) << 10) >> (2 + k); + const int b_q10 = one_q10 - a_q10; + *r_q10 = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10; + *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10; +} + +void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n, + unsigned int qstep, int *rate, + int64_t *dist) { + // This function models the rate and distortion for a Laplacian + // source with given variance when quantized with a uniform quantizer + // with given stepsize. 
The closed form expressions are in: + // Hang and Chen, "Source Model for transform video coder and its + // application - Part I: Fundamental Theory", IEEE Trans. Circ. + // Sys. for Video Tech., April 1997. + if (var == 0) { + *rate = 0; + *dist = 0; + } else { + int d_q10, r_q10; + static const uint32_t MAX_XSQ_Q10 = 245727; + const uint64_t xsq_q10_64 = + ((((uint64_t)qstep * qstep * n) << 10) + (var >> 1)) / var; + const int xsq_q10 = (int)MIN(xsq_q10_64, MAX_XSQ_Q10); + model_rd_norm(xsq_q10, &r_q10, &d_q10); + *rate = (n * r_q10 + 2) >> 2; + *dist = (var * (int64_t)d_q10 + 512) >> 10; + } +} + +void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, + const struct macroblockd_plane *pd, + ENTROPY_CONTEXT t_above[16], + ENTROPY_CONTEXT t_left[16]) { + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const ENTROPY_CONTEXT *const above = pd->above_context; + const ENTROPY_CONTEXT *const left = pd->left_context; + + int i; + switch (tx_size) { + case TX_4X4: + vpx_memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); + vpx_memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h); + break; + case TX_8X8: + for (i = 0; i < num_4x4_w; i += 2) + t_above[i] = !!*(const uint16_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 2) + t_left[i] = !!*(const uint16_t *)&left[i]; + break; + case TX_16X16: + for (i = 0; i < num_4x4_w; i += 4) + t_above[i] = !!*(const uint32_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 4) + t_left[i] = !!*(const uint32_t *)&left[i]; + break; + case TX_32X32: + for (i = 0; i < num_4x4_w; i += 8) + t_above[i] = !!*(const uint64_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 8) + t_left[i] = !!*(const uint64_t *)&left[i]; + break; + default: + assert(0 && "Invalid transform size."); + break; + } +} + +void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x, + uint8_t *ref_y_buffer, int 
ref_y_stride, + int ref_frame, BLOCK_SIZE block_size) { + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + int_mv this_mv; + int i; + int zero_seen = 0; + int best_index = 0; + int best_sad = INT_MAX; + int this_sad = INT_MAX; + int max_mv = 0; + + uint8_t *src_y_ptr = x->plane[0].src.buf; + uint8_t *ref_y_ptr; + int row_offset, col_offset; + int num_mv_refs = MAX_MV_REF_CANDIDATES + + (cpi->sf.adaptive_motion_search && + cpi->common.show_frame && + block_size < cpi->sf.max_partition_size); + + MV pred_mv[3]; + pred_mv[0] = mbmi->ref_mvs[ref_frame][0].as_mv; + pred_mv[1] = mbmi->ref_mvs[ref_frame][1].as_mv; + pred_mv[2] = x->pred_mv[ref_frame]; + + // Get the sad for each candidate reference mv. + for (i = 0; i < num_mv_refs; ++i) { + this_mv.as_mv = pred_mv[i]; + + max_mv = MAX(max_mv, + MAX(abs(this_mv.as_mv.row), abs(this_mv.as_mv.col)) >> 3); + // Only need to check zero mv once. + if (!this_mv.as_int && zero_seen) + continue; + + zero_seen = zero_seen || !this_mv.as_int; + + row_offset = this_mv.as_mv.row >> 3; + col_offset = this_mv.as_mv.col >> 3; + ref_y_ptr = ref_y_buffer + (ref_y_stride * row_offset) + col_offset; + + // Find sad for current vector. + this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride, + ref_y_ptr, ref_y_stride); + + // Note if it is the best so far. + if (this_sad < best_sad) { + best_sad = this_sad; + best_index = i; + } + } + + // Note the index of the mv that worked best in the reference list. 
+ x->mv_best_ref_index[ref_frame] = best_index; + x->max_mv_context[ref_frame] = max_mv; + x->pred_mv_sad[ref_frame] = best_sad; +} + +void vp9_setup_pred_block(const MACROBLOCKD *xd, + struct buf_2d dst[MAX_MB_PLANE], + const YV12_BUFFER_CONFIG *src, + int mi_row, int mi_col, + const struct scale_factors *scale, + const struct scale_factors *scale_uv) { + int i; + + dst[0].buf = src->y_buffer; + dst[0].stride = src->y_stride; + dst[1].buf = src->u_buffer; + dst[2].buf = src->v_buffer; + dst[1].stride = dst[2].stride = src->uv_stride; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + setup_pred_plane(dst + i, dst[i].buf, dst[i].stride, mi_row, mi_col, + i ? scale_uv : scale, + xd->plane[i].subsampling_x, xd->plane[i].subsampling_y); + } +} + +const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi, + int ref_frame) { + const VP9_COMMON *const cm = &cpi->common; + const int ref_idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)]; + const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1]; + return (scaled_idx != ref_idx) ? &cm->frame_bufs[scaled_idx].buf : NULL; +} + +int vp9_get_switchable_rate(const VP9_COMP *cpi) { + const MACROBLOCKD *const xd = &cpi->mb.e_mbd; + const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const int ctx = vp9_get_pred_context_switchable_interp(xd); + return SWITCHABLE_INTERP_RATE_FACTOR * + cpi->switchable_interp_costs[ctx][mbmi->interp_filter]; +} + +void vp9_set_rd_speed_thresholds(VP9_COMP *cpi) { + int i; + RD_OPT *const rd = &cpi->rd; + SPEED_FEATURES *const sf = &cpi->sf; + + // Set baseline threshold values. + for (i = 0; i < MAX_MODES; ++i) + rd->thresh_mult[i] = is_best_mode(cpi->oxcf.mode) ? 
-500 : 0; + + rd->thresh_mult[THR_NEARESTMV] = 0; + rd->thresh_mult[THR_NEARESTG] = 0; + rd->thresh_mult[THR_NEARESTA] = 0; + + rd->thresh_mult[THR_DC] += 1000; + + rd->thresh_mult[THR_NEWMV] += 1000; + rd->thresh_mult[THR_NEWA] += 1000; + rd->thresh_mult[THR_NEWG] += 1000; + + // Adjust threshold only in real time mode, which only uses last + // reference frame. + rd->thresh_mult[THR_NEWMV] += sf->elevate_newmv_thresh; + + rd->thresh_mult[THR_NEARMV] += 1000; + rd->thresh_mult[THR_NEARA] += 1000; + rd->thresh_mult[THR_COMP_NEARESTLA] += 1000; + rd->thresh_mult[THR_COMP_NEARESTGA] += 1000; + + rd->thresh_mult[THR_TM] += 1000; + + rd->thresh_mult[THR_COMP_NEARLA] += 1500; + rd->thresh_mult[THR_COMP_NEWLA] += 2000; + rd->thresh_mult[THR_NEARG] += 1000; + rd->thresh_mult[THR_COMP_NEARGA] += 1500; + rd->thresh_mult[THR_COMP_NEWGA] += 2000; + + rd->thresh_mult[THR_ZEROMV] += 2000; + rd->thresh_mult[THR_ZEROG] += 2000; + rd->thresh_mult[THR_ZEROA] += 2000; + rd->thresh_mult[THR_COMP_ZEROLA] += 2500; + rd->thresh_mult[THR_COMP_ZEROGA] += 2500; + + rd->thresh_mult[THR_H_PRED] += 2000; + rd->thresh_mult[THR_V_PRED] += 2000; + rd->thresh_mult[THR_D45_PRED ] += 2500; + rd->thresh_mult[THR_D135_PRED] += 2500; + rd->thresh_mult[THR_D117_PRED] += 2500; + rd->thresh_mult[THR_D153_PRED] += 2500; + rd->thresh_mult[THR_D207_PRED] += 2500; + rd->thresh_mult[THR_D63_PRED] += 2500; + + // Disable frame modes if flags not set. 
+ if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) { + rd->thresh_mult[THR_NEWMV ] = INT_MAX; + rd->thresh_mult[THR_NEARESTMV] = INT_MAX; + rd->thresh_mult[THR_ZEROMV ] = INT_MAX; + rd->thresh_mult[THR_NEARMV ] = INT_MAX; + } + if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) { + rd->thresh_mult[THR_NEARESTG ] = INT_MAX; + rd->thresh_mult[THR_ZEROG ] = INT_MAX; + rd->thresh_mult[THR_NEARG ] = INT_MAX; + rd->thresh_mult[THR_NEWG ] = INT_MAX; + } + if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) { + rd->thresh_mult[THR_NEARESTA ] = INT_MAX; + rd->thresh_mult[THR_ZEROA ] = INT_MAX; + rd->thresh_mult[THR_NEARA ] = INT_MAX; + rd->thresh_mult[THR_NEWA ] = INT_MAX; + } + + if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) != + (VP9_LAST_FLAG | VP9_ALT_FLAG)) { + rd->thresh_mult[THR_COMP_ZEROLA ] = INT_MAX; + rd->thresh_mult[THR_COMP_NEARESTLA] = INT_MAX; + rd->thresh_mult[THR_COMP_NEARLA ] = INT_MAX; + rd->thresh_mult[THR_COMP_NEWLA ] = INT_MAX; + } + if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) != + (VP9_GOLD_FLAG | VP9_ALT_FLAG)) { + rd->thresh_mult[THR_COMP_ZEROGA ] = INT_MAX; + rd->thresh_mult[THR_COMP_NEARESTGA] = INT_MAX; + rd->thresh_mult[THR_COMP_NEARGA ] = INT_MAX; + rd->thresh_mult[THR_COMP_NEWGA ] = INT_MAX; + } +} + +void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) { + const SPEED_FEATURES *const sf = &cpi->sf; + RD_OPT *const rd = &cpi->rd; + int i; + + for (i = 0; i < MAX_REFS; ++i) + rd->thresh_mult_sub8x8[i] = is_best_mode(cpi->oxcf.mode) ? -500 : 0; + + rd->thresh_mult_sub8x8[THR_LAST] += 2500; + rd->thresh_mult_sub8x8[THR_GOLD] += 2500; + rd->thresh_mult_sub8x8[THR_ALTR] += 2500; + rd->thresh_mult_sub8x8[THR_INTRA] += 2500; + rd->thresh_mult_sub8x8[THR_COMP_LA] += 4500; + rd->thresh_mult_sub8x8[THR_COMP_GA] += 4500; + + // Check for masked out split cases. + for (i = 0; i < MAX_REFS; ++i) + if (sf->disable_split_mask & (1 << i)) + rd->thresh_mult_sub8x8[i] = INT_MAX; + + // Disable mode test if frame flag is not set. 
+ if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) + rd->thresh_mult_sub8x8[THR_LAST] = INT_MAX; + if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) + rd->thresh_mult_sub8x8[THR_GOLD] = INT_MAX; + if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) + rd->thresh_mult_sub8x8[THR_ALTR] = INT_MAX; + if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) != + (VP9_LAST_FLAG | VP9_ALT_FLAG)) + rd->thresh_mult_sub8x8[THR_COMP_LA] = INT_MAX; + if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) != + (VP9_GOLD_FLAG | VP9_ALT_FLAG)) + rd->thresh_mult_sub8x8[THR_COMP_GA] = INT_MAX; +} diff --git a/libvpx/vp9/encoder/vp9_rd.h b/libvpx/vp9/encoder/vp9_rd.h new file mode 100644 index 000000000..eeb5e0f84 --- /dev/null +++ b/libvpx/vp9/encoder/vp9_rd.h @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_ENCODER_VP9_RD_H_ +#define VP9_ENCODER_VP9_RD_H_ + +#include <limits.h> + +#include "vp9/common/vp9_blockd.h" + +#include "vp9/encoder/vp9_block.h" +#include "vp9/encoder/vp9_context_tree.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define RDDIV_BITS 7 + +#define RDCOST(RM, DM, R, D) \ + (((128 + ((int64_t)R) * (RM)) >> 8) + (D << DM)) +#define QIDX_SKIP_THRESH 115 + +#define MV_COST_WEIGHT 108 +#define MV_COST_WEIGHT_SUB 120 + +#define INVALID_MV 0x80008000 + +#define MAX_MODES 30 +#define MAX_REFS 6 + +// This enumerator type needs to be kept aligned with the mode order in +// const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code. 
+typedef enum { + THR_NEARESTMV, + THR_NEARESTA, + THR_NEARESTG, + + THR_DC, + + THR_NEWMV, + THR_NEWA, + THR_NEWG, + + THR_NEARMV, + THR_NEARA, + THR_COMP_NEARESTLA, + THR_COMP_NEARESTGA, + + THR_TM, + + THR_COMP_NEARLA, + THR_COMP_NEWLA, + THR_NEARG, + THR_COMP_NEARGA, + THR_COMP_NEWGA, + + THR_ZEROMV, + THR_ZEROG, + THR_ZEROA, + THR_COMP_ZEROLA, + THR_COMP_ZEROGA, + + THR_H_PRED, + THR_V_PRED, + THR_D135_PRED, + THR_D207_PRED, + THR_D153_PRED, + THR_D63_PRED, + THR_D117_PRED, + THR_D45_PRED, +} THR_MODES; + +typedef enum { + THR_LAST, + THR_GOLD, + THR_ALTR, + THR_COMP_LA, + THR_COMP_GA, + THR_INTRA, +} THR_MODES_SUB8X8; + +typedef struct RD_OPT { + // Thresh_mult is used to set a threshold for the rd score. A higher value + // means that we will accept the best mode so far more often. This number + // is used in combination with the current block size, and thresh_freq_fact + // to pick a threshold. + int thresh_mult[MAX_MODES]; + int thresh_mult_sub8x8[MAX_REFS]; + + int threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES]; + int thresh_freq_fact[BLOCK_SIZES][MAX_MODES]; + + int64_t comp_pred_diff[REFERENCE_MODES]; + int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES]; + int64_t tx_select_diff[TX_MODES]; + // TODO(agrange): can this overflow? 
+ int tx_select_threshes[MAX_REF_FRAMES][TX_MODES]; + + int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS]; + int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS]; + int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS]; + int64_t mask_filter; + + int RDMULT; + int RDDIV; +} RD_OPT; + +struct TileInfo; +struct VP9_COMP; +struct macroblock; + +int vp9_compute_rd_mult(const struct VP9_COMP *cpi, int qindex); + +void vp9_initialize_rd_consts(struct VP9_COMP *cpi); + +void vp9_initialize_me_consts(struct VP9_COMP *cpi, int qindex); + +void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n, + unsigned int qstep, int *rate, + int64_t *dist); + +int vp9_get_switchable_rate(const struct VP9_COMP *cpi); + +const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const struct VP9_COMP *cpi, + int ref_frame); + +void vp9_init_me_luts(); + +void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, + const struct macroblockd_plane *pd, + ENTROPY_CONTEXT t_above[16], + ENTROPY_CONTEXT t_left[16]); + +void vp9_set_rd_speed_thresholds(struct VP9_COMP *cpi); + +void vp9_set_rd_speed_thresholds_sub8x8(struct VP9_COMP *cpi); + +static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh, + int thresh_fact) { + return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX; +} + +void vp9_mv_pred(struct VP9_COMP *cpi, MACROBLOCK *x, + uint8_t *ref_y_buffer, int ref_y_stride, + int ref_frame, BLOCK_SIZE block_size); + +void vp9_setup_pred_block(const MACROBLOCKD *xd, + struct buf_2d dst[MAX_MB_PLANE], + const YV12_BUFFER_CONFIG *src, + int mi_row, int mi_col, + const struct scale_factors *scale, + const struct scale_factors *scale_uv); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_ENCODER_VP9_RD_H_ diff --git a/libvpx/vp9/encoder/vp9_rdopt.c b/libvpx/vp9/encoder/vp9_rdopt.c index dcd28525a..e368037a6 100644 --- a/libvpx/vp9/encoder/vp9_rdopt.c +++ b/libvpx/vp9/encoder/vp9_rdopt.c @@ -9,9 +9,7 @@ */ #include <assert.h> -#include 
<limits.h> #include <math.h> -#include <stdio.h> #include "./vp9_rtcd.h" @@ -22,7 +20,6 @@ #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_idct.h" #include "vp9/common/vp9_mvref_common.h" -#include "vp9/common/vp9_pragmas.h" #include "vp9/common/vp9_pred_common.h" #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_reconinter.h" @@ -33,21 +30,16 @@ #include "vp9/encoder/vp9_cost.h" #include "vp9/encoder/vp9_encodemb.h" #include "vp9/encoder/vp9_encodemv.h" +#include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_mcomp.h" -#include "vp9/encoder/vp9_onyx_int.h" #include "vp9/encoder/vp9_quantize.h" #include "vp9/encoder/vp9_ratectrl.h" +#include "vp9/encoder/vp9_rd.h" #include "vp9/encoder/vp9_rdopt.h" -#include "vp9/encoder/vp9_tokenize.h" #include "vp9/encoder/vp9_variance.h" #define RD_THRESH_MAX_FACT 64 #define RD_THRESH_INC 1 -#define RD_THRESH_POW 1.25 -#define RD_MULT_EPB_RATIO 64 - -/* Factor to weigh the rate for switchable interp filters */ -#define SWITCHABLE_INTERP_RATE_FACTOR 1 #define LAST_FRAME_MODE_MASK 0xFFEDCD60 #define GOLDEN_FRAME_MODE_MASK 0xFFDA3BB0 @@ -56,7 +48,7 @@ #define MIN_EARLY_TERM_INDEX 3 typedef struct { - MB_PREDICTION_MODE mode; + PREDICTION_MODE mode; MV_REFERENCE_FRAME ref_frame[2]; } MODE_DEFINITION; @@ -81,7 +73,7 @@ struct rdcost_block_args { const scan_order *so; }; -const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { +static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { {NEARESTMV, {LAST_FRAME, NONE}}, {NEARESTMV, {ALTREF_FRAME, NONE}}, {NEARESTMV, {GOLDEN_FRAME, NONE}}, @@ -121,7 +113,7 @@ const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { {D45_PRED, {INTRA_FRAME, NONE}}, }; -const REF_DEFINITION vp9_ref_order[MAX_REFS] = { +static const REF_DEFINITION vp9_ref_order[MAX_REFS] = { {{LAST_FRAME, NONE}}, {{GOLDEN_FRAME, NONE}}, {{ALTREF_FRAME, NONE}}, @@ -130,13 +122,6 @@ const REF_DEFINITION vp9_ref_order[MAX_REFS] = { {{INTRA_FRAME, NONE}}, }; -// The baseline rd thresholds for 
breaking out of the rd loop for -// certain modes are assumed to be based on 8x8 blocks. -// This table is used to correct for blocks size. -// The factors here are << 2 (2 = x0.5, 32 = x8 etc). -static int rd_thresh_block_size_factor[BLOCK_SIZES] = - {2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32}; - static int raster_block_offset(BLOCK_SIZE plane_bsize, int raster_block, int stride) { const int bw = b_width_log2(plane_bsize); @@ -150,276 +135,28 @@ static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize, return base + raster_block_offset(plane_bsize, raster_block, stride); } -static void fill_mode_costs(VP9_COMP *cpi) { - MACROBLOCK *const x = &cpi->mb; - const FRAME_CONTEXT *const fc = &cpi->common.fc; - int i, j; - - for (i = 0; i < INTRA_MODES; i++) - for (j = 0; j < INTRA_MODES; j++) - vp9_cost_tokens((int *)x->y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j], - vp9_intra_mode_tree); - - // TODO(rbultje) separate tables for superblock costing? - vp9_cost_tokens(x->mbmode_cost, fc->y_mode_prob[1], vp9_intra_mode_tree); - vp9_cost_tokens(x->intra_uv_mode_cost[KEY_FRAME], - vp9_kf_uv_mode_prob[TM_PRED], vp9_intra_mode_tree); - vp9_cost_tokens(x->intra_uv_mode_cost[INTER_FRAME], - fc->uv_mode_prob[TM_PRED], vp9_intra_mode_tree); - - for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) - vp9_cost_tokens((int *)x->switchable_interp_costs[i], - fc->switchable_interp_prob[i], vp9_switchable_interp_tree); -} - -static void fill_token_costs(vp9_coeff_cost *c, - vp9_coeff_probs_model (*p)[PLANE_TYPES]) { - int i, j, k, l; - TX_SIZE t; - for (t = TX_4X4; t <= TX_32X32; ++t) - for (i = 0; i < PLANE_TYPES; ++i) - for (j = 0; j < REF_TYPES; ++j) - for (k = 0; k < COEF_BANDS; ++k) - for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { - vp9_prob probs[ENTROPY_NODES]; - vp9_model_to_full_probs(p[t][i][j][k][l], probs); - vp9_cost_tokens((int *)c[t][i][j][k][0][l], probs, - vp9_coef_tree); - vp9_cost_tokens_skip((int *)c[t][i][j][k][1][l], probs, - vp9_coef_tree); - 
assert(c[t][i][j][k][0][l][EOB_TOKEN] == - c[t][i][j][k][1][l][EOB_TOKEN]); - } -} - -static const int rd_iifactor[32] = { - 4, 4, 3, 2, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, -}; - -// 3* dc_qlookup[Q]*dc_qlookup[Q]; - -/* values are now correlated to quantizer */ -static int sad_per_bit16lut[QINDEX_RANGE]; -static int sad_per_bit4lut[QINDEX_RANGE]; - -void vp9_init_me_luts() { - int i; - - // Initialize the sad lut tables using a formulaic calculation for now - // This is to make it easier to resolve the impact of experimental changes - // to the quantizer tables. - for (i = 0; i < QINDEX_RANGE; i++) { - const double q = vp9_convert_qindex_to_q(i); - sad_per_bit16lut[i] = (int)(0.0418 * q + 2.4107); - sad_per_bit4lut[i] = (int)(0.063 * q + 2.742); - } -} - -int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) { - const int q = vp9_dc_quant(qindex, 0); - // TODO(debargha): Adjust the function below - int rdmult = 88 * q * q / 25; - if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) { - if (cpi->twopass.next_iiratio > 31) - rdmult += (rdmult * rd_iifactor[31]) >> 4; - else - rdmult += (rdmult * rd_iifactor[cpi->twopass.next_iiratio]) >> 4; - } - return rdmult; -} - -static int compute_rd_thresh_factor(int qindex) { - // TODO(debargha): Adjust the function below - const int q = (int)(pow(vp9_dc_quant(qindex, 0) / 4.0, RD_THRESH_POW) * 5.12); - return MAX(q, 8); -} - -void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) { - cpi->mb.sadperbit16 = sad_per_bit16lut[qindex]; - cpi->mb.sadperbit4 = sad_per_bit4lut[qindex]; -} - -static void set_block_thresholds(VP9_COMP *cpi) { - const VP9_COMMON *const cm = &cpi->common; - int i, bsize, segment_id; - - for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) { - const int qindex = clamp(vp9_get_qindex(&cm->seg, segment_id, - cm->base_qindex) + cm->y_dc_delta_q, - 0, MAXQ); - const int q = compute_rd_thresh_factor(qindex); - - for (bsize = 0; bsize 
< BLOCK_SIZES; ++bsize) { - // Threshold here seems unnecessarily harsh but fine given actual - // range of values used for cpi->sf.thresh_mult[]. - const int t = q * rd_thresh_block_size_factor[bsize]; - const int thresh_max = INT_MAX / t; - - for (i = 0; i < MAX_MODES; ++i) - cpi->rd_threshes[segment_id][bsize][i] = - cpi->rd_thresh_mult[i] < thresh_max ? cpi->rd_thresh_mult[i] * t / 4 - : INT_MAX; - - for (i = 0; i < MAX_REFS; ++i) { - cpi->rd_thresh_sub8x8[segment_id][bsize][i] = - cpi->rd_thresh_mult_sub8x8[i] < thresh_max - ? cpi->rd_thresh_mult_sub8x8[i] * t / 4 - : INT_MAX; - } - } - } -} - -void vp9_initialize_rd_consts(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; +static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, + int m, int n, int min_plane, int max_plane) { int i; - vp9_clear_system_state(); - - cpi->RDDIV = RDDIV_BITS; // in bits (to multiply D by 128) - cpi->RDMULT = vp9_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q); - - x->errorperbit = cpi->RDMULT / RD_MULT_EPB_RATIO; - x->errorperbit += (x->errorperbit == 0); - - x->select_txfm_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL && - cm->frame_type != KEY_FRAME) ? 0 : 1; - - set_block_thresholds(cpi); - - if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME) { - fill_token_costs(x->token_costs, cm->fc.coef_probs); - - for (i = 0; i < PARTITION_CONTEXTS; i++) - vp9_cost_tokens(x->partition_cost[i], get_partition_probs(cm, i), - vp9_partition_tree); - } - - if (!cpi->sf.use_nonrd_pick_mode || (cm->current_video_frame & 0x07) == 1 || - cm->frame_type == KEY_FRAME) { - fill_mode_costs(cpi); - - if (!frame_is_intra_only(cm)) { - vp9_build_nmv_cost_table(x->nmvjointcost, - cm->allow_high_precision_mv ? 
x->nmvcost_hp - : x->nmvcost, - &cm->fc.nmvc, cm->allow_high_precision_mv); + for (i = min_plane; i < max_plane; ++i) { + struct macroblock_plane *const p = &x->plane[i]; + struct macroblockd_plane *const pd = &x->e_mbd.plane[i]; - for (i = 0; i < INTER_MODE_CONTEXTS; ++i) - vp9_cost_tokens((int *)x->inter_mode_cost[i], - cm->fc.inter_mode_probs[i], vp9_inter_mode_tree); - } - } -} + p->coeff = ctx->coeff_pbuf[i][m]; + p->qcoeff = ctx->qcoeff_pbuf[i][m]; + pd->dqcoeff = ctx->dqcoeff_pbuf[i][m]; + p->eobs = ctx->eobs_pbuf[i][m]; -static const int MAX_XSQ_Q10 = 245727; - -static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) { - // NOTE: The tables below must be of the same size - - // The functions described below are sampled at the four most significant - // bits of x^2 + 8 / 256 - - // Normalized rate - // This table models the rate for a Laplacian source - // source with given variance when quantized with a uniform quantizer - // with given stepsize. The closed form expression is: - // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)], - // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance), - // and H(x) is the binary entropy function. - static const int rate_tab_q10[] = { - 65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, - 4553, 4389, 4255, 4142, 4044, 3958, 3881, 3811, - 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186, - 3133, 3037, 2952, 2877, 2809, 2747, 2690, 2638, - 2589, 2501, 2423, 2353, 2290, 2232, 2179, 2130, - 2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651, - 1608, 1530, 1460, 1398, 1342, 1290, 1243, 1199, - 1159, 1086, 1021, 963, 911, 864, 821, 781, - 745, 680, 623, 574, 530, 490, 455, 424, - 395, 345, 304, 269, 239, 213, 190, 171, - 154, 126, 104, 87, 73, 61, 52, 44, - 38, 28, 21, 16, 12, 10, 8, 6, - 5, 3, 2, 1, 1, 1, 0, 0, - }; - // Normalized distortion - // This table models the normalized distortion for a Laplacian source - // source with given variance when quantized with a uniform quantizer - // with given stepsize. 
The closed form expression is: - // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2)) - // where x = qpstep / sqrt(variance) - // Note the actual distortion is Dn * variance. - static const int dist_tab_q10[] = { - 0, 0, 1, 1, 1, 2, 2, 2, - 3, 3, 4, 5, 5, 6, 7, 7, - 8, 9, 11, 12, 13, 15, 16, 17, - 18, 21, 24, 26, 29, 31, 34, 36, - 39, 44, 49, 54, 59, 64, 69, 73, - 78, 88, 97, 106, 115, 124, 133, 142, - 151, 167, 184, 200, 215, 231, 245, 260, - 274, 301, 327, 351, 375, 397, 418, 439, - 458, 495, 528, 559, 587, 613, 637, 659, - 680, 717, 749, 777, 801, 823, 842, 859, - 874, 899, 919, 936, 949, 960, 969, 977, - 983, 994, 1001, 1006, 1010, 1013, 1015, 1017, - 1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024, - }; - static const int xsq_iq_q10[] = { - 0, 4, 8, 12, 16, 20, 24, 28, - 32, 40, 48, 56, 64, 72, 80, 88, - 96, 112, 128, 144, 160, 176, 192, 208, - 224, 256, 288, 320, 352, 384, 416, 448, - 480, 544, 608, 672, 736, 800, 864, 928, - 992, 1120, 1248, 1376, 1504, 1632, 1760, 1888, - 2016, 2272, 2528, 2784, 3040, 3296, 3552, 3808, - 4064, 4576, 5088, 5600, 6112, 6624, 7136, 7648, - 8160, 9184, 10208, 11232, 12256, 13280, 14304, 15328, - 16352, 18400, 20448, 22496, 24544, 26592, 28640, 30688, - 32736, 36832, 40928, 45024, 49120, 53216, 57312, 61408, - 65504, 73696, 81888, 90080, 98272, 106464, 114656, 122848, - 131040, 147424, 163808, 180192, 196576, 212960, 229344, 245728, - }; - /* - static const int tab_size = sizeof(rate_tab_q10) / sizeof(rate_tab_q10[0]); - assert(sizeof(dist_tab_q10) / sizeof(dist_tab_q10[0]) == tab_size); - assert(sizeof(xsq_iq_q10) / sizeof(xsq_iq_q10[0]) == tab_size); - assert(MAX_XSQ_Q10 + 1 == xsq_iq_q10[tab_size - 1]); - */ - int tmp = (xsq_q10 >> 2) + 8; - int k = get_msb(tmp) - 3; - int xq = (k << 3) + ((tmp >> k) & 0x7); - const int one_q10 = 1 << 10; - const int a_q10 = ((xsq_q10 - xsq_iq_q10[xq]) << 10) >> (2 + k); - const int b_q10 = one_q10 - a_q10; - *r_q10 = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10; - *d_q10 = 
(dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10; -} + ctx->coeff_pbuf[i][m] = ctx->coeff_pbuf[i][n]; + ctx->qcoeff_pbuf[i][m] = ctx->qcoeff_pbuf[i][n]; + ctx->dqcoeff_pbuf[i][m] = ctx->dqcoeff_pbuf[i][n]; + ctx->eobs_pbuf[i][m] = ctx->eobs_pbuf[i][n]; -void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n, - unsigned int qstep, int *rate, - int64_t *dist) { - // This function models the rate and distortion for a Laplacian - // source with given variance when quantized with a uniform quantizer - // with given stepsize. The closed form expressions are in: - // Hang and Chen, "Source Model for transform video coder and its - // application - Part I: Fundamental Theory", IEEE Trans. Circ. - // Sys. for Video Tech., April 1997. - if (var == 0) { - *rate = 0; - *dist = 0; - } else { - int d_q10, r_q10; - const uint64_t xsq_q10_64 = - ((((uint64_t)qstep * qstep * n) << 10) + (var >> 1)) / var; - const int xsq_q10 = xsq_q10_64 > MAX_XSQ_Q10 ? - MAX_XSQ_Q10 : (int)xsq_q10_64; - model_rd_norm(xsq_q10, &r_q10, &d_q10); - *rate = (n * r_q10 + 2) >> 2; - *dist = (var * (int64_t)d_q10 + 512) >> 10; + ctx->coeff_pbuf[i][n] = p->coeff; + ctx->qcoeff_pbuf[i][n] = p->qcoeff; + ctx->dqcoeff_pbuf[i][n] = pd->dqcoeff; + ctx->eobs_pbuf[i][n] = p->eobs; } } @@ -434,20 +171,32 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, int64_t dist_sum = 0; const int ref = xd->mi[0]->mbmi.ref_frame[0]; unsigned int sse; + const int shift = 8; for (i = 0; i < MAX_MB_PLANE; ++i) { struct macroblock_plane *const p = &x->plane[i]; struct macroblockd_plane *const pd = &xd->plane[i]; const BLOCK_SIZE bs = get_plane_block_size(bsize, pd); - (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, - pd->dst.buf, pd->dst.stride, &sse); + const unsigned int var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride, + &sse); + + if (!x->select_tx_size) { + if (sse < p->quant_thred[0] >> shift) + x->skip_txfm[i] = 1; + else if (var < 
p->quant_thred[1] >> shift) + x->skip_txfm[i] = 2; + else + x->skip_txfm[i] = 0; + } + x->bsse[i] = sse; if (i == 0) x->pred_sse[ref] = sse; // Fast approximate the modelling function. - if (cpi->speed > 4) { + if (cpi->oxcf.speed > 4) { int64_t rate; int64_t dist; int64_t square_error = sse; @@ -474,55 +223,6 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, *out_dist_sum = dist_sum << 4; } -static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE bsize, - TX_SIZE tx_size, - MACROBLOCK *x, MACROBLOCKD *xd, - int *out_rate_sum, int64_t *out_dist_sum, - int *out_skip) { - int j, k; - BLOCK_SIZE bs; - const struct macroblock_plane *const p = &x->plane[0]; - const struct macroblockd_plane *const pd = &xd->plane[0]; - const int width = 4 * num_4x4_blocks_wide_lookup[bsize]; - const int height = 4 * num_4x4_blocks_high_lookup[bsize]; - int rate_sum = 0; - int64_t dist_sum = 0; - const int t = 4 << tx_size; - - if (tx_size == TX_4X4) { - bs = BLOCK_4X4; - } else if (tx_size == TX_8X8) { - bs = BLOCK_8X8; - } else if (tx_size == TX_16X16) { - bs = BLOCK_16X16; - } else if (tx_size == TX_32X32) { - bs = BLOCK_32X32; - } else { - assert(0); - } - - *out_skip = 1; - for (j = 0; j < height; j += t) { - for (k = 0; k < width; k += t) { - int rate; - int64_t dist; - unsigned int sse; - cpi->fn_ptr[bs].vf(&p->src.buf[j * p->src.stride + k], p->src.stride, - &pd->dst.buf[j * pd->dst.stride + k], pd->dst.stride, - &sse); - // sse works better than var, since there is no dc prediction used - vp9_model_rd_from_var_lapndz(sse, t * t, pd->dequant[1] >> 3, - &rate, &dist); - rate_sum += rate; - dist_sum += dist; - *out_skip &= (rate < 1024); - } - } - - *out_rate_sum = rate_sum; - *out_dist_sum = dist_sum << 4; -} - int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz) { int i; @@ -570,7 +270,7 @@ static INLINE int cost_coeffs(MACROBLOCK *x, int c, cost; // Check for consistency of tx_size with mode info assert(type 
== PLANE_TYPE_Y ? mbmi->tx_size == tx_size - : get_uv_tx_size(mbmi) == tx_size); + : get_uv_tx_size(mbmi, pd) == tx_size); if (eob == 0) { // single eob token @@ -669,12 +369,32 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, if (args->skip) return; - if (!is_inter_block(mbmi)) + if (!is_inter_block(mbmi)) { vp9_encode_block_intra(x, plane, block, plane_bsize, tx_size, &mbmi->skip); - else - vp9_xform_quant(x, plane, block, plane_bsize, tx_size); + dist_block(plane, block, tx_size, args); + } else { + if (x->skip_txfm[plane] == 0) { + // full forward transform and quantization + vp9_xform_quant(x, plane, block, plane_bsize, tx_size); + dist_block(plane, block, tx_size, args); + } else if (x->skip_txfm[plane] == 2) { + // compute DC coefficient + int16_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block); + int16_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block); + vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size); + args->sse = x->bsse[plane] << 4; + args->dist = args->sse; + if (!x->plane[plane].eobs[block]) + args->dist = args->sse - ((coeff[0] * coeff[0] - + (coeff[0] - dqcoeff[0]) * (coeff[0] - dqcoeff[0])) >> 2); + } else { + // skip forward transform + x->plane[plane].eobs[block] = 0; + args->sse = x->bsse[plane] << 4; + args->dist = args->sse; + } + } - dist_block(plane, block, tx_size, args); rate_block(plane, block, plane_bsize, tx_size, args); rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist); rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse); @@ -696,45 +416,6 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, } } -void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, - const struct macroblockd_plane *pd, - ENTROPY_CONTEXT t_above[16], - ENTROPY_CONTEXT t_left[16]) { - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); - const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; - const int num_4x4_h = 
num_4x4_blocks_high_lookup[plane_bsize]; - const ENTROPY_CONTEXT *const above = pd->above_context; - const ENTROPY_CONTEXT *const left = pd->left_context; - - int i; - switch (tx_size) { - case TX_4X4: - vpx_memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); - vpx_memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h); - break; - case TX_8X8: - for (i = 0; i < num_4x4_w; i += 2) - t_above[i] = !!*(const uint16_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 2) - t_left[i] = !!*(const uint16_t *)&left[i]; - break; - case TX_16X16: - for (i = 0; i < num_4x4_w; i += 4) - t_above[i] = !!*(const uint32_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 4) - t_left[i] = !!*(const uint32_t *)&left[i]; - break; - case TX_32X32: - for (i = 0; i < num_4x4_w; i += 8) - t_above[i] = !!*(const uint64_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 8) - t_left[i] = !!*(const uint64_t *)&left[i]; - break; - default: - assert(0 && "Invalid transform size."); - } -} - static void txfm_rd_in_plane(MACROBLOCK *x, int *rate, int64_t *distortion, int *skippable, int64_t *sse, @@ -743,7 +424,8 @@ static void txfm_rd_in_plane(MACROBLOCK *x, int use_fast_coef_casting) { MACROBLOCKD *const xd = &x->e_mbd; const struct macroblockd_plane *const pd = &xd->plane[plane]; - struct rdcost_block_args args = { 0 }; + struct rdcost_block_args args; + vp9_zero(args); args.x = x; args.best_rd = ref_best_rd; args.use_fast_coef_costing = use_fast_coef_casting; @@ -770,11 +452,11 @@ static void txfm_rd_in_plane(MACROBLOCK *x, } } -static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x, - int *rate, int64_t *distortion, - int *skip, int64_t *sse, - int64_t ref_best_rd, - BLOCK_SIZE bs) { +static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x, + int *rate, int64_t *distortion, + int *skip, int64_t *sse, + int64_t ref_best_rd, + BLOCK_SIZE bs) { const TX_SIZE max_tx_size = max_txsize_lookup[bs]; VP9_COMMON *const cm = &cpi->common; const TX_SIZE largest_tx_size = 
tx_mode_to_biggest_tx_size[cm->tx_mode]; @@ -784,27 +466,31 @@ static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x, mbmi->tx_size = MIN(max_tx_size, largest_tx_size); txfm_rd_in_plane(x, rate, distortion, skip, - &sse[mbmi->tx_size], ref_best_rd, 0, bs, + sse, ref_best_rd, 0, bs, mbmi->tx_size, cpi->sf.use_fast_coef_costing); cpi->tx_stepdown_count[0]++; } -static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, - int (*r)[2], int *rate, - int64_t *d, int64_t *distortion, - int *s, int *skip, - int64_t tx_cache[TX_MODES], - BLOCK_SIZE bs) { +static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, + int *rate, + int64_t *distortion, + int *skip, + int64_t *psse, + int64_t tx_cache[TX_MODES], + int64_t ref_best_rd, + BLOCK_SIZE bs) { const TX_SIZE max_tx_size = max_txsize_lookup[bs]; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; vp9_prob skip_prob = vp9_get_skip_prob(cm, xd); + int r[TX_SIZES][2], s[TX_SIZES]; + int64_t d[TX_SIZES], sse[TX_SIZES]; int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX}, {INT64_MAX, INT64_MAX}, {INT64_MAX, INT64_MAX}, {INT64_MAX, INT64_MAX}}; - int n, m; + TX_SIZE n, m; int s0, s1; const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode]; int64_t best_rd = INT64_MAX; @@ -816,6 +502,9 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, s1 = vp9_cost_bit(skip_prob, 1); for (n = TX_4X4; n <= max_tx_size; n++) { + txfm_rd_in_plane(x, &r[n][0], &d[n], &s[n], + &sse[n], ref_best_rd, 0, bs, n, + cpi->sf.use_fast_coef_costing); r[n][1] = r[n][0]; if (r[n][0] < INT_MAX) { for (m = 0; m <= n - (n == max_tx_size); m++) { @@ -846,6 +535,7 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, *distortion = d[mbmi->tx_size]; *rate = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT]; *skip = s[mbmi->tx_size]; + *psse = sse[mbmi->tx_size]; tx_cache[ONLY_4X4] = rd[TX_4X4][0]; tx_cache[ALLOW_8X8] = 
rd[TX_8X8][0]; @@ -867,159 +557,49 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, } } -static int64_t scaled_rd_cost(int rdmult, int rddiv, - int rate, int64_t dist, double scale) { - return (int64_t) (RDCOST(rdmult, rddiv, rate, dist) * scale); -} - -static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, - int (*r)[2], int *rate, - int64_t *d, int64_t *distortion, - int *s, int *skip, int64_t *sse, - int64_t ref_best_rd, - BLOCK_SIZE bs) { - const TX_SIZE max_tx_size = max_txsize_lookup[bs]; - VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - vp9_prob skip_prob = vp9_get_skip_prob(cm, xd); - int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX}, - {INT64_MAX, INT64_MAX}, - {INT64_MAX, INT64_MAX}, - {INT64_MAX, INT64_MAX}}; - int n, m; - int s0, s1; - double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00}; - const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode]; - int64_t best_rd = INT64_MAX; - TX_SIZE best_tx = TX_4X4; - - const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs); - assert(skip_prob > 0); - s0 = vp9_cost_bit(skip_prob, 0); - s1 = vp9_cost_bit(skip_prob, 1); - - for (n = TX_4X4; n <= max_tx_size; n++) { - double scale = scale_rd[n]; - r[n][1] = r[n][0]; - for (m = 0; m <= n - (n == max_tx_size); m++) { - if (m == n) - r[n][1] += vp9_cost_zero(tx_probs[m]); - else - r[n][1] += vp9_cost_one(tx_probs[m]); - } - if (s[n]) { - rd[n][0] = rd[n][1] = scaled_rd_cost(x->rdmult, x->rddiv, s1, d[n], - scale); - } else { - rd[n][0] = scaled_rd_cost(x->rdmult, x->rddiv, r[n][0] + s0, d[n], - scale); - rd[n][1] = scaled_rd_cost(x->rdmult, x->rddiv, r[n][1] + s0, d[n], - scale); - } - if (rd[n][1] < best_rd) { - best_rd = rd[n][1]; - best_tx = n; - } - } - - mbmi->tx_size = cm->tx_mode == TX_MODE_SELECT ? 
- best_tx : MIN(max_tx_size, max_mode_tx_size); - - // Actually encode using the chosen mode if a model was used, but do not - // update the r, d costs - txfm_rd_in_plane(x, rate, distortion, skip, - &sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size, - cpi->sf.use_fast_coef_costing); - - if (max_tx_size == TX_32X32 && best_tx == TX_32X32) { - cpi->tx_stepdown_count[0]++; - } else if (max_tx_size >= TX_16X16 && best_tx == TX_16X16) { - cpi->tx_stepdown_count[max_tx_size - TX_16X16]++; - } else if (rd[TX_8X8][1] <= rd[TX_4X4][1]) { - cpi->tx_stepdown_count[max_tx_size - TX_8X8]++; - } else { - cpi->tx_stepdown_count[max_tx_size - TX_4X4]++; - } -} - static void inter_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *distortion, int *skip, int64_t *psse, BLOCK_SIZE bs, int64_t txfm_cache[TX_MODES], int64_t ref_best_rd) { - int r[TX_SIZES][2], s[TX_SIZES]; - int64_t d[TX_SIZES], sse[TX_SIZES]; MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const TX_SIZE max_tx_size = max_txsize_lookup[bs]; - TX_SIZE tx_size; - assert(bs == mbmi->sb_type); + assert(bs == xd->mi[0]->mbmi.sb_type); vp9_subtract_plane(x, bs, 0); - if (cpi->sf.tx_size_search_method == USE_LARGESTALL) { + if (cpi->sf.tx_size_search_method == USE_LARGESTALL || xd->lossless) { vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t)); - choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse, - ref_best_rd, bs); - if (psse) - *psse = sse[mbmi->tx_size]; - return; - } - - if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER) { - for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) - model_rd_for_sb_y_tx(cpi, bs, tx_size, x, xd, - &r[tx_size][0], &d[tx_size], &s[tx_size]); - choose_txfm_size_from_modelrd(cpi, x, r, rate, d, distortion, s, - skip, sse, ref_best_rd, bs); + choose_largest_tx_size(cpi, x, rate, distortion, skip, psse, ref_best_rd, + bs); } else { - for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) - txfm_rd_in_plane(x, 
&r[tx_size][0], &d[tx_size], - &s[tx_size], &sse[tx_size], - ref_best_rd, 0, bs, tx_size, - cpi->sf.use_fast_coef_costing); - choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, - skip, txfm_cache, bs); + choose_tx_size_from_rd(cpi, x, rate, distortion, skip, psse, + txfm_cache, ref_best_rd, bs); } - if (psse) - *psse = sse[mbmi->tx_size]; } static void intra_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *distortion, int *skip, - int64_t *psse, BLOCK_SIZE bs, + BLOCK_SIZE bs, int64_t txfm_cache[TX_MODES], int64_t ref_best_rd) { - int64_t sse[TX_SIZES]; MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + int64_t sse; - assert(bs == mbmi->sb_type); - if (cpi->sf.tx_size_search_method != USE_FULL_RD) { + assert(bs == xd->mi[0]->mbmi.sb_type); + if (cpi->sf.tx_size_search_method != USE_FULL_RD || xd->lossless) { vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t)); - choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse, - ref_best_rd, bs); + choose_largest_tx_size(cpi, x, rate, distortion, skip, &sse, ref_best_rd, + bs); } else { - int r[TX_SIZES][2], s[TX_SIZES]; - int64_t d[TX_SIZES]; - TX_SIZE tx_size; - for (tx_size = TX_4X4; tx_size <= max_txsize_lookup[bs]; ++tx_size) - txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size], - &s[tx_size], &sse[tx_size], - ref_best_rd, 0, bs, tx_size, - cpi->sf.use_fast_coef_costing); - choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, - skip, txfm_cache, bs); + choose_tx_size_from_rd(cpi, x, rate, distortion, skip, &sse, + txfm_cache, ref_best_rd, bs); } - if (psse) - *psse = sse[mbmi->tx_size]; } -static int conditional_skipintra(MB_PREDICTION_MODE mode, - MB_PREDICTION_MODE best_intra_mode) { +static int conditional_skipintra(PREDICTION_MODE mode, + PREDICTION_MODE best_intra_mode) { if (mode == D117_PRED && best_intra_mode != V_PRED && best_intra_mode != D135_PRED) @@ -1040,13 +620,13 @@ static int conditional_skipintra(MB_PREDICTION_MODE mode, } static int64_t 
rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, - MB_PREDICTION_MODE *best_mode, + PREDICTION_MODE *best_mode, const int *bmode_costs, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, int *bestrate, int *bestratey, int64_t *bestdistortion, BLOCK_SIZE bsize, int64_t rd_thresh) { - MB_PREDICTION_MODE mode; + PREDICTION_MODE mode; MACROBLOCKD *const xd = &x->e_mbd; int64_t best_rd = rd_thresh; @@ -1184,7 +764,7 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP *cpi, MACROBLOCK *mb, int tot_rate_y = 0; int64_t total_rd = 0; ENTROPY_CONTEXT t_above[4], t_left[4]; - const int *bmode_costs = mb->mbmode_cost; + const int *bmode_costs = cpi->mbmode_cost; vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above)); vpx_memcpy(t_left, xd->plane[0].left_context, sizeof(t_left)); @@ -1192,15 +772,15 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP *cpi, MACROBLOCK *mb, // Pick modes for each sub-block (of size 4x4, 4x8, or 8x4) in an 8x8 block. for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { - MB_PREDICTION_MODE best_mode = DC_PRED; + PREDICTION_MODE best_mode = DC_PRED; int r = INT_MAX, ry = INT_MAX; int64_t d = INT64_MAX, this_rd = INT64_MAX; i = idy * 2 + idx; if (cpi->common.frame_type == KEY_FRAME) { - const MB_PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, i); - const MB_PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, i); + const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, i); + const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, i); - bmode_costs = mb->y_mode_costs[A][L]; + bmode_costs = cpi->y_mode_costs[A][L]; } this_rd = rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs, @@ -1239,15 +819,15 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int64_t tx_cache[TX_MODES], int64_t best_rd) { - MB_PREDICTION_MODE mode; - MB_PREDICTION_MODE mode_selected = DC_PRED; + PREDICTION_MODE mode; + PREDICTION_MODE 
mode_selected = DC_PRED; MACROBLOCKD *const xd = &x->e_mbd; MODE_INFO *const mic = xd->mi[0]; int this_rate, this_rate_tokenonly, s; int64_t this_distortion, this_rd; TX_SIZE best_tx = TX_4X4; int i; - int *bmode_costs = x->mbmode_cost; + int *bmode_costs = cpi->mbmode_cost; if (cpi->sf.tx_size_search_method == USE_FULL_RD) for (i = 0; i < TX_MODES; i++) @@ -1259,19 +839,16 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, MODE_INFO *above_mi = xd->mi[-xd->mi_stride]; MODE_INFO *left_mi = xd->left_available ? xd->mi[-1] : NULL; - if (!(cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]] & (1 << mode))) - continue; - if (cpi->common.frame_type == KEY_FRAME) { - const MB_PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0); - const MB_PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0); + const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0); + const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0); - bmode_costs = x->y_mode_costs[A][L]; + bmode_costs = cpi->y_mode_costs[A][L]; } mic->mbmi.mode = mode; intra_super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, - &s, NULL, bsize, local_tx_cache, best_rd); + &s, bsize, local_tx_cache, best_rd); if (this_rate_tokenonly == INT_MAX) continue; @@ -1312,7 +889,7 @@ static void super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x, int64_t ref_best_rd) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - TX_SIZE uv_txfm_size = get_uv_tx_size(mbmi); + const TX_SIZE uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]); int plane; int pnrate = 0, pnskip = 1; int64_t pndist = 0, pnsse = 0; @@ -1333,7 +910,7 @@ static void super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x, for (plane = 1; plane < MAX_MB_PLANE; ++plane) { txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse, - ref_best_rd, plane, bsize, uv_txfm_size, + ref_best_rd, plane, bsize, uv_tx_size, cpi->sf.use_fast_coef_costing); if (pnrate == INT_MAX) goto term; @@ -1358,8 +935,8 @@ 
static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, int64_t *distortion, int *skippable, BLOCK_SIZE bsize, TX_SIZE max_tx_size) { MACROBLOCKD *xd = &x->e_mbd; - MB_PREDICTION_MODE mode; - MB_PREDICTION_MODE mode_selected = DC_PRED; + PREDICTION_MODE mode; + PREDICTION_MODE mode_selected = DC_PRED; int64_t best_rd = INT64_MAX, this_rd; int this_rate_tokenonly, this_rate, s; int64_t this_distortion, this_sse; @@ -1375,7 +952,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, if (this_rate_tokenonly == INT_MAX) continue; this_rate = this_rate_tokenonly + - x->intra_uv_mode_cost[cpi->common.frame_type][mode]; + cpi->intra_uv_mode_cost[cpi->common.frame_type][mode]; this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion); if (this_rd < best_rd) { @@ -1385,27 +962,8 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, *rate_tokenonly = this_rate_tokenonly; *distortion = this_distortion; *skippable = s; - if (!x->select_txfm_size) { - int i; - struct macroblock_plane *const p = x->plane; - struct macroblockd_plane *const pd = xd->plane; - for (i = 1; i < MAX_MB_PLANE; ++i) { - p[i].coeff = ctx->coeff_pbuf[i][2]; - p[i].qcoeff = ctx->qcoeff_pbuf[i][2]; - pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2]; - p[i].eobs = ctx->eobs_pbuf[i][2]; - - ctx->coeff_pbuf[i][2] = ctx->coeff_pbuf[i][0]; - ctx->qcoeff_pbuf[i][2] = ctx->qcoeff_pbuf[i][0]; - ctx->dqcoeff_pbuf[i][2] = ctx->dqcoeff_pbuf[i][0]; - ctx->eobs_pbuf[i][2] = ctx->eobs_pbuf[i][0]; - - ctx->coeff_pbuf[i][0] = p[i].coeff; - ctx->qcoeff_pbuf[i][0] = p[i].qcoeff; - ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff; - ctx->eobs_pbuf[i][0] = p[i].eobs; - } - } + if (!x->select_tx_size) + swap_block_ptr(x, ctx, 2, 0, 1, MAX_MB_PLANE); } } @@ -1423,7 +981,7 @@ static int64_t rd_sbuv_dcpred(const VP9_COMP *cpi, MACROBLOCK *x, x->e_mbd.mi[0]->mbmi.uv_mode = DC_PRED; super_block_uvrd(cpi, x, rate_tokenonly, distortion, skippable, &unused, bsize, INT64_MAX); - *rate = 
*rate_tokenonly + x->intra_uv_mode_cost[cm->frame_type][DC_PRED]; + *rate = *rate_tokenonly + cpi->intra_uv_mode_cost[cm->frame_type][DC_PRED]; return RDCOST(x->rdmult, x->rddiv, *rate, *distortion); } @@ -1431,7 +989,7 @@ static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, BLOCK_SIZE bsize, TX_SIZE max_tx_size, int *rate_uv, int *rate_uv_tokenonly, int64_t *dist_uv, int *skip_uv, - MB_PREDICTION_MODE *mode_uv) { + PREDICTION_MODE *mode_uv) { MACROBLOCK *const x = &cpi->mb; // Use an estimated rd for uv_intra based on DC_PRED if the @@ -1449,18 +1007,10 @@ static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, *mode_uv = x->e_mbd.mi[0]->mbmi.uv_mode; } -static int cost_mv_ref(const VP9_COMP *cpi, MB_PREDICTION_MODE mode, +static int cost_mv_ref(const VP9_COMP *cpi, PREDICTION_MODE mode, int mode_context) { - const MACROBLOCK *const x = &cpi->mb; - const int segment_id = x->e_mbd.mi[0]->mbmi.segment_id; - - // Don't account for mode here if segment skip is enabled. 
- if (!vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) { - assert(is_inter_mode(mode)); - return x->inter_mode_cost[mode_context][INTER_OFFSET(mode)]; - } else { - return 0; - } + assert(is_inter_mode(mode)); + return cpi->inter_mode_cost[mode_context][INTER_OFFSET(mode)]; } static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, @@ -1470,13 +1020,12 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int_mv single_newmv[MAX_REF_FRAMES], int *rate_mv); -static int labels2mode(VP9_COMP *cpi, MACROBLOCKD *xd, int i, - MB_PREDICTION_MODE mode, - int_mv this_mv[2], - int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], - int_mv seg_mvs[MAX_REF_FRAMES], - int_mv *best_ref_mv[2], - const int *mvjcost, int *mvcost[2]) { +static int set_and_cost_bmi_mvs(VP9_COMP *cpi, MACROBLOCKD *xd, int i, + PREDICTION_MODE mode, int_mv this_mv[2], + int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], + int_mv seg_mvs[MAX_REF_FRAMES], + int_mv *best_ref_mv[2], const int *mvjcost, + int *mvcost[2]) { MODE_INFO *const mic = xd->mi[0]; const MB_MODE_INFO *const mbmi = &mic->mbmi; int thismvcost = 0; @@ -1485,8 +1034,6 @@ static int labels2mode(VP9_COMP *cpi, MACROBLOCKD *xd, int i, const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type]; const int is_compound = has_second_ref(mbmi); - // the only time we should do costing for new motion vector or mode - // is when we are on a new label (jbb May 08, 2007) switch (mode) { case NEWMV: this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int; @@ -1498,15 +1045,11 @@ static int labels2mode(VP9_COMP *cpi, MACROBLOCKD *xd, int i, mvjcost, mvcost, MV_COST_WEIGHT_SUB); } break; - case NEARESTMV: - this_mv[0].as_int = frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int; - if (is_compound) - this_mv[1].as_int = frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int; - break; case NEARMV: - this_mv[0].as_int = frame_mv[NEARMV][mbmi->ref_frame[0]].as_int; + case NEARESTMV: + this_mv[0].as_int = 
frame_mv[mode][mbmi->ref_frame[0]].as_int; if (is_compound) - this_mv[1].as_int = frame_mv[NEARMV][mbmi->ref_frame[1]].as_int; + this_mv[1].as_int = frame_mv[mode][mbmi->ref_frame[1]].as_int; break; case ZEROMV: this_mv[0].as_int = 0; @@ -1631,7 +1174,7 @@ typedef struct { int64_t d; int64_t sse; int segment_yrate; - MB_PREDICTION_MODE modes[4]; + PREDICTION_MODE modes[4]; SEG_RDSTAT rdstat[4][INTER_MODES]; int mvthresh; } BEST_SEG_INFO; @@ -1675,14 +1218,14 @@ static INLINE int mv_has_subpel(const MV *mv) { static int check_best_zero_mv( const VP9_COMP *cpi, const uint8_t mode_context[MAX_REF_FRAMES], int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], - int disable_inter_mode_mask, int this_mode, int ref_frame, - int second_ref_frame) { - if (!(disable_inter_mode_mask & (1 << INTER_OFFSET(ZEROMV))) && + int inter_mode_mask, int this_mode, + const MV_REFERENCE_FRAME ref_frames[2]) { + if ((inter_mode_mask & (1 << ZEROMV)) && (this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) && - frame_mv[this_mode][ref_frame].as_int == 0 && - (second_ref_frame == NONE || - frame_mv[this_mode][second_ref_frame].as_int == 0)) { - int rfc = mode_context[ref_frame]; + frame_mv[this_mode][ref_frames[0]].as_int == 0 && + (ref_frames[1] == NONE || + frame_mv[this_mode][ref_frames[1]].as_int == 0)) { + int rfc = mode_context[ref_frames[0]]; int c1 = cost_mv_ref(cpi, NEARMV, rfc); int c2 = cost_mv_ref(cpi, NEARESTMV, rfc); int c3 = cost_mv_ref(cpi, ZEROMV, rfc); @@ -1693,15 +1236,15 @@ static int check_best_zero_mv( if (c2 > c3) return 0; } else { assert(this_mode == ZEROMV); - if (second_ref_frame == NONE) { - if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frame].as_int == 0) || - (c3 >= c1 && frame_mv[NEARMV][ref_frame].as_int == 0)) + if (ref_frames[1] == NONE) { + if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0) || + (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0)) return 0; } else { - if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frame].as_int == 0 && - 
frame_mv[NEARESTMV][second_ref_frame].as_int == 0) || - (c3 >= c1 && frame_mv[NEARMV][ref_frame].as_int == 0 && - frame_mv[NEARMV][second_ref_frame].as_int == 0)) + if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0 && + frame_mv[NEARESTMV][ref_frames[1]].as_int == 0) || + (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0 && + frame_mv[NEARMV][ref_frames[1]].as_int == 0)) return 0; } } @@ -1709,18 +1252,28 @@ static int check_best_zero_mv( return 1; } -static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, - const TileInfo *const tile, - BEST_SEG_INFO *bsi_buf, int filter_idx, - int_mv seg_mvs[4][MAX_REF_FRAMES], - int mi_row, int mi_col) { +static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x, + const TileInfo * const tile, + int_mv *best_ref_mv, + int_mv *second_best_ref_mv, + int64_t best_rd, int *returntotrate, + int *returnyrate, + int64_t *returndistortion, + int *skippable, int64_t *psse, + int mvthresh, + int_mv seg_mvs[4][MAX_REF_FRAMES], + BEST_SEG_INFO *bsi_buf, int filter_idx, + int mi_row, int mi_col) { + int i; + BEST_SEG_INFO *bsi = bsi_buf + filter_idx; + MACROBLOCKD *xd = &x->e_mbd; + MODE_INFO *mi = xd->mi[0]; + MB_MODE_INFO *mbmi = &mi->mbmi; + int mode_idx; int k, br = 0, idx, idy; int64_t bd = 0, block_sse = 0; - MB_PREDICTION_MODE this_mode; - MACROBLOCKD *xd = &x->e_mbd; + PREDICTION_MODE this_mode; VP9_COMMON *cm = &cpi->common; - MODE_INFO *mi = xd->mi[0]; - MB_MODE_INFO *const mbmi = &mi->mbmi; struct macroblock_plane *const p = &x->plane[0]; struct macroblockd_plane *const pd = &xd->plane[0]; const int label_count = 4; @@ -1730,13 +1283,21 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize = mbmi->sb_type; const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; - vp9_variance_fn_ptr_t *v_fn_ptr = &cpi->fn_ptr[bsize]; ENTROPY_CONTEXT t_above[2], t_left[2]; - BEST_SEG_INFO *bsi 
= bsi_buf + filter_idx; - int mode_idx; int subpelmv = 1, have_ref = 0; const int has_second_rf = has_second_ref(mbmi); - const int disable_inter_mode_mask = cpi->sf.disable_inter_mode_mask[bsize]; + const int inter_mode_mask = cpi->sf.inter_mode_mask[bsize]; + + vp9_zero(*bsi); + + bsi->segment_rd = best_rd; + bsi->ref_mv[0] = best_ref_mv; + bsi->ref_mv[1] = second_best_ref_mv; + bsi->mvp.as_int = best_ref_mv->as_int; + bsi->mvthresh = mvthresh; + + for (i = 0; i < 4; i++) + bsi->modes[i] = ZEROMV; vpx_memcpy(t_above, pd->above_context, sizeof(t_above)); vpx_memcpy(t_left, pd->left_context, sizeof(t_left)); @@ -1754,7 +1315,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, // loop for 4x4/4x8/8x4 block coding. to be replaced with new rd loop int_mv mode_mv[MB_MODE_COUNT][2]; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; - MB_PREDICTION_MODE mode_selected = ZEROMV; + PREDICTION_MODE mode_selected = ZEROMV; int64_t best_rd = INT64_MAX; const int i = idy * 2 + idx; int ref; @@ -1774,13 +1335,12 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, mode_idx = INTER_OFFSET(this_mode); bsi->rdstat[i][mode_idx].brdcost = INT64_MAX; - if (disable_inter_mode_mask & (1 << mode_idx)) + if (!(inter_mode_mask & (1 << this_mode))) continue; if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv, - disable_inter_mode_mask, - this_mode, mbmi->ref_frame[0], - mbmi->ref_frame[1])) + inter_mode_mask, + this_mode, mbmi->ref_frame)) continue; vpx_memcpy(orig_pre, pd->pre, sizeof(orig_pre)); @@ -1792,9 +1352,8 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, // motion search for newmv (single predictor case only) if (!has_second_rf && this_mode == NEWMV && seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) { - int_mv *const new_mv = &mode_mv[NEWMV][0]; + MV *const new_mv = &mode_mv[NEWMV][0].as_mv; int step_param = 0; - int further_steps; int thissme, bestsme = INT_MAX; int sadpb = x->sadperbit4; MV mvp_full; @@ -1805,8 
+1364,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, if (best_rd < label_mv_thresh) break; - if (cpi->oxcf.mode != MODE_SECONDPASS_BEST && - cpi->oxcf.mode != MODE_BESTQUALITY) { + if (!is_best_mode(cpi->oxcf.mode)) { // use previous block's result as next block's MV predictor. if (i > 0) { bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int; @@ -1819,12 +1377,12 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, else max_mv = MAX(abs(bsi->mvp.as_mv.row), abs(bsi->mvp.as_mv.col)) >> 3; - if (cpi->sf.auto_mv_step_size && cm->show_frame) { + if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) { // Take wtd average of the step_params based on the last frame's // max mv magnitude and the best ref mvs of the current block for // the given reference. - step_param = (vp9_init_search_range(cpi, max_mv) + - cpi->mv_step_param) >> 1; + step_param = (vp9_init_search_range(max_mv) + + cpi->mv_step_param) / 2; } else { step_param = cpi->mv_step_param; } @@ -1833,95 +1391,60 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, mvp_full.col = bsi->mvp.as_mv.col >> 3; if (cpi->sf.adaptive_motion_search && cm->show_frame) { - mvp_full.row = x->pred_mv[mbmi->ref_frame[0]].as_mv.row >> 3; - mvp_full.col = x->pred_mv[mbmi->ref_frame[0]].as_mv.col >> 3; + mvp_full.row = x->pred_mv[mbmi->ref_frame[0]].row >> 3; + mvp_full.col = x->pred_mv[mbmi->ref_frame[0]].col >> 3; step_param = MAX(step_param, 8); } - further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; // adjust src pointer for this block mi_buf_shift(x, i); vp9_set_mv_search_range(x, &bsi->ref_mv[0]->as_mv); - if (cpi->sf.search_method == HEX) { - bestsme = vp9_hex_search(x, &mvp_full, - step_param, - sadpb, 1, v_fn_ptr, 1, - &bsi->ref_mv[0]->as_mv, - &new_mv->as_mv); - if (bestsme < INT_MAX) - bestsme = vp9_get_mvpred_var(x, &new_mv->as_mv, - &bsi->ref_mv[0]->as_mv, - v_fn_ptr, 1); - } else if (cpi->sf.search_method == SQUARE) { - bestsme = vp9_square_search(x, &mvp_full, - 
step_param, - sadpb, 1, v_fn_ptr, 1, - &bsi->ref_mv[0]->as_mv, - &new_mv->as_mv); - if (bestsme < INT_MAX) - bestsme = vp9_get_mvpred_var(x, &new_mv->as_mv, - &bsi->ref_mv[0]->as_mv, - v_fn_ptr, 1); - } else if (cpi->sf.search_method == BIGDIA) { - bestsme = vp9_bigdia_search(x, &mvp_full, - step_param, - sadpb, 1, v_fn_ptr, 1, - &bsi->ref_mv[0]->as_mv, - &new_mv->as_mv); - if (bestsme < INT_MAX) - bestsme = vp9_get_mvpred_var(x, &new_mv->as_mv, - &bsi->ref_mv[0]->as_mv, - v_fn_ptr, 1); - } else { - bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, - sadpb, further_steps, 0, v_fn_ptr, - &bsi->ref_mv[0]->as_mv, - &new_mv->as_mv); - } + bestsme = vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, + sadpb, &bsi->ref_mv[0]->as_mv, new_mv, + INT_MAX, 1); // Should we do a full search (best quality only) - if (cpi->oxcf.mode == MODE_BESTQUALITY || - cpi->oxcf.mode == MODE_SECONDPASS_BEST) { + if (is_best_mode(cpi->oxcf.mode)) { int_mv *const best_mv = &mi->bmi[i].as_mv[0]; /* Check if mvp_full is within the range. 
*/ clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); thissme = cpi->full_search_sad(x, &mvp_full, - sadpb, 16, v_fn_ptr, - x->nmvjointcost, x->mvcost, + sadpb, 16, &cpi->fn_ptr[bsize], &bsi->ref_mv[0]->as_mv, &best_mv->as_mv); if (thissme < bestsme) { bestsme = thissme; - new_mv->as_int = best_mv->as_int; + *new_mv = best_mv->as_mv; } else { // The full search result is actually worse so re-instate the // previous best vector - best_mv->as_int = new_mv->as_int; + best_mv->as_mv = *new_mv; } } if (bestsme < INT_MAX) { int distortion; cpi->find_fractional_mv_step(x, - &new_mv->as_mv, + new_mv, &bsi->ref_mv[0]->as_mv, cm->allow_high_precision_mv, - x->errorperbit, v_fn_ptr, - cpi->sf.subpel_force_stop, - cpi->sf.subpel_iters_per_step, + x->errorperbit, &cpi->fn_ptr[bsize], + cpi->sf.mv.subpel_force_stop, + cpi->sf.mv.subpel_iters_per_step, x->nmvjointcost, x->mvcost, &distortion, - &x->pred_sse[mbmi->ref_frame[0]]); + &x->pred_sse[mbmi->ref_frame[0]], + NULL, 0, 0); // save motion search result for use in compound prediction - seg_mvs[i][mbmi->ref_frame[0]].as_int = new_mv->as_int; + seg_mvs[i][mbmi->ref_frame[0]].as_mv = *new_mv; } if (cpi->sf.adaptive_motion_search) - x->pred_mv[mbmi->ref_frame[0]].as_int = new_mv->as_int; + x->pred_mv[mbmi->ref_frame[0]] = *new_mv; // restore src pointers mi_buf_restore(x, orig_src, orig_pre); @@ -1952,8 +1475,9 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, } bsi->rdstat[i][mode_idx].brate = - labels2mode(cpi, xd, i, this_mode, mode_mv[this_mode], frame_mv, - seg_mvs[i], bsi->ref_mv, x->nmvjointcost, x->mvcost); + set_and_cost_bmi_mvs(cpi, xd, i, this_mode, mode_mv[this_mode], + frame_mv, seg_mvs[i], bsi->ref_mv, + x->nmvjointcost, x->mvcost); for (ref = 0; ref < 1 + has_second_rf; ++ref) { bsi->rdstat[i][mode_idx].mvs[ref].as_int = @@ -2042,16 +1566,16 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, for (midx = 0; midx < INTER_MODES; ++midx) 
bsi->rdstat[iy][midx].brdcost = INT64_MAX; bsi->segment_rd = INT64_MAX; - return; + return INT64_MAX;; } mode_idx = INTER_OFFSET(mode_selected); vpx_memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above)); vpx_memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left)); - labels2mode(cpi, xd, i, mode_selected, mode_mv[mode_selected], - frame_mv, seg_mvs[i], bsi->ref_mv, x->nmvjointcost, - x->mvcost); + set_and_cost_bmi_mvs(cpi, xd, i, mode_selected, mode_mv[mode_selected], + frame_mv, seg_mvs[i], bsi->ref_mv, x->nmvjointcost, + x->mvcost); br += bsi->rdstat[i][mode_idx].brate; bd += bsi->rdstat[i][mode_idx].bdist; @@ -2065,7 +1589,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, for (midx = 0; midx < INTER_MODES; ++midx) bsi->rdstat[iy][midx].brdcost = INT64_MAX; bsi->segment_rd = INT64_MAX; - return; + return INT64_MAX;; } } } /* for each label */ @@ -2079,42 +1603,6 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, // update the coding decisions for (k = 0; k < 4; ++k) bsi->modes[k] = mi->bmi[k].as_mode; -} - -static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, - const TileInfo *const tile, - int_mv *best_ref_mv, - int_mv *second_best_ref_mv, - int64_t best_rd, - int *returntotrate, - int *returnyrate, - int64_t *returndistortion, - int *skippable, int64_t *psse, - int mvthresh, - int_mv seg_mvs[4][MAX_REF_FRAMES], - BEST_SEG_INFO *bsi_buf, - int filter_idx, - int mi_row, int mi_col) { - int i; - BEST_SEG_INFO *bsi = bsi_buf + filter_idx; - MACROBLOCKD *xd = &x->e_mbd; - MODE_INFO *mi = xd->mi[0]; - MB_MODE_INFO *mbmi = &mi->mbmi; - int mode_idx; - - vp9_zero(*bsi); - - bsi->segment_rd = best_rd; - bsi->ref_mv[0] = best_ref_mv; - bsi->ref_mv[1] = second_best_ref_mv; - bsi->mvp.as_int = best_ref_mv->as_int; - bsi->mvthresh = mvthresh; - - for (i = 0; i < 4; i++) - bsi->modes[i] = ZEROMV; - - rd_check_segment_txsize(cpi, x, tile, bsi_buf, filter_idx, seg_mvs, - mi_row, mi_col); if (bsi->segment_rd 
> best_rd) return INT64_MAX; @@ -2141,72 +1629,12 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, return bsi->segment_rd; } -static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x, - uint8_t *ref_y_buffer, int ref_y_stride, - int ref_frame, BLOCK_SIZE block_size ) { - MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - int_mv this_mv; - int i; - int zero_seen = 0; - int best_index = 0; - int best_sad = INT_MAX; - int this_sad = INT_MAX; - int max_mv = 0; - - uint8_t *src_y_ptr = x->plane[0].src.buf; - uint8_t *ref_y_ptr; - int row_offset, col_offset; - int num_mv_refs = MAX_MV_REF_CANDIDATES + - (cpi->sf.adaptive_motion_search && - cpi->common.show_frame && - block_size < cpi->sf.max_partition_size); - - int_mv pred_mv[3]; - pred_mv[0] = mbmi->ref_mvs[ref_frame][0]; - pred_mv[1] = mbmi->ref_mvs[ref_frame][1]; - pred_mv[2] = x->pred_mv[ref_frame]; - - // Get the sad for each candidate reference mv - for (i = 0; i < num_mv_refs; i++) { - this_mv.as_int = pred_mv[i].as_int; - - max_mv = MAX(max_mv, - MAX(abs(this_mv.as_mv.row), abs(this_mv.as_mv.col)) >> 3); - // only need to check zero mv once - if (!this_mv.as_int && zero_seen) - continue; - - zero_seen = zero_seen || !this_mv.as_int; - - row_offset = this_mv.as_mv.row >> 3; - col_offset = this_mv.as_mv.col >> 3; - ref_y_ptr = ref_y_buffer + (ref_y_stride * row_offset) + col_offset; - - // Find sad for current vector. - this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride, - ref_y_ptr, ref_y_stride, - 0x7fffffff); - - // Note if it is the best so far. - if (this_sad < best_sad) { - best_sad = this_sad; - best_index = i; - } - } - - // Note the index of the mv that worked best in the reference list. 
- x->mv_best_ref_index[ref_frame] = best_index; - x->max_mv_context[ref_frame] = max_mv; - x->pred_mv_sad[ref_frame] = best_sad; -} - -static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id, +static void estimate_ref_frame_costs(const VP9_COMMON *cm, + const MACROBLOCKD *xd, + int segment_id, unsigned int *ref_costs_single, unsigned int *ref_costs_comp, vp9_prob *comp_mode_p) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &cpi->mb.e_mbd; int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME); if (seg_ref_active) { @@ -2264,10 +1692,8 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id, static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int mode_index, - int_mv *ref_mv, - int_mv *second_ref_mv, int64_t comp_pred_diff[REFERENCE_MODES], - int64_t tx_size_diff[TX_MODES], + const int64_t tx_size_diff[TX_MODES], int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]) { MACROBLOCKD *const xd = &x->e_mbd; @@ -2276,10 +1702,6 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, ctx->skip = x->skip; ctx->best_mode_index = mode_index; ctx->mic = *xd->mi[0]; - - ctx->best_ref_mv[0].as_int = ref_mv->as_int; - ctx->best_ref_mv[1].as_int = second_ref_mv->as_int; - ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE]; ctx->comp_pred_diff = (int)comp_pred_diff[COMPOUND_REFERENCE]; ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT]; @@ -2289,40 +1711,14 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS); } -static void setup_pred_block(const MACROBLOCKD *xd, - struct buf_2d dst[MAX_MB_PLANE], - const YV12_BUFFER_CONFIG *src, - int mi_row, int mi_col, - const struct scale_factors *scale, - const struct scale_factors *scale_uv) { - int i; - - dst[0].buf = src->y_buffer; - dst[0].stride = src->y_stride; - dst[1].buf = src->u_buffer; - dst[2].buf = src->v_buffer; - 
dst[1].stride = dst[2].stride = src->uv_stride; -#if CONFIG_ALPHA - dst[3].buf = src->alpha_buffer; - dst[3].stride = src->alpha_stride; -#endif - - // TODO(jkoleszar): Make scale factors per-plane data - for (i = 0; i < MAX_MB_PLANE; i++) { - setup_pred_plane(dst + i, dst[i].buf, dst[i].stride, mi_row, mi_col, - i ? scale_uv : scale, - xd->plane[i].subsampling_x, xd->plane[i].subsampling_y); - } -} - -void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, - const TileInfo *const tile, - MV_REFERENCE_FRAME ref_frame, - BLOCK_SIZE block_size, - int mi_row, int mi_col, - int_mv frame_nearest_mv[MAX_REF_FRAMES], - int_mv frame_near_mv[MAX_REF_FRAMES], - struct buf_2d yv12_mb[4][MAX_MB_PLANE]) { +static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, + const TileInfo *const tile, + MV_REFERENCE_FRAME ref_frame, + BLOCK_SIZE block_size, + int mi_row, int mi_col, + int_mv frame_nearest_mv[MAX_REF_FRAMES], + int_mv frame_near_mv[MAX_REF_FRAMES], + struct buf_2d yv12_mb[4][MAX_MB_PLANE]) { const VP9_COMMON *cm = &cpi->common; const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); MACROBLOCKD *const xd = &x->e_mbd; @@ -2332,7 +1728,7 @@ void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this // use the UV scaling factors. - setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf); + vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf); // Gets an initial list of candidate vectors from neighbours and orders them vp9_find_mv_refs(cm, xd, tile, mi, ref_frame, candidates, mi_row, mi_col); @@ -2346,37 +1742,20 @@ void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, // in full and choose the best as the centre point for subsequent searches. // The current implementation doesn't support scaling. 
if (!vp9_is_scaled(sf) && block_size >= BLOCK_8X8) - mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, - ref_frame, block_size); -} - -const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi, - int ref_frame) { - const VP9_COMMON *const cm = &cpi->common; - const int ref_idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)]; - const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1]; - return (scaled_idx != ref_idx) ? &cm->frame_bufs[scaled_idx].buf : NULL; -} - -int vp9_get_switchable_rate(const MACROBLOCK *x) { - const MACROBLOCKD *const xd = &x->e_mbd; - const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const int ctx = vp9_get_pred_context_switchable_interp(xd); - return SWITCHABLE_INTERP_RATE_FACTOR * - x->switchable_interp_costs[ctx][mbmi->interp_filter]; + vp9_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, + ref_frame, block_size); } static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, - const TileInfo *const tile, BLOCK_SIZE bsize, int mi_row, int mi_col, int_mv *tmp_mv, int *rate_mv) { MACROBLOCKD *xd = &x->e_mbd; - VP9_COMMON *cm = &cpi->common; + const VP9_COMMON *cm = &cpi->common; MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}}; + struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}}; int bestsme = INT_MAX; - int further_steps, step_param; + int step_param; int sadpb = x->sadperbit16; MV mvp_full; int ref = mbmi->ref_frame[0]; @@ -2393,7 +1772,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, MV pred_mv[3]; pred_mv[0] = mbmi->ref_mvs[ref][0].as_mv; pred_mv[1] = mbmi->ref_mvs[ref][1].as_mv; - pred_mv[2] = x->pred_mv[ref].as_mv; + pred_mv[2] = x->pred_mv[ref]; if (scaled_ref_frame) { int i; @@ -2410,35 +1789,36 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, // Work out the size of the first step in the mv step search. // 0 here is maximum length first step. 1 is MAX >> 1 etc. 
- if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) { + if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) { // Take wtd average of the step_params based on the last frame's // max mv magnitude and that based on the best ref mvs of the current // block for the given reference. - step_param = (vp9_init_search_range(cpi, x->max_mv_context[ref]) + - cpi->mv_step_param) >> 1; + step_param = (vp9_init_search_range(x->max_mv_context[ref]) + + cpi->mv_step_param) / 2; } else { step_param = cpi->mv_step_param; } if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64 && - cpi->common.show_frame) { + cm->show_frame) { int boffset = 2 * (b_width_log2(BLOCK_64X64) - MIN(b_height_log2(bsize), b_width_log2(bsize))); step_param = MAX(step_param, boffset); } if (cpi->sf.adaptive_motion_search) { - int bwl = b_width_log2_lookup[bsize]; - int bhl = b_height_log2_lookup[bsize]; + int bwl = b_width_log2(bsize); + int bhl = b_height_log2(bsize); int i; int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4); if (tlevel < 5) step_param += 2; - for (i = LAST_FRAME; i <= ALTREF_FRAME && cpi->common.show_frame; ++i) { + for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) { if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) { - x->pred_mv[ref].as_int = 0; + x->pred_mv[ref].row = 0; + x->pred_mv[ref].col = 0; tmp_mv->as_int = INVALID_MV; if (scaled_ref_frame) { @@ -2456,50 +1836,8 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, mvp_full.col >>= 3; mvp_full.row >>= 3; - // Further step/diamond searches as necessary - further_steps = (cpi->sf.max_step_search_steps - 1) - step_param; - - if (cpi->sf.search_method == FAST_DIAMOND) { - bestsme = vp9_fast_dia_search(x, &mvp_full, step_param, sadpb, 0, - &cpi->fn_ptr[bsize], 1, - &ref_mv, &tmp_mv->as_mv); - if (bestsme < INT_MAX) - bestsme = vp9_get_mvpred_var(x, &tmp_mv->as_mv, &ref_mv, - &cpi->fn_ptr[bsize], 1); - } else if (cpi->sf.search_method == FAST_HEX) { - bestsme = vp9_fast_hex_search(x, &mvp_full, 
step_param, sadpb, 0, - &cpi->fn_ptr[bsize], 1, - &ref_mv, &tmp_mv->as_mv); - if (bestsme < INT_MAX) - bestsme = vp9_get_mvpred_var(x, &tmp_mv->as_mv, &ref_mv, - &cpi->fn_ptr[bsize], 1); - } else if (cpi->sf.search_method == HEX) { - bestsme = vp9_hex_search(x, &mvp_full, step_param, sadpb, 1, - &cpi->fn_ptr[bsize], 1, - &ref_mv, &tmp_mv->as_mv); - if (bestsme < INT_MAX) - bestsme = vp9_get_mvpred_var(x, &tmp_mv->as_mv, &ref_mv, - &cpi->fn_ptr[bsize], 1); - } else if (cpi->sf.search_method == SQUARE) { - bestsme = vp9_square_search(x, &mvp_full, step_param, sadpb, 1, - &cpi->fn_ptr[bsize], 1, - &ref_mv, &tmp_mv->as_mv); - if (bestsme < INT_MAX) - bestsme = vp9_get_mvpred_var(x, &tmp_mv->as_mv, &ref_mv, - &cpi->fn_ptr[bsize], 1); - } else if (cpi->sf.search_method == BIGDIA) { - bestsme = vp9_bigdia_search(x, &mvp_full, step_param, sadpb, 1, - &cpi->fn_ptr[bsize], 1, - &ref_mv, &tmp_mv->as_mv); - if (bestsme < INT_MAX) - bestsme = vp9_get_mvpred_var(x, &tmp_mv->as_mv, &ref_mv, - &cpi->fn_ptr[bsize], 1); - } else { - bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, - sadpb, further_steps, 1, - &cpi->fn_ptr[bsize], - &ref_mv, &tmp_mv->as_mv); - } + bestsme = vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, + &ref_mv, &tmp_mv->as_mv, INT_MAX, 1); x->mv_col_min = tmp_col_min; x->mv_col_max = tmp_col_max; @@ -2512,16 +1850,16 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], - cpi->sf.subpel_force_stop, - cpi->sf.subpel_iters_per_step, + cpi->sf.mv.subpel_force_stop, + cpi->sf.mv.subpel_iters_per_step, x->nmvjointcost, x->mvcost, - &dis, &x->pred_sse[ref]); + &dis, &x->pred_sse[ref], NULL, 0, 0); } *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); - if (cpi->sf.adaptive_motion_search && cpi->common.show_frame) - x->pred_mv[ref].as_int = tmp_mv->as_int; + if (cpi->sf.adaptive_motion_search && cm->show_frame) + 
x->pred_mv[ref] = tmp_mv->as_mv; if (scaled_ref_frame) { int i; @@ -2580,7 +1918,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, struct buf_2d ref_yv12[2]; int bestsme = INT_MAX; int sadpb = x->sadperbit16; - int_mv tmp_mv; + MV tmp_mv; int search_range = 3; int tmp_col_min = x->mv_col_min; @@ -2609,20 +1947,18 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, vp9_set_mv_search_range(x, &ref_mv[id].as_mv); // Use mv result from single mode as mvp. - tmp_mv.as_int = frame_mv[refs[id]].as_int; + tmp_mv = frame_mv[refs[id]].as_mv; - tmp_mv.as_mv.col >>= 3; - tmp_mv.as_mv.row >>= 3; + tmp_mv.col >>= 3; + tmp_mv.row >>= 3; // Small-range full-pixel motion search - bestsme = vp9_refining_search_8p_c(x, &tmp_mv.as_mv, sadpb, + bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb, search_range, &cpi->fn_ptr[bsize], - x->nmvjointcost, x->mvcost, - &ref_mv[id].as_mv, second_pred, - pw, ph); + &ref_mv[id].as_mv, second_pred); if (bestsme < INT_MAX) - bestsme = vp9_get_mvpred_av_var(x, &tmp_mv.as_mv, &ref_mv[id].as_mv, + bestsme = vp9_get_mvpred_av_var(x, &tmp_mv, &ref_mv[id].as_mv, second_pred, &cpi->fn_ptr[bsize], 1); x->mv_col_min = tmp_col_min; @@ -2633,13 +1969,13 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, if (bestsme < INT_MAX) { int dis; /* TODO: use dis in distortion calculation later. 
*/ unsigned int sse; - bestsme = cpi->find_fractional_mv_step_comp( - x, &tmp_mv.as_mv, + bestsme = cpi->find_fractional_mv_step( + x, &tmp_mv, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], - 0, cpi->sf.subpel_iters_per_step, + 0, cpi->sf.mv.subpel_iters_per_step, x->nmvjointcost, x->mvcost, &dis, &sse, second_pred, pw, ph); @@ -2649,7 +1985,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, xd->plane[0].pre[0] = scaled_first_yv12; if (bestsme < last_besterr[id]) { - frame_mv[refs[id]].as_int = tmp_mv.as_int; + frame_mv[refs[id]].as_mv = tmp_mv; last_besterr[id] = bestsme; } else { break; @@ -2684,26 +2020,104 @@ static INLINE void restore_dst_buf(MACROBLOCKD *xd, } } +static void rd_encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int *rate2, + int64_t *distortion, int64_t *distortion_uv, + int *disable_skip) { + VP9_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + const BLOCK_SIZE y_size = get_plane_block_size(bsize, &xd->plane[0]); + const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]); + unsigned int var, sse; + // Skipping threshold for ac. + unsigned int thresh_ac; + // Skipping threshold for dc + unsigned int thresh_dc; + + var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride, + xd->plane[0].dst.buf, + xd->plane[0].dst.stride, &sse); + + if (x->encode_breakout > 0) { + // Set a maximum for threshold to avoid big PSNR loss in low bitrate + // case. Use extreme low threshold for static frames to limit skipping. + const unsigned int max_thresh = (cpi->allow_encode_breakout == + ENCODE_BREAKOUT_LIMITED) ? 128 : 36000; + // The encode_breakout input + const unsigned int min_thresh = + MIN(((unsigned int)x->encode_breakout << 4), max_thresh); + + // Calculate threshold according to dequant value. 
+ thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9; + thresh_ac = clamp(thresh_ac, min_thresh, max_thresh); + + // Adjust threshold according to partition size. + thresh_ac >>= 8 - (b_width_log2(bsize) + + b_height_log2(bsize)); + thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6); + } else { + thresh_ac = 0; + thresh_dc = 0; + } + + // Y skipping condition checking + if (sse < thresh_ac || sse == 0) { + // dc skipping checking + if ((sse - var) < thresh_dc || sse == var) { + unsigned int sse_u, sse_v; + unsigned int var_u, var_v; + + var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf, + x->plane[1].src.stride, + xd->plane[1].dst.buf, + xd->plane[1].dst.stride, &sse_u); + + // U skipping condition checking + if ((sse_u * 4 < thresh_ac || sse_u == 0) && + (sse_u - var_u < thresh_dc || sse_u == var_u)) { + var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf, + x->plane[2].src.stride, + xd->plane[2].dst.buf, + xd->plane[2].dst.stride, &sse_v); + + // V skipping condition checking + if ((sse_v * 4 < thresh_ac || sse_v == 0) && + (sse_v - var_v < thresh_dc || sse_v == var_v)) { + x->skip = 1; + + // The cost of skip bit needs to be added. + *rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); + + // Scaling factor for SSE from spatial domain to frequency domain + // is 16. Adjust distortion accordingly. 
+ *distortion_uv = (sse_u + sse_v) << 4; + *distortion = (sse << 4) + *distortion_uv; + + *disable_skip = 1; + } + } + } + } +} + static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, - const TileInfo *const tile, BLOCK_SIZE bsize, int64_t txfm_cache[], int *rate2, int64_t *distortion, int *skippable, int *rate_y, int64_t *distortion_y, int *rate_uv, int64_t *distortion_uv, - int *mode_excluded, int *disable_skip, - INTERP_FILTER *best_filter, + int *disable_skip, int_mv (*mode_mv)[MAX_REF_FRAMES], int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], int64_t *psse, const int64_t ref_best_rd) { VP9_COMMON *cm = &cpi->common; + RD_OPT *rd_opt = &cpi->rd; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; const int is_comp_pred = has_second_ref(mbmi); - const int num_refs = is_comp_pred ? 2 : 1; const int this_mode = mbmi->mode; int_mv *frame_mv = mode_mv[this_mode]; int i; @@ -2719,6 +2133,25 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *orig_dst[MAX_MB_PLANE]; int orig_dst_stride[MAX_MB_PLANE]; int rs = 0; + INTERP_FILTER best_filter = SWITCHABLE; + int skip_txfm[MAX_MB_PLANE] = {0}; + int64_t bsse[MAX_MB_PLANE] = {0}; + + int bsl = mi_width_log2_lookup[bsize]; + int pred_filter_search = cpi->sf.cb_pred_filter_search ? 
+ (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_video_frame)) & 0x1 : 0; + + if (pred_filter_search) { + INTERP_FILTER af = SWITCHABLE, lf = SWITCHABLE; + if (xd->up_available) + af = xd->mi[-xd->mi_stride]->mbmi.interp_filter; + if (xd->left_available) + lf = xd->mi[-1]->mbmi.interp_filter; + + if ((this_mode != NEWMV) || (af == lf)) + best_filter = af; + } if (is_comp_pred) { if (frame_mv[refs[0]].as_int == INVALID_MV || @@ -2747,7 +2180,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, *rate2 += rate_mv; } else { int_mv tmp_mv; - single_motion_search(cpi, x, tile, bsize, mi_row, mi_col, + single_motion_search(cpi, x, bsize, mi_row, mi_col, &tmp_mv, &rate_mv); if (tmp_mv.as_int == INVALID_MV) return INT64_MAX; @@ -2758,7 +2191,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } } - for (i = 0; i < num_refs; ++i) { + for (i = 0; i < is_comp_pred + 1; ++i) { cur_mv[i] = frame_mv[refs[i]]; // Clip "next_nearest" so that it does not extend to far out of image if (this_mode != NEWMV) @@ -2785,10 +2218,6 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, * if the first is known */ *rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]); - if (!(*mode_excluded)) - *mode_excluded = is_comp_pred ? 
cm->reference_mode == SINGLE_REFERENCE - : cm->reference_mode == COMPOUND_REFERENCE; - pred_exists = 0; // Are all MVs integer pel for Y and UV intpel_mv = !mv_has_subpel(&mbmi->mv[0].as_mv); @@ -2797,16 +2226,14 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, // Search for best switchable filter by checking the variance of // pred error irrespective of whether the filter will be used - cpi->mask_filter_rd = 0; + rd_opt->mask_filter = 0; for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) - cpi->rd_filter_cache[i] = INT64_MAX; + rd_opt->filter_cache[i] = INT64_MAX; if (cm->interp_filter != BILINEAR) { - *best_filter = EIGHTTAP; - if (x->source_variance < - cpi->sf.disable_filter_search_var_thresh) { - *best_filter = EIGHTTAP; - } else { + if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) { + best_filter = EIGHTTAP; + } else if (best_filter == SWITCHABLE) { int newbest; int tmp_rate_sum = 0; int64_t tmp_dist_sum = 0; @@ -2815,17 +2242,17 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int j; int64_t rs_rd; mbmi->interp_filter = i; - rs = vp9_get_switchable_rate(x); + rs = vp9_get_switchable_rate(cpi); rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0); if (i > 0 && intpel_mv) { rd = RDCOST(x->rdmult, x->rddiv, tmp_rate_sum, tmp_dist_sum); - cpi->rd_filter_cache[i] = rd; - cpi->rd_filter_cache[SWITCHABLE_FILTERS] = - MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS], rd + rs_rd); + rd_opt->filter_cache[i] = rd; + rd_opt->filter_cache[SWITCHABLE_FILTERS] = + MIN(rd_opt->filter_cache[SWITCHABLE_FILTERS], rd + rs_rd); if (cm->interp_filter == SWITCHABLE) rd += rs_rd; - cpi->mask_filter_rd = MAX(cpi->mask_filter_rd, rd); + rd_opt->mask_filter = MAX(rd_opt->mask_filter, rd); } else { int rate_sum = 0; int64_t dist_sum = 0; @@ -2845,12 +2272,12 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum); rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum); - 
cpi->rd_filter_cache[i] = rd; - cpi->rd_filter_cache[SWITCHABLE_FILTERS] = - MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS], rd + rs_rd); + rd_opt->filter_cache[i] = rd; + rd_opt->filter_cache[SWITCHABLE_FILTERS] = + MIN(rd_opt->filter_cache[SWITCHABLE_FILTERS], rd + rs_rd); if (cm->interp_filter == SWITCHABLE) rd += rs_rd; - cpi->mask_filter_rd = MAX(cpi->mask_filter_rd, rd); + rd_opt->mask_filter = MAX(rd_opt->mask_filter, rd); if (i == 0 && intpel_mv) { tmp_rate_sum = rate_sum; @@ -2868,9 +2295,11 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (newbest) { best_rd = rd; - *best_filter = mbmi->interp_filter; + best_filter = mbmi->interp_filter; if (cm->interp_filter == SWITCHABLE && i && !intpel_mv) best_needs_copy = !best_needs_copy; + vpx_memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm)); + vpx_memcpy(bsse, x->bsse, sizeof(bsse)); } if ((cm->interp_filter == SWITCHABLE && newbest) || @@ -2884,8 +2313,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } // Set the appropriate filter mbmi->interp_filter = cm->interp_filter != SWITCHABLE ? - cm->interp_filter : *best_filter; - rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(x) : 0; + cm->interp_filter : best_filter; + rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi) : 0; if (pred_exists) { if (best_needs_copy) { @@ -2915,87 +2344,17 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } if (cm->interp_filter == SWITCHABLE) - *rate2 += vp9_get_switchable_rate(x); + *rate2 += vp9_get_switchable_rate(cpi); if (!is_comp_pred) { - if (!x->in_active_map) { - if (psse) - *psse = 0; - *distortion = 0; - x->skip = 1; - } else if (cpi->allow_encode_breakout && x->encode_breakout) { - const BLOCK_SIZE y_size = get_plane_block_size(bsize, &xd->plane[0]); - const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]); - unsigned int var, sse; - // Skipping threshold for ac. 
- unsigned int thresh_ac; - // Set a maximum for threshold to avoid big PSNR loss in low bitrate case. - // Use extreme low threshold for static frames to limit skipping. - const unsigned int max_thresh = (cpi->allow_encode_breakout == - ENCODE_BREAKOUT_LIMITED) ? 128 : 36000; - // The encode_breakout input - const unsigned int min_thresh = - MIN(((unsigned int)x->encode_breakout << 4), max_thresh); - - // Calculate threshold according to dequant value. - thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9; - thresh_ac = clamp(thresh_ac, min_thresh, max_thresh); - - var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].dst.buf, - xd->plane[0].dst.stride, &sse); - - // Adjust threshold according to partition size. - thresh_ac >>= 8 - (b_width_log2_lookup[bsize] + - b_height_log2_lookup[bsize]); - - // Y skipping condition checking - if (sse < thresh_ac || sse == 0) { - // Skipping threshold for dc - unsigned int thresh_dc; - - thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6); - - // dc skipping checking - if ((sse - var) < thresh_dc || sse == var) { - unsigned int sse_u, sse_v; - unsigned int var_u, var_v; - - var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf, - x->plane[1].src.stride, - xd->plane[1].dst.buf, - xd->plane[1].dst.stride, &sse_u); - - // U skipping condition checking - if ((sse_u * 4 < thresh_ac || sse_u == 0) && - (sse_u - var_u < thresh_dc || sse_u == var_u)) { - var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf, - x->plane[2].src.stride, - xd->plane[2].dst.buf, - xd->plane[2].dst.stride, &sse_v); - - // V skipping condition checking - if ((sse_v * 4 < thresh_ac || sse_v == 0) && - (sse_v - var_v < thresh_dc || sse_v == var_v)) { - x->skip = 1; - - // The cost of skip bit needs to be added. - *rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); - - // Scaling factor for SSE from spatial domain to frequency domain - // is 16. Adjust distortion accordingly. 
- *distortion_uv = (sse_u + sse_v) << 4; - *distortion = (sse << 4) + *distortion_uv; - - *disable_skip = 1; - this_rd = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion); - } - } - } - } - } + if (cpi->allow_encode_breakout) + rd_encode_breakout_test(cpi, x, bsize, rate2, distortion, distortion_uv, + disable_skip); } + vpx_memcpy(x->skip_txfm, skip_txfm, sizeof(skip_txfm)); + vpx_memcpy(x->bsse, bsse, sizeof(bsse)); + if (!x->skip) { int skippable_y, skippable_uv; int64_t sseuv = INT64_MAX; @@ -3037,36 +2396,13 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, return this_rd; // if 0, this will be re-calculated by caller } -static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, - int max_plane) { - struct macroblock_plane *const p = x->plane; - struct macroblockd_plane *const pd = x->e_mbd.plane; - int i; - - for (i = 0; i < max_plane; ++i) { - p[i].coeff = ctx->coeff_pbuf[i][1]; - p[i].qcoeff = ctx->qcoeff_pbuf[i][1]; - pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1]; - p[i].eobs = ctx->eobs_pbuf[i][1]; - - ctx->coeff_pbuf[i][1] = ctx->coeff_pbuf[i][0]; - ctx->qcoeff_pbuf[i][1] = ctx->qcoeff_pbuf[i][0]; - ctx->dqcoeff_pbuf[i][1] = ctx->dqcoeff_pbuf[i][0]; - ctx->eobs_pbuf[i][1] = ctx->eobs_pbuf[i][0]; - - ctx->coeff_pbuf[i][0] = p[i].coeff; - ctx->qcoeff_pbuf[i][0] = p[i].qcoeff; - ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff; - ctx->eobs_pbuf[i][0] = p[i].eobs; - } -} - void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int *returnrate, int64_t *returndist, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; + struct macroblockd_plane *const pd = xd->plane; int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0; int y_skip = 0, uv_skip = 0; int64_t dist_y = 0, dist_uv = 0, tx_cache[TX_MODES] = { 0 }; @@ -3082,7 +2418,9 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, *returnrate = INT_MAX; return; } - max_uv_tx_size = 
get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize); + max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize, + pd[1].subsampling_x, + pd[1].subsampling_y); rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly, &dist_uv, &uv_skip, bsize, max_uv_tx_size); } else { @@ -3092,7 +2430,9 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, *returnrate = INT_MAX; return; } - max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize); + max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize, + pd[1].subsampling_x, + pd[1].subsampling_y); rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly, &dist_uv, &uv_skip, BLOCK_8X8, max_uv_tx_size); } @@ -3118,6 +2458,29 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, ctx->mic = *xd->mi[0]; } +// Updating rd_thresh_freq_fact[] here means that the different +// partition/block sizes are handled independently based on the best +// choice for the current partition. It may well be better to keep a scaled +// best rd so far value and update rd_thresh_freq_fact based on the mode/size +// combination that wins out. +static void update_rd_thresh_fact(VP9_COMP *cpi, int bsize, + int best_mode_index) { + if (cpi->sf.adaptive_rd_thresh > 0) { + const int top_mode = bsize < BLOCK_8X8 ? 
MAX_REFS : MAX_MODES; + int mode; + for (mode = 0; mode < top_mode; ++mode) { + int *const fact = &cpi->rd.thresh_freq_fact[bsize][mode]; + + if (mode == best_mode_index) { + *fact -= (*fact >> 3); + } else { + *fact = MIN(*fact + RD_THRESH_INC, + cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT); + } + } + } +} + int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, const TileInfo *const tile, int mi_row, int mi_col, @@ -3127,10 +2490,12 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far) { VP9_COMMON *const cm = &cpi->common; + RD_OPT *const rd_opt = &cpi->rd; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; const struct segmentation *const seg = &cm->seg; - MB_PREDICTION_MODE this_mode; + struct macroblockd_plane *const pd = xd->plane; + PREDICTION_MODE this_mode; MV_REFERENCE_FRAME ref_frame, second_ref_frame; unsigned char segment_id = mbmi->segment_id; int comp_pred, i; @@ -3146,19 +2511,18 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int64_t best_pred_rd[REFERENCE_MODES]; int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS]; int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]; - MB_MODE_INFO best_mbmode = { 0 }; - int mode_index, best_mode_index = 0; + MB_MODE_INFO best_mbmode; + int mode_index, best_mode_index = -1; unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES]; vp9_prob comp_mode_p; int64_t best_intra_rd = INT64_MAX; int64_t best_inter_rd = INT64_MAX; - MB_PREDICTION_MODE best_intra_mode = DC_PRED; + PREDICTION_MODE best_intra_mode = DC_PRED; MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME; - INTERP_FILTER tmp_best_filter = SWITCHABLE; int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES]; int64_t dist_uv[TX_SIZES]; int skip_uv[TX_SIZES]; - MB_PREDICTION_MODE mode_uv[TX_SIZES]; + PREDICTION_MODE mode_uv[TX_SIZES]; int64_t mode_distortions[MB_MODE_COUNT] = {-1}; int intra_cost_penalty = 20 * 
vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q); const int bws = num_8x8_blocks_wide_lookup[bsize] / 2; @@ -3166,16 +2530,16 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int best_skip2 = 0; int mode_skip_mask = 0; int mode_skip_start = cpi->sf.mode_skip_start + 1; - const int *const rd_threshes = cpi->rd_threshes[segment_id][bsize]; - const int *const rd_thresh_freq_fact = cpi->rd_thresh_freq_fact[bsize]; + const int *const rd_threshes = rd_opt->threshes[segment_id][bsize]; + const int *const rd_thresh_freq_fact = rd_opt->thresh_freq_fact[bsize]; const int mode_search_skip_flags = cpi->sf.mode_search_skip_flags; const int intra_y_mode_mask = cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]]; - int disable_inter_mode_mask = cpi->sf.disable_inter_mode_mask[bsize]; - + int inter_mode_mask = cpi->sf.inter_mode_mask[bsize]; + vp9_zero(best_mbmode); x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; - estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp, + estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp, &comp_mode_p); for (i = 0; i < REFERENCE_MODES; ++i) @@ -3194,9 +2558,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { x->pred_mv_sad[ref_frame] = INT_MAX; if (cpi->ref_frame_flags & flag_list[ref_frame]) { - vp9_setup_buffer_inter(cpi, x, tile, - ref_frame, bsize, mi_row, mi_col, - frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); + setup_buffer_inter(cpi, x, tile, ref_frame, bsize, mi_row, mi_col, + frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); } frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; frame_mv[ZEROMV][ref_frame].as_int = 0; @@ -3232,13 +2595,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } } - // If the segment skip feature is enabled.... - // then do nothing if the current mode is not allowed.. 
- if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { - const int inter_non_zero_mode_mask = 0x1F7F7; - mode_skip_mask |= inter_non_zero_mode_mask; - } - // Disable this drop out case if the ref frame // segment level feature is enabled for this segment. This is to // prevent the possibility that we end up unable to pick any mode. @@ -3248,9 +2604,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // an unfiltered alternative. We allow near/nearest as well // because they may result in zero-zero MVs but be cheaper. if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) { - const int altref_zero_mask = + mode_skip_mask = ~((1 << THR_NEARESTA) | (1 << THR_NEARA) | (1 << THR_ZEROA)); - mode_skip_mask |= altref_zero_mask; if (frame_mv[NEARMV][ALTREF_FRAME].as_int != 0) mode_skip_mask |= (1 << THR_NEARA); if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != 0) @@ -3271,21 +2626,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } if (bsize > cpi->sf.max_intra_bsize) { - mode_skip_mask |= 0xFF30808; - } - - if (!x->in_active_map) { - int mode_index; - assert(cpi->ref_frame_flags & VP9_LAST_FLAG); - if (frame_mv[NEARESTMV][LAST_FRAME].as_int == 0) - mode_index = THR_NEARESTMV; - else if (frame_mv[NEARMV][LAST_FRAME].as_int == 0) - mode_index = THR_NEARMV; - else - mode_index = THR_ZEROMV; - mode_skip_mask = ~(1 << mode_index); - mode_skip_start = MAX_MODES; - disable_inter_mode_mask = 0; + const int all_intra_modes = (1 << THR_DC) | (1 << THR_TM) | + (1 << THR_H_PRED) | (1 << THR_V_PRED) | (1 << THR_D135_PRED) | + (1 << THR_D207_PRED) | (1 << THR_D153_PRED) | (1 << THR_D63_PRED) | + (1 << THR_D117_PRED) | (1 << THR_D45_PRED); + mode_skip_mask |= all_intra_modes; } for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) { @@ -3304,7 +2649,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // Look at the reference frame of the best mode so far and set the // skip mask to look at a subset of the 
remaining modes. - if (mode_index == mode_skip_start) { + if (mode_index == mode_skip_start && best_mode_index >= 0) { switch (vp9_mode_order[best_mode_index].ref_frame[0]) { case INTRA_FRAME: break; @@ -3320,27 +2665,76 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, case NONE: case MAX_REF_FRAMES: assert(0 && "Invalid Reference frame"); + break; } } if (mode_skip_mask & (1 << mode_index)) continue; // Test best rd so far against threshold for trying this mode. - if (best_rd < ((int64_t)rd_threshes[mode_index] * - rd_thresh_freq_fact[mode_index] >> 5) || - rd_threshes[mode_index] == INT_MAX) - continue; + if (rd_less_than_thresh(best_rd, rd_threshes[mode_index], + rd_thresh_freq_fact[mode_index])) + continue; this_mode = vp9_mode_order[mode_index].mode; ref_frame = vp9_mode_order[mode_index].ref_frame[0]; - if (ref_frame != INTRA_FRAME && - disable_inter_mode_mask & (1 << INTER_OFFSET(this_mode))) + if (ref_frame != INTRA_FRAME && !(inter_mode_mask & (1 << this_mode))) continue; second_ref_frame = vp9_mode_order[mode_index].ref_frame[1]; + if (cpi->sf.motion_field_mode_search) { + const int mi_width = MIN(num_8x8_blocks_wide_lookup[bsize], + tile->mi_col_end - mi_col); + const int mi_height = MIN(num_8x8_blocks_high_lookup[bsize], + tile->mi_row_end - mi_row); + const int bsl = mi_width_log2(bsize); + int cb_partition_search_ctrl = (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_video_frame)) & 0x1; + MB_MODE_INFO *ref_mbmi; + int const_motion = 1; + int skip_ref_frame = !cb_partition_search_ctrl; + MV_REFERENCE_FRAME rf = NONE; + int_mv ref_mv; + ref_mv.as_int = INVALID_MV; + + if ((mi_row - 1) >= tile->mi_row_start) { + ref_mv = xd->mi[-xd->mi_stride]->mbmi.mv[0]; + rf = xd->mi[-xd->mi_stride]->mbmi.ref_frame[0]; + for (i = 0; i < mi_width; ++i) { + ref_mbmi = &xd->mi[-xd->mi_stride + i]->mbmi; + const_motion &= (ref_mv.as_int == ref_mbmi->mv[0].as_int) && + (ref_frame == ref_mbmi->ref_frame[0]); + skip_ref_frame &= (rf == 
ref_mbmi->ref_frame[0]); + } + } + + if ((mi_col - 1) >= tile->mi_col_start) { + if (ref_mv.as_int == INVALID_MV) + ref_mv = xd->mi[-1]->mbmi.mv[0]; + if (rf == NONE) + rf = xd->mi[-1]->mbmi.ref_frame[0]; + for (i = 0; i < mi_height; ++i) { + ref_mbmi = &xd->mi[i * xd->mi_stride - 1]->mbmi; + const_motion &= (ref_mv.as_int == ref_mbmi->mv[0].as_int) && + (ref_frame == ref_mbmi->ref_frame[0]); + skip_ref_frame &= (rf == ref_mbmi->ref_frame[0]); + } + } + + if (skip_ref_frame && this_mode != NEARESTMV && this_mode != NEWMV) + if (rf > INTRA_FRAME) + if (ref_frame != rf) + continue; + + if (const_motion) + if (this_mode == NEARMV || this_mode == ZEROMV) + continue; + } + comp_pred = second_ref_frame > INTRA_FRAME; if (comp_pred) { if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) && + best_mode_index >=0 && vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME) continue; if ((mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) && @@ -3368,7 +2762,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // one of the neighboring directional modes if ((mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) && (this_mode >= D45_PRED && this_mode <= TM_PRED)) { - if (vp9_mode_order[best_mode_index].ref_frame[0] > INTRA_FRAME) + if (best_mode_index >= 0 && + vp9_mode_order[best_mode_index].ref_frame[0] > INTRA_FRAME) continue; } if (mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { @@ -3377,16 +2772,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } } } else { - if (x->in_active_map && - !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) - if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv, - disable_inter_mode_mask, this_mode, ref_frame, - second_ref_frame)) - continue; + const MV_REFERENCE_FRAME ref_frames[2] = {ref_frame, second_ref_frame}; + if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv, + inter_mode_mask, this_mode, ref_frames)) + continue; } mbmi->mode = this_mode; - mbmi->uv_mode = 
x->in_active_map ? DC_PRED : this_mode; + mbmi->uv_mode = DC_PRED; mbmi->ref_frame[0] = ref_frame; mbmi->ref_frame[1] = second_ref_frame; // Evaluate all sub-pel filters irrespective of whether we can use @@ -3406,21 +2799,16 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, for (i = 0; i < TX_MODES; ++i) tx_cache[i] = INT64_MAX; -#ifdef MODE_TEST_HIT_STATS - // TEST/DEBUG CODE - // Keep a rcord of the number of test hits at each size - cpi->mode_test_hits[bsize]++; -#endif - if (ref_frame == INTRA_FRAME) { TX_SIZE uv_tx; - intra_super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, + intra_super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, bsize, tx_cache, best_rd); if (rate_y == INT_MAX) continue; - uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize); + uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize, pd[1].subsampling_x, + pd[1].subsampling_y); if (rate_uv_intra[uv_tx] == INT_MAX) { choose_intra_uv_mode(cpi, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx], @@ -3432,18 +2820,17 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, skippable = skippable && skip_uv[uv_tx]; mbmi->uv_mode = mode_uv[uv_tx]; - rate2 = rate_y + x->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx]; + rate2 = rate_y + cpi->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx]; if (this_mode != DC_PRED && this_mode != TM_PRED) rate2 += intra_cost_penalty; distortion2 = distortion_y + distortion_uv; } else { - this_rd = handle_inter_mode(cpi, x, tile, bsize, + this_rd = handle_inter_mode(cpi, x, bsize, tx_cache, &rate2, &distortion2, &skippable, &rate_y, &distortion_y, &rate_uv, &distortion_uv, - &mode_excluded, &disable_skip, - &tmp_best_filter, frame_mv, + &disable_skip, frame_mv, mi_row, mi_col, single_newmv, &total_sse, best_rd); if (this_rd == INT64_MAX) @@ -3464,31 +2851,20 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } if (!disable_skip) { - // Test for the condition where skip block will be activated - 
// because there are no non zero coefficients and make any - // necessary adjustment for rate. Ignore if skip is coded at - // segment level as the cost wont have been added in. - // Is Mb level skip allowed (i.e. not coded at segment level). - const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id, - SEG_LVL_SKIP); - if (skippable) { + vp9_prob skip_prob = vp9_get_skip_prob(cm, xd); + // Back out the coefficient coding costs rate2 -= (rate_y + rate_uv); // for best yrd calculation rate_uv = 0; - if (mb_skip_allowed) { - int prob_skip_cost; - - // Cost the skip mb case - vp9_prob skip_prob = vp9_get_skip_prob(cm, xd); - if (skip_prob) { - prob_skip_cost = vp9_cost_bit(skip_prob, 1); - rate2 += prob_skip_cost; - } + // Cost the skip mb case + if (skip_prob) { + int prob_skip_cost = vp9_cost_bit(skip_prob, 1); + rate2 += prob_skip_cost; } - } else if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) { + } else if (ref_frame != INTRA_FRAME && !xd->lossless) { if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) < RDCOST(x->rdmult, x->rddiv, 0, total_sse)) { // Add in the cost of the no skip flag. @@ -3503,7 +2879,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, rate_uv = 0; this_skip2 = 1; } - } else if (mb_skip_allowed) { + } else { // Add in the cost of the no skip flag. 
rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0); } @@ -3557,8 +2933,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, best_rd = this_rd; best_mbmode = *mbmi; best_skip2 = this_skip2; - if (!x->select_txfm_size) - swap_block_ptr(x, ctx, max_plane); + if (!x->select_tx_size) + swap_block_ptr(x, ctx, 1, 0, 0, max_plane); vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size], sizeof(uint8_t) * ctx->num_4x4_blk); @@ -3610,21 +2986,21 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, /* keep record of best filter type */ if (!mode_excluded && cm->interp_filter != BILINEAR) { - int64_t ref = cpi->rd_filter_cache[cm->interp_filter == SWITCHABLE ? + int64_t ref = rd_opt->filter_cache[cm->interp_filter == SWITCHABLE ? SWITCHABLE_FILTERS : cm->interp_filter]; for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { int64_t adj_rd; if (ref == INT64_MAX) adj_rd = 0; - else if (cpi->rd_filter_cache[i] == INT64_MAX) + else if (rd_opt->filter_cache[i] == INT64_MAX) // when early termination is triggered, the encoder does not have // access to the rate-distortion cost. it only knows that the cost // should be above the maximum valid value. hence it takes the known // maximum plus an arbitrary constant as the rate-distortion cost. - adj_rd = cpi->mask_filter_rd - ref + 10; + adj_rd = rd_opt->mask_filter - ref + 10; else - adj_rd = cpi->rd_filter_cache[i] - ref; + adj_rd = rd_opt->filter_cache[i] - ref; adj_rd += this_rd; best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd); @@ -3656,7 +3032,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, break; } - if (best_rd >= best_rd_so_far) + if (best_mode_index < 0 || best_rd >= best_rd_so_far) return INT64_MAX; // If we used an estimate for the uv intra rd in the loop above... 
@@ -3665,7 +3041,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME) { TX_SIZE uv_tx_size; *mbmi = best_mbmode; - uv_tx_size = get_uv_tx_size(mbmi); + uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]); rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size], &rate_uv_tokenonly[uv_tx_size], &dist_uv[uv_tx_size], @@ -3679,23 +3055,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, (cm->interp_filter == best_mbmode.interp_filter) || !is_inter_block(&best_mbmode)); - // Updating rd_thresh_freq_fact[] here means that the different - // partition/block sizes are handled independently based on the best - // choice for the current partition. It may well be better to keep a scaled - // best rd so far value and update rd_thresh_freq_fact based on the mode/size - // combination that wins out. - if (cpi->sf.adaptive_rd_thresh) { - for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) { - int *const fact = &cpi->rd_thresh_freq_fact[bsize][mode_index]; - - if (mode_index == best_mode_index) { - *fact -= (*fact >> 3); - } else { - *fact = MIN(*fact + RD_THRESH_INC, - cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT); - } - } - } + update_rd_thresh_fact(cpi, bsize, best_mode_index); // macroblock modes *mbmi = best_mbmode; @@ -3728,26 +3088,117 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, vp9_zero(best_tx_diff); } - if (!x->in_active_map) { - assert(mbmi->ref_frame[0] == LAST_FRAME); - assert(mbmi->ref_frame[1] == NONE); - assert(mbmi->mode == NEARESTMV || - mbmi->mode == NEARMV || - mbmi->mode == ZEROMV); - assert(frame_mv[mbmi->mode][LAST_FRAME].as_int == 0); - assert(mbmi->mode == mbmi->uv_mode); - } - set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); store_coding_context(x, ctx, best_mode_index, - &mbmi->ref_mvs[mbmi->ref_frame[0]][0], - &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 
0 : - mbmi->ref_frame[1]][0], best_pred_diff, best_tx_diff, best_filter_diff); return best_rd; } +int64_t vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x, + int *returnrate, + int64_t *returndistortion, + BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far) { + VP9_COMMON *const cm = &cpi->common; + RD_OPT *const rd_opt = &cpi->rd; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + unsigned char segment_id = mbmi->segment_id; + const int comp_pred = 0; + int i; + int64_t best_tx_diff[TX_MODES]; + int64_t best_pred_diff[REFERENCE_MODES]; + int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]; + unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES]; + vp9_prob comp_mode_p; + INTERP_FILTER best_filter = SWITCHABLE; + int64_t this_rd = INT64_MAX; + int rate2 = 0; + const int64_t distortion2 = 0; + + x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; + + estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp, + &comp_mode_p); + + for (i = 0; i < MAX_REF_FRAMES; ++i) + x->pred_sse[i] = INT_MAX; + for (i = LAST_FRAME; i < MAX_REF_FRAMES; ++i) + x->pred_mv_sad[i] = INT_MAX; + + *returnrate = INT_MAX; + + assert(vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)); + + mbmi->mode = ZEROMV; + mbmi->uv_mode = DC_PRED; + mbmi->ref_frame[0] = LAST_FRAME; + mbmi->ref_frame[1] = NONE; + mbmi->mv[0].as_int = 0; + x->skip = 1; + + // Search for best switchable filter by checking the variance of + // pred error irrespective of whether the filter will be used + rd_opt->mask_filter = 0; + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) + rd_opt->filter_cache[i] = INT64_MAX; + + if (cm->interp_filter != BILINEAR) { + best_filter = EIGHTTAP; + if (cm->interp_filter == SWITCHABLE && + x->source_variance >= cpi->sf.disable_filter_search_var_thresh) { + int rs; + int best_rs = INT_MAX; + for (i = 0; i < SWITCHABLE_FILTERS; ++i) { + 
mbmi->interp_filter = i; + rs = vp9_get_switchable_rate(cpi); + if (rs < best_rs) { + best_rs = rs; + best_filter = mbmi->interp_filter; + } + } + } + } + // Set the appropriate filter + if (cm->interp_filter == SWITCHABLE) { + mbmi->interp_filter = best_filter; + rate2 += vp9_get_switchable_rate(cpi); + } else { + mbmi->interp_filter = cm->interp_filter; + } + + if (cm->reference_mode == REFERENCE_MODE_SELECT) + rate2 += vp9_cost_bit(comp_mode_p, comp_pred); + + // Estimate the reference frame signaling cost and add it + // to the rolling cost variable. + rate2 += ref_costs_single[LAST_FRAME]; + this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + + *returnrate = rate2; + *returndistortion = distortion2; + + if (this_rd >= best_rd_so_far) + return INT64_MAX; + + assert((cm->interp_filter == SWITCHABLE) || + (cm->interp_filter == mbmi->interp_filter)); + + update_rd_thresh_fact(cpi, bsize, THR_ZEROMV); + + vp9_zero(best_pred_diff); + vp9_zero(best_filter_diff); + vp9_zero(best_tx_diff); + + if (!x->select_tx_size) + swap_block_ptr(x, ctx, 1, 0, 0, MAX_MB_PLANE); + store_coding_context(x, ctx, THR_ZEROMV, + best_pred_diff, best_tx_diff, best_filter_diff); + + return this_rd; +} int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, const TileInfo *const tile, @@ -3757,10 +3208,11 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far) { - VP9_COMMON *cm = &cpi->common; - MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - const struct segmentation *seg = &cm->seg; + VP9_COMMON *const cm = &cpi->common; + RD_OPT *const rd_opt = &cpi->rd; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const struct segmentation *const seg = &cm->seg; MV_REFERENCE_FRAME ref_frame, second_ref_frame; unsigned char segment_id = mbmi->segment_id; int comp_pred, i; @@ -3770,32 +3222,31 @@ int64_t 
vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, VP9_ALT_FLAG }; int64_t best_rd = best_rd_so_far; int64_t best_yrd = best_rd_so_far; // FIXME(rbultje) more precise - int64_t best_tx_rd[TX_MODES]; - int64_t best_tx_diff[TX_MODES]; + static const int64_t best_tx_diff[TX_MODES] = { 0 }; int64_t best_pred_diff[REFERENCE_MODES]; int64_t best_pred_rd[REFERENCE_MODES]; int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS]; int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]; - MB_MODE_INFO best_mbmode = { 0 }; - int mode_index, best_mode_index = 0; + MB_MODE_INFO best_mbmode; + int ref_index, best_ref_index = 0; unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES]; vp9_prob comp_mode_p; int64_t best_inter_rd = INT64_MAX; MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME; INTERP_FILTER tmp_best_filter = SWITCHABLE; - int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES]; - int64_t dist_uv[TX_SIZES]; - int skip_uv[TX_SIZES]; - MB_PREDICTION_MODE mode_uv[TX_SIZES] = { 0 }; + int rate_uv_intra, rate_uv_tokenonly; + int64_t dist_uv; + int skip_uv; + PREDICTION_MODE mode_uv = DC_PRED; int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q); int_mv seg_mvs[4][MAX_REF_FRAMES]; b_mode_info best_bmodes[4]; int best_skip2 = 0; - int ref_frame_mask = 0; int mode_skip_mask = 0; x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4); + vp9_zero(best_mbmode); for (i = 0; i < 4; i++) { int j; @@ -3803,23 +3254,20 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, seg_mvs[i][j].as_int = INVALID_MV; } - estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp, + estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp, &comp_mode_p); for (i = 0; i < REFERENCE_MODES; ++i) best_pred_rd[i] = INT64_MAX; - for (i = 0; i < TX_MODES; i++) - best_tx_rd[i] = INT64_MAX; for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) 
best_filter_rd[i] = INT64_MAX; - for (i = 0; i < TX_SIZES; i++) - rate_uv_intra[i] = INT_MAX; + rate_uv_intra = INT_MAX; *returnrate = INT_MAX; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { if (cpi->ref_frame_flags & flag_list[ref_frame]) { - vp9_setup_buffer_inter(cpi, x, tile, + setup_buffer_inter(cpi, x, tile, ref_frame, bsize, mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); @@ -3828,18 +3276,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, frame_mv[ZEROMV][ref_frame].as_int = 0; } - for (ref_frame = LAST_FRAME; - ref_frame <= ALTREF_FRAME && cpi->sf.reference_masking; ++ref_frame) { - int i; - for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { - if ((x->pred_mv_sad[ref_frame] >> 1) > x->pred_mv_sad[i]) { - ref_frame_mask |= (1 << ref_frame); - break; - } - } - } - - for (mode_index = 0; mode_index < MAX_REFS; ++mode_index) { + for (ref_index = 0; ref_index < MAX_REFS; ++ref_index) { int mode_excluded = 0; int64_t this_rd = INT64_MAX; int disable_skip = 0; @@ -3847,24 +3284,19 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int rate2 = 0, rate_y = 0, rate_uv = 0; int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0; int skippable = 0; - int64_t tx_cache[TX_MODES]; int i; int this_skip2 = 0; int64_t total_sse = INT_MAX; int early_term = 0; - for (i = 0; i < TX_MODES; ++i) - tx_cache[i] = INT64_MAX; - - x->skip = 0; - ref_frame = vp9_ref_order[mode_index].ref_frame[0]; - second_ref_frame = vp9_ref_order[mode_index].ref_frame[1]; + ref_frame = vp9_ref_order[ref_index].ref_frame[0]; + second_ref_frame = vp9_ref_order[ref_index].ref_frame[1]; // Look at the reference frame of the best mode so far and set the // skip mask to look at a subset of the remaining modes. 
- if (mode_index > 2 && cpi->sf.mode_skip_start < MAX_MODES) { - if (mode_index == 3) { - switch (vp9_ref_order[best_mode_index].ref_frame[0]) { + if (ref_index > 2 && cpi->sf.mode_skip_start < MAX_MODES) { + if (ref_index == 3) { + switch (vp9_ref_order[best_ref_index].ref_frame[0]) { case INTRA_FRAME: mode_skip_mask = 0; break; @@ -3880,84 +3312,55 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, case NONE: case MAX_REF_FRAMES: assert(0 && "Invalid Reference frame"); + break; } } - if (mode_skip_mask & (1 << mode_index)) + if (mode_skip_mask & (1 << ref_index)) continue; } // Test best rd so far against threshold for trying this mode. - if ((best_rd < - ((int64_t)cpi->rd_thresh_sub8x8[segment_id][bsize][mode_index] * - cpi->rd_thresh_freq_sub8x8[bsize][mode_index] >> 5)) || - cpi->rd_thresh_sub8x8[segment_id][bsize][mode_index] == INT_MAX) - continue; - - // Do not allow compound prediction if the segment level reference - // frame feature is in use as in this case there can only be one reference. 
- if ((second_ref_frame > INTRA_FRAME) && - vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) + if (rd_less_than_thresh(best_rd, + rd_opt->threshes[segment_id][bsize][ref_index], + rd_opt->thresh_freq_fact[bsize][ref_index])) continue; - mbmi->ref_frame[0] = ref_frame; - mbmi->ref_frame[1] = second_ref_frame; - - if (!(ref_frame == INTRA_FRAME - || (cpi->ref_frame_flags & flag_list[ref_frame]))) { - continue; - } - if (!(second_ref_frame == NONE - || (cpi->ref_frame_flags & flag_list[second_ref_frame]))) { + if (ref_frame > INTRA_FRAME && + !(cpi->ref_frame_flags & flag_list[ref_frame])) { continue; } comp_pred = second_ref_frame > INTRA_FRAME; if (comp_pred) { - if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) - if (vp9_ref_order[best_mode_index].ref_frame[0] == INTRA_FRAME) - continue; - if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) - if (ref_frame != best_inter_ref_frame && - second_ref_frame != best_inter_ref_frame) - continue; + if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) + continue; + // Do not allow compound prediction if the segment level reference frame + // feature is in use as in this case there can only be one reference. + if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) + continue; + if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) && + vp9_ref_order[best_ref_index].ref_frame[0] == INTRA_FRAME) + continue; + if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) && + ref_frame != best_inter_ref_frame && + second_ref_frame != best_inter_ref_frame) + continue; } // TODO(jingning, jkoleszar): scaling reference frame not supported for // sub8x8 blocks. 
- if (ref_frame > 0 && vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf)) + if (ref_frame > INTRA_FRAME && + vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf)) continue; - if (second_ref_frame > 0 && + if (second_ref_frame > INTRA_FRAME && vp9_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf)) continue; - set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); - mbmi->uv_mode = DC_PRED; - - // Evaluate all sub-pel filters irrespective of whether we can use - // them for this frame. - mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP - : cm->interp_filter; - - if (comp_pred) { - if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) - continue; - - mode_excluded = mode_excluded ? mode_excluded - : cm->reference_mode == SINGLE_REFERENCE; - } else { - if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME) { - mode_excluded = mode_excluded ? - mode_excluded : cm->reference_mode == COMPOUND_REFERENCE; - } - } - - // Select prediction reference frames. - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; - if (comp_pred) - xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; - } + if (comp_pred) + mode_excluded = cm->reference_mode == SINGLE_REFERENCE; + else if (ref_frame != INTRA_FRAME) + mode_excluded = cm->reference_mode == COMPOUND_REFERENCE; // If the segment reference frame feature is enabled.... // then do nothing if the current ref frame is not allowed.. @@ -3965,11 +3368,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { continue; - // If the segment skip feature is enabled.... - // then do nothing if the current mode is not allowed.. - } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) && - ref_frame != INTRA_FRAME) { - continue; // Disable this drop out case if the ref frame // segment level feature is enabled for this segment. 
This is to // prevent the possibility that we end up unable to pick any mode. @@ -3983,15 +3381,26 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, continue; } -#ifdef MODE_TEST_HIT_STATS - // TEST/DEBUG CODE - // Keep a rcord of the number of test hits at each size - cpi->mode_test_hits[bsize]++; -#endif + mbmi->tx_size = TX_4X4; + mbmi->uv_mode = DC_PRED; + mbmi->ref_frame[0] = ref_frame; + mbmi->ref_frame[1] = second_ref_frame; + // Evaluate all sub-pel filters irrespective of whether we can use + // them for this frame. + mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP + : cm->interp_filter; + x->skip = 0; + set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); + + // Select prediction reference frames. + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; + if (comp_pred) + xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; + } if (ref_frame == INTRA_FRAME) { int rate; - mbmi->tx_size = TX_4X4; if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y, &distortion_y, best_rd) >= best_rd) continue; @@ -3999,21 +3408,18 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, rate2 += intra_cost_penalty; distortion2 += distortion_y; - if (rate_uv_intra[TX_4X4] == INT_MAX) { + if (rate_uv_intra == INT_MAX) { choose_intra_uv_mode(cpi, ctx, bsize, TX_4X4, - &rate_uv_intra[TX_4X4], - &rate_uv_tokenonly[TX_4X4], - &dist_uv[TX_4X4], &skip_uv[TX_4X4], - &mode_uv[TX_4X4]); + &rate_uv_intra, + &rate_uv_tokenonly, + &dist_uv, &skip_uv, + &mode_uv); } - rate2 += rate_uv_intra[TX_4X4]; - rate_uv = rate_uv_tokenonly[TX_4X4]; - distortion2 += dist_uv[TX_4X4]; - distortion_uv = dist_uv[TX_4X4]; - mbmi->uv_mode = mode_uv[TX_4X4]; - tx_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); - for (i = 0; i < TX_MODES; ++i) - tx_cache[i] = tx_cache[ONLY_4X4]; + rate2 += rate_uv_intra; + rate_uv = rate_uv_tokenonly; + distortion2 += dist_uv; + distortion_uv = dist_uv; + mbmi->uv_mode = 
mode_uv; } else { int rate; int64_t distortion; @@ -4032,20 +3438,17 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int uv_skippable; this_rd_thresh = (ref_frame == LAST_FRAME) ? - cpi->rd_thresh_sub8x8[segment_id][bsize][THR_LAST] : - cpi->rd_thresh_sub8x8[segment_id][bsize][THR_ALTR]; + rd_opt->threshes[segment_id][bsize][THR_LAST] : + rd_opt->threshes[segment_id][bsize][THR_ALTR]; this_rd_thresh = (ref_frame == GOLDEN_FRAME) ? - cpi->rd_thresh_sub8x8[segment_id][bsize][THR_GOLD] : this_rd_thresh; - xd->mi[0]->mbmi.tx_size = TX_4X4; - - cpi->mask_filter_rd = 0; + rd_opt->threshes[segment_id][bsize][THR_GOLD] : this_rd_thresh; + rd_opt->mask_filter = 0; for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) - cpi->rd_filter_cache[i] = INT64_MAX; + rd_opt->filter_cache[i] = INT64_MAX; if (cm->interp_filter != BILINEAR) { tmp_best_filter = EIGHTTAP; - if (x->source_variance < - cpi->sf.disable_filter_search_var_thresh) { + if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) { tmp_best_filter = EIGHTTAP; } else if (cpi->sf.adaptive_pred_interp_filter == 1 && ctx->pred_interp_filter < SWITCHABLE) { @@ -4060,28 +3463,27 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int newbest, rs; int64_t rs_rd; mbmi->interp_filter = switchable_filter_index; - tmp_rd = rd_pick_best_mbsegmentation(cpi, x, tile, - &mbmi->ref_mvs[ref_frame][0], - second_ref, - best_yrd, - &rate, &rate_y, &distortion, - &skippable, &total_sse, - (int)this_rd_thresh, seg_mvs, - bsi, switchable_filter_index, - mi_row, mi_col); + tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile, + &mbmi->ref_mvs[ref_frame][0], + second_ref, best_yrd, &rate, + &rate_y, &distortion, + &skippable, &total_sse, + (int) this_rd_thresh, seg_mvs, + bsi, switchable_filter_index, + mi_row, mi_col); if (tmp_rd == INT64_MAX) continue; - rs = vp9_get_switchable_rate(x); + rs = vp9_get_switchable_rate(cpi); rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0); - 
cpi->rd_filter_cache[switchable_filter_index] = tmp_rd; - cpi->rd_filter_cache[SWITCHABLE_FILTERS] = - MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS], + rd_opt->filter_cache[switchable_filter_index] = tmp_rd; + rd_opt->filter_cache[SWITCHABLE_FILTERS] = + MIN(rd_opt->filter_cache[SWITCHABLE_FILTERS], tmp_rd + rs_rd); if (cm->interp_filter == SWITCHABLE) tmp_rd += rs_rd; - cpi->mask_filter_rd = MAX(cpi->mask_filter_rd, tmp_rd); + rd_opt->mask_filter = MAX(rd_opt->mask_filter, tmp_rd); newbest = (tmp_rd < tmp_best_rd); if (newbest) { @@ -4127,15 +3529,12 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, if (!pred_exists) { // Handles the special case when a filter that is not in the // switchable list (bilinear, 6-tap) is indicated at the frame level - tmp_rd = rd_pick_best_mbsegmentation(cpi, x, tile, - &mbmi->ref_mvs[ref_frame][0], - second_ref, - best_yrd, - &rate, &rate_y, &distortion, - &skippable, &total_sse, - (int)this_rd_thresh, seg_mvs, - bsi, 0, - mi_row, mi_col); + tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile, + &mbmi->ref_mvs[ref_frame][0], + second_ref, best_yrd, &rate, &rate_y, + &distortion, &skippable, &total_sse, + (int) this_rd_thresh, seg_mvs, bsi, 0, + mi_row, mi_col); if (tmp_rd == INT64_MAX) continue; } else { @@ -4153,7 +3552,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, distortion2 += distortion; if (cm->interp_filter == SWITCHABLE) - rate2 += vp9_get_switchable_rate(x); + rate2 += vp9_get_switchable_rate(cpi); if (!mode_excluded) mode_excluded = comp_pred ? 
cm->reference_mode == SINGLE_REFERENCE @@ -4178,10 +3577,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, distortion2 += distortion_uv; skippable = skippable && uv_skippable; total_sse += uv_sse; - - tx_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); - for (i = 0; i < TX_MODES; ++i) - tx_cache[i] = tx_cache[ONLY_4X4]; } } @@ -4197,15 +3592,10 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, } if (!disable_skip) { - // Test for the condition where skip block will be activated - // because there are no non zero coefficients and make any - // necessary adjustment for rate. Ignore if skip is coded at - // segment level as the cost wont have been added in. - // Is Mb level skip allowed (i.e. not coded at segment level). - const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id, - SEG_LVL_SKIP); - - if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) { + // Skip is never coded at the segment level for sub8x8 blocks and instead + // always coded in the bitstream at the mode info level. + + if (ref_frame != INTRA_FRAME && !xd->lossless) { if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) < RDCOST(x->rdmult, x->rddiv, 0, total_sse)) { // Add in the cost of the no skip flag. @@ -4220,7 +3610,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, rate_uv = 0; this_skip2 = 1; } - } else if (mb_skip_allowed) { + } else { // Add in the cost of the no skip flag. 
rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0); } @@ -4230,8 +3620,8 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, } // Keep record of best inter rd with single reference - if (is_inter_block(&xd->mi[0]->mbmi) && - !has_second_ref(&xd->mi[0]->mbmi) && + if (is_inter_block(mbmi) && + !has_second_ref(mbmi) && !mode_excluded && this_rd < best_inter_rd) { best_inter_rd = this_rd; @@ -4250,7 +3640,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, if (!mode_excluded) { int max_plane = MAX_MB_PLANE; // Note index of best mode so far - best_mode_index = mode_index; + best_ref_index = ref_index; if (ref_frame == INTRA_FRAME) { /* required for left and above block mv */ @@ -4265,9 +3655,9 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv); best_mbmode = *mbmi; best_skip2 = this_skip2; - if (!x->select_txfm_size) - swap_block_ptr(x, ctx, max_plane); - vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size], + if (!x->select_tx_size) + swap_block_ptr(x, ctx, 1, 0, 0, max_plane); + vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[TX_4X4], sizeof(uint8_t) * ctx->num_4x4_blk); for (i = 0; i < 4; i++) @@ -4276,7 +3666,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, // TODO(debargha): enhance this test with a better distortion prediction // based on qp, activity mask and history if ((cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) && - (mode_index > MIN_EARLY_TERM_INDEX)) { + (ref_index > MIN_EARLY_TERM_INDEX)) { const int qstep = xd->plane[0].dequant[1]; // TODO(debargha): Enhance this by specializing for each mode_index int scale = 4; @@ -4307,11 +3697,9 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2); hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2); - if (second_ref_frame <= INTRA_FRAME && - single_rd < 
best_pred_rd[SINGLE_REFERENCE]) { + if (!comp_pred && single_rd < best_pred_rd[SINGLE_REFERENCE]) { best_pred_rd[SINGLE_REFERENCE] = single_rd; - } else if (second_ref_frame > INTRA_FRAME && - single_rd < best_pred_rd[COMPOUND_REFERENCE]) { + } else if (comp_pred && single_rd < best_pred_rd[COMPOUND_REFERENCE]) { best_pred_rd[COMPOUND_REFERENCE] = single_rd; } if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT]) @@ -4321,47 +3709,26 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, /* keep record of best filter type */ if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME && cm->interp_filter != BILINEAR) { - int64_t ref = cpi->rd_filter_cache[cm->interp_filter == SWITCHABLE ? + int64_t ref = rd_opt->filter_cache[cm->interp_filter == SWITCHABLE ? SWITCHABLE_FILTERS : cm->interp_filter]; int64_t adj_rd; for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { if (ref == INT64_MAX) adj_rd = 0; - else if (cpi->rd_filter_cache[i] == INT64_MAX) + else if (rd_opt->filter_cache[i] == INT64_MAX) // when early termination is triggered, the encoder does not have // access to the rate-distortion cost. it only knows that the cost // should be above the maximum valid value. hence it takes the known // maximum plus an arbitrary constant as the rate-distortion cost. 
- adj_rd = cpi->mask_filter_rd - ref + 10; + adj_rd = rd_opt->mask_filter - ref + 10; else - adj_rd = cpi->rd_filter_cache[i] - ref; + adj_rd = rd_opt->filter_cache[i] - ref; adj_rd += this_rd; best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd); } } - /* keep record of best txfm size */ - if (bsize < BLOCK_32X32) { - if (bsize < BLOCK_16X16) { - tx_cache[ALLOW_8X8] = tx_cache[ONLY_4X4]; - tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8]; - } - tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16]; - } - if (!mode_excluded && this_rd != INT64_MAX) { - for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) { - int64_t adj_rd = INT64_MAX; - if (ref_frame > INTRA_FRAME) - adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode]; - else - adj_rd = this_rd; - - if (adj_rd < best_tx_rd[i]) - best_tx_rd[i] = adj_rd; - } - } - if (early_term) break; @@ -4375,19 +3742,17 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, // If we used an estimate for the uv intra rd in the loop above... if (cpi->sf.use_uv_intra_rd_estimate) { // Do Intra UV best rd mode selection if best mode choice above was intra. 
- if (vp9_ref_order[best_mode_index].ref_frame[0] == INTRA_FRAME) { - TX_SIZE uv_tx_size; + if (vp9_ref_order[best_ref_index].ref_frame[0] == INTRA_FRAME) { *mbmi = best_mbmode; - uv_tx_size = get_uv_tx_size(mbmi); - rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size], - &rate_uv_tokenonly[uv_tx_size], - &dist_uv[uv_tx_size], - &skip_uv[uv_tx_size], - BLOCK_8X8, uv_tx_size); + rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra, + &rate_uv_tokenonly, + &dist_uv, + &skip_uv, + BLOCK_8X8, TX_4X4); } } - if (best_rd == INT64_MAX && bsize < BLOCK_8X8) { + if (best_rd == INT64_MAX) { *returnrate = INT_MAX; *returndistortion = INT64_MAX; return best_rd; @@ -4397,23 +3762,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, (cm->interp_filter == best_mbmode.interp_filter) || !is_inter_block(&best_mbmode)); - // Updating rd_thresh_freq_fact[] here means that the different - // partition/block sizes are handled independently based on the best - // choice for the current partition. It may well be better to keep a scaled - // best rd so far value and update rd_thresh_freq_fact based on the mode/size - // combination that wins out. 
- if (cpi->sf.adaptive_rd_thresh) { - for (mode_index = 0; mode_index < MAX_REFS; ++mode_index) { - int *const fact = &cpi->rd_thresh_freq_sub8x8[bsize][mode_index]; - - if (mode_index == best_mode_index) { - *fact -= (*fact >> 3); - } else { - *fact = MIN(*fact + RD_THRESH_INC, - cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT); - } - } - } + update_rd_thresh_fact(cpi, bsize, best_ref_index); // macroblock modes *mbmi = best_mbmode; @@ -4449,22 +3798,8 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, vp9_zero(best_filter_diff); } - if (!x->skip) { - for (i = 0; i < TX_MODES; i++) { - if (best_tx_rd[i] == INT64_MAX) - best_tx_diff[i] = 0; - else - best_tx_diff[i] = best_rd - best_tx_rd[i]; - } - } else { - vp9_zero(best_tx_diff); - } - set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); - store_coding_context(x, ctx, best_mode_index, - &mbmi->ref_mvs[mbmi->ref_frame[0]][0], - &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 : - mbmi->ref_frame[1]][0], + store_coding_context(x, ctx, best_ref_index, best_pred_diff, best_tx_diff, best_filter_diff); return best_rd; diff --git a/libvpx/vp9/encoder/vp9_rdopt.h b/libvpx/vp9/encoder/vp9_rdopt.h index a01dbd4d3..52c603fb6 100644 --- a/libvpx/vp9/encoder/vp9_rdopt.h +++ b/libvpx/vp9/encoder/vp9_rdopt.h @@ -11,54 +11,24 @@ #ifndef VP9_ENCODER_VP9_RDOPT_H_ #define VP9_ENCODER_VP9_RDOPT_H_ -#include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/common/vp9_blockd.h" + +#include "vp9/encoder/vp9_block.h" +#include "vp9/encoder/vp9_context_tree.h" #ifdef __cplusplus extern "C" { #endif -#define RDDIV_BITS 7 - -#define RDCOST(RM, DM, R, D) \ - (((128 + ((int64_t)R) * (RM)) >> 8) + (D << DM)) -#define QIDX_SKIP_THRESH 115 - -#define MV_COST_WEIGHT 108 -#define MV_COST_WEIGHT_SUB 120 - -#define INVALID_MV 0x80008000 - struct TileInfo; +struct VP9_COMP; +struct macroblock; -int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex); - -void vp9_initialize_rd_consts(VP9_COMP *cpi); - -void 
vp9_initialize_me_consts(VP9_COMP *cpi, int qindex); - -void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n, - unsigned int qstep, int *rate, - int64_t *dist); - -int vp9_get_switchable_rate(const MACROBLOCK *x); - -void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, - const TileInfo *const tile, - MV_REFERENCE_FRAME ref_frame, - BLOCK_SIZE block_size, - int mi_row, int mi_col, - int_mv frame_nearest_mv[MAX_REF_FRAMES], - int_mv frame_near_mv[MAX_REF_FRAMES], - struct buf_2d yv12_mb[4][MAX_MB_PLANE]); - -const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi, - int ref_frame); - -void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, +void vp9_rd_pick_intra_mode_sb(struct VP9_COMP *cpi, struct macroblock *x, int *r, int64_t *d, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd); -int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, +int64_t vp9_rd_pick_inter_mode_sb(struct VP9_COMP *cpi, struct macroblock *x, const struct TileInfo *const tile, int mi_row, int mi_col, int *returnrate, @@ -67,7 +37,16 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far); -int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, +int64_t vp9_rd_pick_inter_mode_sb_seg_skip(struct VP9_COMP *cpi, + struct macroblock *x, + int *returnrate, + int64_t *returndistortion, + BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far); + +int64_t vp9_rd_pick_inter_mode_sub8x8(struct VP9_COMP *cpi, + struct macroblock *x, const struct TileInfo *const tile, int mi_row, int mi_col, int *returnrate, @@ -76,13 +55,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far); -void vp9_init_me_luts(); - -void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, - const struct macroblockd_plane *pd, - ENTROPY_CONTEXT t_above[16], - ENTROPY_CONTEXT t_left[16]); - #ifdef __cplusplus } // extern 
"C" #endif diff --git a/libvpx/vp9/encoder/vp9_sad.c b/libvpx/vp9/encoder/vp9_sad.c index 9d8da0da4..d06263676 100644 --- a/libvpx/vp9/encoder/vp9_sad.c +++ b/libvpx/vp9/encoder/vp9_sad.c @@ -33,292 +33,101 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride, return sad; } -#define sad_mxn_func(m, n) \ -unsigned int vp9_sad##m##x##n##_c(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int max_sad) { \ - return sad(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ +#define sadMxN(m, n) \ +unsigned int vp9_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return sad(src, src_stride, ref, ref_stride, m, n); \ } \ -unsigned int vp9_sad##m##x##n##_avg_c(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred, \ - unsigned int max_sad) { \ +unsigned int vp9_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ uint8_t comp_pred[m * n]; \ - vp9_comp_avg_pred(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \ - return sad(src_ptr, src_stride, comp_pred, m, m, n); \ -} - -sad_mxn_func(64, 64) -sad_mxn_func(64, 32) -sad_mxn_func(32, 64) -sad_mxn_func(32, 32) -sad_mxn_func(32, 16) -sad_mxn_func(16, 32) -sad_mxn_func(16, 16) -sad_mxn_func(16, 8) -sad_mxn_func(8, 16) -sad_mxn_func(8, 8) -sad_mxn_func(8, 4) -sad_mxn_func(4, 8) -sad_mxn_func(4, 4) - -void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, - const uint8_t* const ref_ptr[], int ref_stride, - unsigned int *sad_array) { - int i; - for (i = 0; i < 4; ++i) - sad_array[i] = vp9_sad64x32(src_ptr, src_stride, ref_ptr[i], ref_stride, - 0x7fffffff); -} - -void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, - const uint8_t* const ref_ptr[], int ref_stride, - unsigned int *sad_array) { - int i; - for (i = 0; i < 4; ++i) - sad_array[i] = vp9_sad32x64(src_ptr, 
src_stride, ref_ptr[i], ref_stride, - 0x7fffffff); -} - -void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, - const uint8_t* const ref_ptr[], int ref_stride, - unsigned int *sad_array) { - int i; - for (i = 0; i < 4; ++i) - sad_array[i] = vp9_sad32x16(src_ptr, src_stride, ref_ptr[i], ref_stride, - 0x7fffffff); -} - -void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, - const uint8_t* const ref_ptr[], int ref_stride, - unsigned int *sad_array) { - int i; - for (i = 0; i < 4; ++i) - sad_array[i] = vp9_sad16x32(src_ptr, src_stride, ref_ptr[i], ref_stride, - 0x7fffffff); -} - -void vp9_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sad_array) { - int i; - for (i = 0; i < 3; ++i) - sad_array[i] = vp9_sad64x64(src_ptr, src_stride, ref_ptr + i, ref_stride, - 0x7fffffff); -} - -void vp9_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sad_array) { - int i; - for (i = 0; i < 3; ++i) - sad_array[i] = vp9_sad32x32(src_ptr, src_stride, ref_ptr + i, ref_stride, - 0x7fffffff); -} - -void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sad_array) { - int i; - for (i = 0; i < 8; ++i) - sad_array[i] = vp9_sad64x64(src_ptr, src_stride, ref_ptr + i, ref_stride, - 0x7fffffff); -} - -void vp9_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sad_array) { - int i; - for (i = 0; i < 8; ++i) - sad_array[i] = vp9_sad32x32(src_ptr, src_stride, ref_ptr + i, ref_stride, - 0x7fffffff); -} - -void vp9_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sad_array) { - int i; - for (i = 0; i < 3; ++i) - sad_array[i] = vp9_sad16x16(src_ptr, src_stride, ref_ptr + i, ref_stride, - 0x7fffffff); -} - -void vp9_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, - const 
uint8_t *ref_ptr, int ref_stride, - uint32_t *sad_array) { - int i; - for (i = 0; i < 8; ++i) - sad_array[i] = vp9_sad16x16(src_ptr, src_stride, ref_ptr + i, ref_stride, - 0x7fffffff); -} - -void vp9_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sad_array) { - int i; - for (i = 0; i < 3; ++i) - sad_array[i] = vp9_sad16x8(src_ptr, src_stride, ref_ptr + i, ref_stride, - 0x7fffffff); -} - -void vp9_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - uint32_t *sad_array) { - int i; - for (i = 0; i < 8; ++i) - sad_array[i] = vp9_sad16x8(src_ptr, src_stride, ref_ptr + i, ref_stride, - 0x7fffffff); -} - -void vp9_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sad_array) { - int i; - for (i = 0; i < 3; ++i) - sad_array[i] = vp9_sad8x8(src_ptr, src_stride, ref_ptr + i, ref_stride, - 0x7fffffff); -} - -void vp9_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - uint32_t *sad_array) { - int i; - for (i = 0; i < 8; ++i) - sad_array[i] = vp9_sad8x8(src_ptr, src_stride, ref_ptr + i, ref_stride, - 0x7fffffff); -} - -void vp9_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sad_array) { - int i; - for (i = 0; i < 3; ++i) - sad_array[i] = vp9_sad8x16(src_ptr, src_stride, ref_ptr + i, ref_stride, - 0x7fffffff); -} - -void vp9_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - uint32_t *sad_array) { - int i; - for (i = 0; i < 8; ++i) - sad_array[i] = vp9_sad8x16(src_ptr, src_stride, ref_ptr + i, ref_stride, - 0x7fffffff); -} - -void vp9_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sad_array) { - int i; - for (i = 0; i < 3; ++i) - sad_array[i] = vp9_sad4x4(src_ptr, src_stride, ref_ptr + i, ref_stride, - 
0x7fffffff); -} - -void vp9_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - uint32_t *sad_array) { - int i; - for (i = 0; i < 8; ++i) - sad_array[i] = vp9_sad4x4(src_ptr, src_stride, ref_ptr + i, ref_stride, - 0x7fffffff); -} - -void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, - const uint8_t* const ref_ptr[], int ref_stride, - unsigned int *sad_array) { - int i; - for (i = 0; i < 4; ++i) - sad_array[i] = vp9_sad64x64(src_ptr, src_stride, ref_ptr[i], ref_stride, - 0x7fffffff); -} - -void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, - const uint8_t* const ref_ptr[], int ref_stride, - unsigned int *sad_array) { - int i; - for (i = 0; i < 4; ++i) - sad_array[i] = vp9_sad32x32(src_ptr, src_stride, ref_ptr[i], ref_stride, - 0x7fffffff); -} - -void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, - const uint8_t* const ref_ptr[], int ref_stride, - unsigned int *sad_array) { - int i; - for (i = 0; i < 4; ++i) - sad_array[i] = vp9_sad16x16(src_ptr, src_stride, ref_ptr[i], ref_stride, - 0x7fffffff); -} - -void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, - const uint8_t* const ref_ptr[], int ref_stride, - unsigned int *sad_array) { - int i; - for (i = 0; i < 4; ++i) - sad_array[i] = vp9_sad16x8(src_ptr, src_stride, ref_ptr[i], ref_stride, - 0x7fffffff); -} - -void vp9_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, - const uint8_t* const ref_ptr[], int ref_stride, - unsigned int *sad_array) { - int i; - for (i = 0; i < 4; ++i) - sad_array[i] = vp9_sad8x8(src_ptr, src_stride, ref_ptr[i], ref_stride, - 0x7fffffff); -} - -void vp9_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, - const uint8_t* const ref_ptr[], int ref_stride, - unsigned int *sad_array) { - int i; - for (i = 0; i < 4; ++i) - sad_array[i] = vp9_sad8x16(src_ptr, src_stride, ref_ptr[i], ref_stride, - 0x7fffffff); -} - -void vp9_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, - const uint8_t* const 
ref_ptr[], int ref_stride, - unsigned int *sad_array) { - int i; - for (i = 0; i < 4; ++i) - sad_array[i] = vp9_sad8x4(src_ptr, src_stride, ref_ptr[i], ref_stride, - 0x7fffffff); -} - -void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - uint32_t *sad_array) { - int i; - for (i = 0; i < 8; ++i) - sad_array[i] = vp9_sad8x4(src_ptr, src_stride, ref_ptr + i, ref_stride, - 0x7fffffff); -} - -void vp9_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, - const uint8_t* const ref_ptr[], int ref_stride, - unsigned int *sad_array) { - int i; - for (i = 0; i < 4; ++i) - sad_array[i] = vp9_sad4x8(src_ptr, src_stride, ref_ptr[i], ref_stride, - 0x7fffffff); -} - -void vp9_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - uint32_t *sad_array) { - int i; - for (i = 0; i < 8; ++i) - sad_array[i] = vp9_sad4x8(src_ptr, src_stride, ref_ptr + i, ref_stride, - 0x7fffffff); -} - -void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, - const uint8_t* const ref_ptr[], int ref_stride, - unsigned int *sad_array) { - int i; - for (i = 0; i < 4; ++i) - sad_array[i] = vp9_sad4x4(src_ptr, src_stride, ref_ptr[i], ref_stride, - 0x7fffffff); -} + vp9_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \ + return sad(src, src_stride, comp_pred, m, m, n); \ +} + +#define sadMxNxK(m, n, k) \ +void vp9_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + unsigned int *sads) { \ + int i; \ + for (i = 0; i < k; ++i) \ + sads[i] = vp9_sad##m##x##n##_c(src, src_stride, &ref[i], ref_stride); \ +} + +#define sadMxNx4D(m, n) \ +void vp9_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ + const uint8_t *const refs[], int ref_stride, \ + unsigned int *sads) { \ + int i; \ + for (i = 0; i < 4; ++i) \ + sads[i] = vp9_sad##m##x##n##_c(src, src_stride, refs[i], ref_stride); \ +} + +// 64x64 +sadMxN(64, 64) +sadMxNxK(64, 64, 3) +sadMxNxK(64, 64, 
8) +sadMxNx4D(64, 64) + +// 64x32 +sadMxN(64, 32) +sadMxNx4D(64, 32) + +// 32x64 +sadMxN(32, 64) +sadMxNx4D(32, 64) + +// 32x32 +sadMxN(32, 32) +sadMxNxK(32, 32, 3) +sadMxNxK(32, 32, 8) +sadMxNx4D(32, 32) + +// 32x16 +sadMxN(32, 16) +sadMxNx4D(32, 16) + +// 16x32 +sadMxN(16, 32) +sadMxNx4D(16, 32) + +// 16x16 +sadMxN(16, 16) +sadMxNxK(16, 16, 3) +sadMxNxK(16, 16, 8) +sadMxNx4D(16, 16) + +// 16x8 +sadMxN(16, 8) +sadMxNxK(16, 8, 3) +sadMxNxK(16, 8, 8) +sadMxNx4D(16, 8) + +// 8x16 +sadMxN(8, 16) +sadMxNxK(8, 16, 3) +sadMxNxK(8, 16, 8) +sadMxNx4D(8, 16) + +// 8x8 +sadMxN(8, 8) +sadMxNxK(8, 8, 3) +sadMxNxK(8, 8, 8) +sadMxNx4D(8, 8) + +// 8x4 +sadMxN(8, 4) +sadMxNxK(8, 4, 8) +sadMxNx4D(8, 4) + +// 4x8 +sadMxN(4, 8) +sadMxNxK(4, 8, 8) +sadMxNx4D(4, 8) + +// 4x4 +sadMxN(4, 4) +sadMxNxK(4, 4, 3) +sadMxNxK(4, 4, 8) +sadMxNx4D(4, 4) diff --git a/libvpx/vp9/encoder/vp9_segmentation.c b/libvpx/vp9/encoder/vp9_segmentation.c index 9d3e6dc12..d5676c3d1 100644 --- a/libvpx/vp9/encoder/vp9_segmentation.c +++ b/libvpx/vp9/encoder/vp9_segmentation.c @@ -27,18 +27,8 @@ void vp9_enable_segmentation(struct segmentation *seg) { void vp9_disable_segmentation(struct segmentation *seg) { seg->enabled = 0; -} - -void vp9_set_segmentation_map(VP9_COMP *cpi, unsigned char *segmentation_map) { - struct segmentation *const seg = &cpi->common.seg; - - // Copy in the new segmentation map - vpx_memcpy(cpi->segmentation_map, segmentation_map, - (cpi->common.mi_rows * cpi->common.mi_cols)); - - // Signal that the map should be updated. 
- seg->update_map = 1; - seg->update_data = 1; + seg->update_map = 0; + seg->update_data = 0; } void vp9_set_segment_data(struct segmentation *seg, @@ -120,20 +110,18 @@ static int cost_segmap(int *segcounts, vp9_prob *probs) { return cost; } -static void count_segs(VP9_COMP *cpi, const TileInfo *const tile, - MODE_INFO **mi_8x8, +static void count_segs(const VP9_COMMON *cm, MACROBLOCKD *xd, + const TileInfo *tile, MODE_INFO **mi, int *no_pred_segcounts, int (*temporal_predictor_count)[2], int *t_unpred_seg_counts, int bw, int bh, int mi_row, int mi_col) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &cpi->mb.e_mbd; int segment_id; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - xd->mi = mi_8x8; + xd->mi = mi; segment_id = xd->mi[0]->mbmi.segment_id; set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols); @@ -143,7 +131,7 @@ static void count_segs(VP9_COMP *cpi, const TileInfo *const tile, // Temporal prediction not allowed on key frames if (cm->frame_type != KEY_FRAME) { - const BLOCK_SIZE bsize = mi_8x8[0]->mbmi.sb_type; + const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; // Test to see if the segment id matches the predicted value. 
const int pred_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map, bsize, mi_row, mi_col); @@ -155,20 +143,19 @@ static void count_segs(VP9_COMP *cpi, const TileInfo *const tile, xd->mi[0]->mbmi.seg_id_predicted = pred_flag; temporal_predictor_count[pred_context][pred_flag]++; + // Update the "unpredicted" segment count if (!pred_flag) - // Update the "unpredicted" segment count t_unpred_seg_counts[segment_id]++; } } -static void count_segs_sb(VP9_COMP *cpi, const TileInfo *const tile, - MODE_INFO **mi_8x8, +static void count_segs_sb(const VP9_COMMON *cm, MACROBLOCKD *xd, + const TileInfo *tile, MODE_INFO **mi, int *no_pred_segcounts, int (*temporal_predictor_count)[2], int *t_unpred_seg_counts, int mi_row, int mi_col, BLOCK_SIZE bsize) { - const VP9_COMMON *const cm = &cpi->common; const int mis = cm->mi_stride; int bw, bh; const int bs = num_8x8_blocks_wide_lookup[bsize], hbs = bs / 2; @@ -176,22 +163,22 @@ static void count_segs_sb(VP9_COMP *cpi, const TileInfo *const tile, if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - bw = num_8x8_blocks_wide_lookup[mi_8x8[0]->mbmi.sb_type]; - bh = num_8x8_blocks_high_lookup[mi_8x8[0]->mbmi.sb_type]; + bw = num_8x8_blocks_wide_lookup[mi[0]->mbmi.sb_type]; + bh = num_8x8_blocks_high_lookup[mi[0]->mbmi.sb_type]; if (bw == bs && bh == bs) { - count_segs(cpi, tile, mi_8x8, no_pred_segcounts, temporal_predictor_count, + count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, bs, bs, mi_row, mi_col); } else if (bw == bs && bh < bs) { - count_segs(cpi, tile, mi_8x8, no_pred_segcounts, temporal_predictor_count, + count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, bs, hbs, mi_row, mi_col); - count_segs(cpi, tile, mi_8x8 + hbs * mis, no_pred_segcounts, + count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, bs, hbs, mi_row + hbs, mi_col); } else if (bw < bs && bh == bs) { - 
count_segs(cpi, tile, mi_8x8, no_pred_segcounts, temporal_predictor_count, + count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, hbs, bs, mi_row, mi_col); - count_segs(cpi, tile, mi_8x8 + hbs, + count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, hbs, bs, mi_row, mi_col + hbs); } else { @@ -204,7 +191,7 @@ static void count_segs_sb(VP9_COMP *cpi, const TileInfo *const tile, const int mi_dc = hbs * (n & 1); const int mi_dr = hbs * (n >> 1); - count_segs_sb(cpi, tile, &mi_8x8[mi_dr * mis + mi_dc], + count_segs_sb(cm, xd, tile, &mi[mi_dr * mis + mi_dc], no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, mi_row + mi_dr, mi_col + mi_dc, subsize); @@ -212,8 +199,7 @@ static void count_segs_sb(VP9_COMP *cpi, const TileInfo *const tile, } } -void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; +void vp9_choose_segmap_coding_method(VP9_COMMON *cm, MACROBLOCKD *xd) { struct segmentation *seg = &cm->seg; int no_pred_cost; @@ -229,9 +215,6 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { vp9_prob t_pred_tree[SEG_TREE_PROBS]; vp9_prob t_nopred_prob[PREDICTION_PROBS]; - const int mis = cm->mi_stride; - MODE_INFO **mi_ptr, **mi; - // Set default state for the segment tree probabilities and the // temporal coding probabilities vpx_memset(seg->tree_probs, 255, sizeof(seg->tree_probs)); @@ -241,15 +224,16 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { // predicts this one for (tile_col = 0; tile_col < 1 << cm->log2_tile_cols; tile_col++) { TileInfo tile; - + MODE_INFO **mi_ptr; vp9_tile_init(&tile, cm, 0, tile_col); + mi_ptr = cm->mi_grid_visible + tile.mi_col_start; for (mi_row = 0; mi_row < cm->mi_rows; - mi_row += 8, mi_ptr += 8 * mis) { - mi = mi_ptr; + mi_row += 8, mi_ptr += 8 * cm->mi_stride) { + MODE_INFO **mi = mi_ptr; for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end; mi_col += 8, mi += 8) - 
count_segs_sb(cpi, &tile, mi, no_pred_segcounts, + count_segs_sb(cm, xd, &tile, mi, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, mi_row, mi_col, BLOCK_64X64); } diff --git a/libvpx/vp9/encoder/vp9_segmentation.h b/libvpx/vp9/encoder/vp9_segmentation.h index 66c51a21b..8c6944ad1 100644 --- a/libvpx/vp9/encoder/vp9_segmentation.h +++ b/libvpx/vp9/encoder/vp9_segmentation.h @@ -13,7 +13,7 @@ #define VP9_ENCODER_VP9_SEGMENTATION_H_ #include "vp9/common/vp9_blockd.h" -#include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/encoder/vp9_encoder.h" #ifdef __cplusplus extern "C" { @@ -28,9 +28,6 @@ void vp9_disable_segfeature(struct segmentation *seg, void vp9_clear_segdata(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id); -// Valid values for a segment are 0 to 3 -// Segmentation map is arrange as [Rows][Columns] -void vp9_set_segmentation_map(VP9_COMP *cpi, unsigned char *segmentation_map); // The values given for each segment can be either deltas (from the default // value chosen for the frame) or absolute values. 
@@ -45,7 +42,7 @@ void vp9_set_segmentation_map(VP9_COMP *cpi, unsigned char *segmentation_map); void vp9_set_segment_data(struct segmentation *seg, signed char *feature_data, unsigned char abs_delta); -void vp9_choose_segmap_coding_method(VP9_COMP *cpi); +void vp9_choose_segmap_coding_method(VP9_COMMON *cm, MACROBLOCKD *xd); void vp9_reset_segment_features(struct segmentation *seg); diff --git a/libvpx/vp9/encoder/vp9_speed_features.c b/libvpx/vp9/encoder/vp9_speed_features.c index d6b6174fa..4fe3aac1f 100644 --- a/libvpx/vp9/encoder/vp9_speed_features.c +++ b/libvpx/vp9/encoder/vp9_speed_features.c @@ -10,35 +10,45 @@ #include <limits.h> -#include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_speed_features.h" -#define ALL_INTRA_MODES ((1 << DC_PRED) | \ - (1 << V_PRED) | (1 << H_PRED) | \ - (1 << D45_PRED) | (1 << D135_PRED) | \ - (1 << D117_PRED) | (1 << D153_PRED) | \ - (1 << D207_PRED) | (1 << D63_PRED) | \ - (1 << TM_PRED)) -#define INTRA_DC_ONLY (1 << DC_PRED) -#define INTRA_DC_TM ((1 << TM_PRED) | (1 << DC_PRED)) -#define INTRA_DC_H_V ((1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED)) -#define INTRA_DC_TM_H_V (INTRA_DC_TM | (1 << V_PRED) | (1 << H_PRED)) - -// Masks for partially or completely disabling split mode -#define DISABLE_ALL_INTER_SPLIT ((1 << THR_COMP_GA) | \ - (1 << THR_COMP_LA) | \ - (1 << THR_ALTR) | \ - (1 << THR_GOLD) | \ - (1 << THR_LAST)) - -#define DISABLE_ALL_SPLIT ((1 << THR_INTRA) | DISABLE_ALL_INTER_SPLIT) - -#define DISABLE_COMPOUND_SPLIT ((1 << THR_COMP_GA) | (1 << THR_COMP_LA)) - -#define LAST_AND_INTRA_SPLIT_ONLY ((1 << THR_COMP_GA) | \ - (1 << THR_COMP_LA) | \ - (1 << THR_ALTR) | \ - (1 << THR_GOLD)) +enum { + INTRA_ALL = (1 << DC_PRED) | + (1 << V_PRED) | (1 << H_PRED) | + (1 << D45_PRED) | (1 << D135_PRED) | + (1 << D117_PRED) | (1 << D153_PRED) | + (1 << D207_PRED) | (1 << D63_PRED) | + (1 << TM_PRED), + INTRA_DC = (1 << DC_PRED), + INTRA_DC_TM = (1 << DC_PRED) | (1 << TM_PRED), + 
INTRA_DC_H_V = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED), + INTRA_DC_TM_H_V = (1 << DC_PRED) | (1 << TM_PRED) | (1 << V_PRED) | + (1 << H_PRED) +}; + +enum { + INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV), + INTER_NEAREST = (1 << NEARESTMV), + INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV) +}; + +enum { + DISABLE_ALL_INTER_SPLIT = (1 << THR_COMP_GA) | + (1 << THR_COMP_LA) | + (1 << THR_ALTR) | + (1 << THR_GOLD) | + (1 << THR_LAST), + + DISABLE_ALL_SPLIT = (1 << THR_INTRA) | DISABLE_ALL_INTER_SPLIT, + + DISABLE_COMPOUND_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA), + + LAST_AND_INTRA_SPLIT_ONLY = (1 << THR_COMP_GA) | + (1 << THR_COMP_LA) | + (1 << THR_ALTR) | + (1 << THR_GOLD) +}; static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, SPEED_FEATURES *sf, int speed) { @@ -49,8 +59,8 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, if (speed >= 1) { sf->use_square_partition_only = !frame_is_intra_only(cm); sf->less_rectangular_check = 1; - sf->tx_size_search_method = vp9_frame_is_boosted(cpi) ? USE_FULL_RD - : USE_LARGESTALL; + sf->tx_size_search_method = frame_is_boosted(cpi) ? USE_FULL_RD + : USE_LARGESTALL; if (MIN(cm->width, cm->height) >= 720) sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT @@ -59,9 +69,9 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->disable_split_mask = DISABLE_COMPOUND_SPLIT; sf->use_rd_breakout = 1; sf->adaptive_motion_search = 1; - sf->auto_mv_step_size = 1; + sf->mv.auto_mv_step_size = 1; sf->adaptive_rd_thresh = 2; - sf->subpel_iters_per_step = 1; + sf->mv.subpel_iters_per_step = 1; sf->mode_skip_start = 10; sf->adaptive_pred_interp_filter = 1; @@ -73,16 +83,18 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, } if (speed >= 2) { - sf->tx_size_search_method = vp9_frame_is_boosted(cpi) ? 
USE_FULL_RD - : USE_LARGESTALL; - - if (MIN(cm->width, cm->height) >= 720) + if (MIN(cm->width, cm->height) >= 720) { + sf->lf_motion_threshold = LOW_MOTION_THRESHOLD; + sf->last_partitioning_redo_frequency = 3; sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; - else + sf->adaptive_pred_interp_filter = 0; + } else { sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY; + sf->last_partitioning_redo_frequency = 2; + sf->lf_motion_threshold = NO_MOTION_THRESHOLD; + } - sf->adaptive_pred_interp_filter = 2; sf->reference_masking = 1; sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | @@ -93,20 +105,26 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION; sf->adjust_partitioning_from_last_frame = 1; - sf->last_partitioning_redo_frequency = 3; } if (speed >= 3) { + sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD + : USE_LARGESTALL; if (MIN(cm->width, cm->height) >= 720) sf->disable_split_mask = DISABLE_ALL_SPLIT; else sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT; + sf->adaptive_pred_interp_filter = 0; + sf->cb_partition_search = frame_is_boosted(cpi) ? 0 : 1; + sf->cb_pred_filter_search = 1; + sf->motion_field_mode_search = frame_is_boosted(cpi) ? 
0 : 1; + + sf->lf_motion_threshold = LOW_MOTION_THRESHOLD; + sf->last_partitioning_redo_frequency = 3; sf->recode_loop = ALLOW_RECODE_KFMAXBW; sf->adaptive_rd_thresh = 3; sf->mode_skip_start = 6; - sf->use_fast_coef_updates = ONE_LOOP_REDUCED; - sf->use_fast_coef_costing = 1; } if (speed >= 4) { @@ -119,6 +137,8 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->disable_filter_search_var_thresh = 200; sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL; sf->use_lp32x32fdct = 1; + sf->use_fast_coef_updates = ONE_LOOP_REDUCED; + sf->use_fast_coef_costing = 1; } if (speed >= 5) { @@ -126,24 +146,29 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->partition_search_type = FIXED_PARTITION; sf->optimize_coefficients = 0; - sf->search_method = HEX; + sf->mv.search_method = HEX; sf->disable_filter_search_var_thresh = 500; for (i = 0; i < TX_SIZES; ++i) { - sf->intra_y_mode_mask[i] = INTRA_DC_ONLY; - sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY; + sf->intra_y_mode_mask[i] = INTRA_DC; + sf->intra_uv_mode_mask[i] = INTRA_DC; } cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; } + if (speed >= 6) { + sf->mv.reduce_first_step_size = 1; + } } -static void set_rt_speed_feature(VP9_COMMON *cm, SPEED_FEATURES *sf, - int speed) { +static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, + int speed, vp9e_tune_content content) { + VP9_COMMON *const cm = &cpi->common; + const int frames_since_key = + cm->frame_type == KEY_FRAME ? 0 : cpi->rc.frames_since_key; sf->static_segmentation = 0; sf->adaptive_rd_thresh = 1; - sf->encode_breakout_thresh = 1; sf->use_fast_coef_costing = 1; - if (speed == 1) { + if (speed >= 1) { sf->use_square_partition_only = !frame_is_intra_only(cm); sf->less_rectangular_check = 1; sf->tx_size_search_method = frame_is_intra_only(cm) ? 
USE_FULL_RD @@ -158,22 +183,17 @@ static void set_rt_speed_feature(VP9_COMMON *cm, SPEED_FEATURES *sf, sf->use_rd_breakout = 1; sf->adaptive_motion_search = 1; sf->adaptive_pred_interp_filter = 1; - sf->auto_mv_step_size = 1; + sf->mv.auto_mv_step_size = 1; sf->adaptive_rd_thresh = 2; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; - sf->encode_breakout_thresh = 8; } if (speed >= 2) { - sf->use_square_partition_only = !frame_is_intra_only(cm); - sf->less_rectangular_check = 1; - sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD - : USE_LARGESTALL; if (MIN(cm->width, cm->height) >= 720) - sf->disable_split_mask = cm->show_frame ? - DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; + sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT + : DISABLE_ALL_INTER_SPLIT; else sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY; @@ -181,28 +201,18 @@ static void set_rt_speed_feature(VP9_COMMON *cm, SPEED_FEATURES *sf, FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR; - sf->use_rd_breakout = 1; - sf->adaptive_motion_search = 1; sf->adaptive_pred_interp_filter = 2; - sf->auto_mv_step_size = 1; sf->reference_masking = 1; - sf->disable_filter_search_var_thresh = 50; sf->comp_inter_joint_search_thresh = BLOCK_SIZES; - sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION; + sf->lf_motion_threshold = LOW_MOTION_THRESHOLD; sf->adjust_partitioning_from_last_frame = 1; sf->last_partitioning_redo_frequency = 3; - - sf->adaptive_rd_thresh = 2; sf->use_lp32x32fdct = 1; sf->mode_skip_start = 11; - sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; - sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; - sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; - sf->encode_breakout_thresh = 200; } if (speed >= 3) { @@ -212,7 +222,7 @@ static 
void set_rt_speed_feature(VP9_COMMON *cm, SPEED_FEATURES *sf, sf->constrain_copy_partition = 1; sf->use_uv_intra_rd_estimate = 1; sf->skip_encode_sb = 1; - sf->subpel_iters_per_step = 1; + sf->mv.subpel_iters_per_step = 1; sf->use_fast_coef_updates = ONE_LOOP_REDUCED; sf->adaptive_rd_thresh = 4; sf->mode_skip_start = 6; @@ -220,7 +230,6 @@ static void set_rt_speed_feature(VP9_COMMON *cm, SPEED_FEATURES *sf, sf->optimize_coefficients = 0; sf->disable_split_mask = DISABLE_ALL_SPLIT; sf->lpf_pick = LPF_PICK_FROM_Q; - sf->encode_breakout_thresh = 700; } if (speed >= 4) { @@ -231,74 +240,100 @@ static void set_rt_speed_feature(VP9_COMMON *cm, SPEED_FEATURES *sf, sf->auto_min_max_partition_size = STRICT_NEIGHBORING_MIN_MAX; sf->adjust_partitioning_from_last_frame = cm->last_frame_type != cm->frame_type || (0 == - (cm->current_video_frame + 1) % sf->last_partitioning_redo_frequency); - sf->subpel_force_stop = 1; + (frames_since_key + 1) % sf->last_partitioning_redo_frequency); + sf->mv.subpel_force_stop = 1; for (i = 0; i < TX_SIZES; i++) { sf->intra_y_mode_mask[i] = INTRA_DC_H_V; - sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY; + sf->intra_uv_mode_mask[i] = INTRA_DC; } - sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_ONLY; + sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->frame_parameter_update = 0; - sf->encode_breakout_thresh = 1000; - sf->search_method = FAST_HEX; - sf->disable_inter_mode_mask[BLOCK_32X32] = 1 << INTER_OFFSET(ZEROMV); - sf->disable_inter_mode_mask[BLOCK_32X64] = ~(1 << INTER_OFFSET(NEARESTMV)); - sf->disable_inter_mode_mask[BLOCK_64X32] = ~(1 << INTER_OFFSET(NEARESTMV)); - sf->disable_inter_mode_mask[BLOCK_64X64] = ~(1 << INTER_OFFSET(NEARESTMV)); + sf->mv.search_method = FAST_HEX; + sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEAR_NEW; + sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST; + sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST; + sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST; sf->max_intra_bsize = BLOCK_32X32; sf->allow_skip_recode = 1; } 
if (speed >= 5) { + sf->use_quant_fp = cm->frame_type == KEY_FRAME ? 0 : 1; + sf->auto_min_max_partition_size = (cm->frame_type == KEY_FRAME) ? + RELAXED_NEIGHBORING_MIN_MAX : STRICT_NEIGHBORING_MIN_MAX; sf->max_partition_size = BLOCK_32X32; sf->min_partition_size = BLOCK_8X8; sf->partition_check = - (cm->current_video_frame % sf->last_partitioning_redo_frequency == 1); + (frames_since_key % sf->last_partitioning_redo_frequency == 1); sf->force_frame_boost = cm->frame_type == KEY_FRAME || - (cm->current_video_frame % + (frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1); sf->max_delta_qindex = (cm->frame_type == KEY_FRAME) ? 20 : 15; sf->partition_search_type = REFERENCE_PARTITION; sf->use_nonrd_pick_mode = 1; - sf->search_method = FAST_DIAMOND; sf->allow_skip_recode = 0; } if (speed >= 6) { + if (content == VP9E_CONTENT_SCREEN) { + int i; + // Allow fancy modes at all sizes since SOURCE_VAR_BASED_PARTITION is used + for (i = 0; i < BLOCK_SIZES; ++i) + sf->inter_mode_mask[i] = INTER_ALL; + } + // Adaptively switch between SOURCE_VAR_BASED_PARTITION and FIXED_PARTITION. sf->partition_search_type = SOURCE_VAR_BASED_PARTITION; sf->search_type_check_frequency = 50; - sf->source_var_thresh = 360; - sf->use_nonrd_pick_mode = 1; - sf->search_method = FAST_DIAMOND; - } + sf->tx_size_search_method = (cm->frame_type == KEY_FRAME) ? + USE_LARGESTALL : USE_TX_8X8; + + // This feature is only enabled when partition search is disabled. + sf->reuse_inter_pred_sby = 1; + + // Increase mode checking threshold for NEWMV. + sf->elevate_newmv_thresh = 2000; + sf->mv.reduce_first_step_size = 1; + } if (speed >= 7) { + sf->mv.search_method = FAST_DIAMOND; + sf->mv.fullpel_search_step_param = 10; + sf->lpf_pick = LPF_PICK_MINIMAL_LPF; + sf->encode_breakout_thresh = (MIN(cm->width, cm->height) >= 720) ? 
+ 800 : 300; + sf->elevate_newmv_thresh = 2500; + } + if (speed >= 12) { + sf->elevate_newmv_thresh = 4000; + sf->mv.subpel_force_stop = 2; + } + if (speed >= 13) { int i; + sf->max_intra_bsize = BLOCK_32X32; for (i = 0; i < BLOCK_SIZES; ++i) - sf->disable_inter_mode_mask[i] = ~(1 << INTER_OFFSET(NEARESTMV)); + sf->inter_mode_mask[i] = INTER_NEAREST; } } void vp9_set_speed_features(VP9_COMP *cpi) { SPEED_FEATURES *const sf = &cpi->sf; VP9_COMMON *const cm = &cpi->common; - const VP9_CONFIG *const oxcf = &cpi->oxcf; - const int speed = cpi->speed < 0 ? -cpi->speed : cpi->speed; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; int i; // best quality defaults sf->frame_parameter_update = 1; - sf->search_method = NSTEP; + sf->mv.search_method = NSTEP; sf->recode_loop = ALLOW_RECODE; - sf->subpel_search_method = SUBPEL_TREE; - sf->subpel_iters_per_step = 2; - sf->subpel_force_stop = 0; - sf->optimize_coefficients = !oxcf->lossless; - sf->reduce_first_step_size = 0; - sf->auto_mv_step_size = 0; - sf->max_step_search_steps = MAX_MVSEARCH_STEPS; + sf->mv.subpel_search_method = SUBPEL_TREE; + sf->mv.subpel_iters_per_step = 2; + sf->mv.subpel_force_stop = 0; + sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf); + sf->mv.reduce_first_step_size = 0; + sf->mv.auto_mv_step_size = 0; + sf->mv.fullpel_search_step_param = 6; sf->comp_inter_joint_search_thresh = BLOCK_4X4; sf->adaptive_rd_thresh = 0; sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_OFF; @@ -306,6 +341,10 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->use_lp32x32fdct = 0; sf->adaptive_motion_search = 0; sf->adaptive_pred_interp_filter = 0; + sf->cb_pred_filter_search = 0; + sf->cb_partition_search = 0; + sf->motion_field_mode_search = 0; + sf->use_quant_fp = 0; sf->reference_masking = 0; sf->partition_search_type = SEARCH_PARTITION; sf->less_rectangular_check = 0; @@ -323,8 +362,8 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->disable_split_var_thresh = 0; 
sf->disable_filter_search_var_thresh = 0; for (i = 0; i < TX_SIZES; i++) { - sf->intra_y_mode_mask[i] = ALL_INTRA_MODES; - sf->intra_uv_mode_mask[i] = ALL_INTRA_MODES; + sf->intra_y_mode_mask[i] = INTRA_ALL; + sf->intra_uv_mode_mask[i] = INTRA_ALL; } sf->use_rd_breakout = 0; sf->skip_encode_sb = 0; @@ -335,55 +374,51 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->use_fast_coef_costing = 0; sf->mode_skip_start = MAX_MODES; // Mode index at which mode skip mask set sf->use_nonrd_pick_mode = 0; - sf->encode_breakout_thresh = 0; for (i = 0; i < BLOCK_SIZES; ++i) - sf->disable_inter_mode_mask[i] = 0; + sf->inter_mode_mask[i] = INTER_ALL; sf->max_intra_bsize = BLOCK_64X64; + sf->reuse_inter_pred_sby = 0; // This setting only takes effect when partition_search_type is set // to FIXED_PARTITION. sf->always_this_block_size = BLOCK_16X16; sf->search_type_check_frequency = 50; - sf->source_var_thresh = 100; - + sf->encode_breakout_thresh = 0; + sf->elevate_newmv_thresh = 0; // Recode loop tolerence %. sf->recode_tolerance = 25; + sf->default_interp_filter = SWITCHABLE; switch (oxcf->mode) { - case MODE_BESTQUALITY: - case MODE_SECONDPASS_BEST: // This is the best quality mode. + case ONE_PASS_BEST: + case TWO_PASS_SECOND_BEST: // This is the best quality mode. cpi->diamond_search_sad = vp9_full_range_search; break; - case MODE_FIRSTPASS: - case MODE_GOODQUALITY: - case MODE_SECONDPASS: - set_good_speed_feature(cpi, cm, sf, speed); + case TWO_PASS_FIRST: + case ONE_PASS_GOOD: + case TWO_PASS_SECOND_GOOD: + set_good_speed_feature(cpi, cm, sf, oxcf->speed); break; - case MODE_REALTIME: - set_rt_speed_feature(cm, sf, speed); + case REALTIME: + set_rt_speed_feature(cpi, sf, oxcf->speed, oxcf->content); break; } // Slow quant, dct and trellis not worthwhile for first pass // so make sure they are always turned off. - if (cpi->pass == 1) + if (oxcf->pass == 1) sf->optimize_coefficients = 0; // No recode for 1 pass. 
- if (cpi->pass == 0) { + if (oxcf->pass == 0) { sf->recode_loop = DISALLOW_RECODE; sf->optimize_coefficients = 0; } - if (sf->subpel_search_method == SUBPEL_TREE) { + if (sf->mv.subpel_search_method == SUBPEL_TREE) { cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree; - cpi->find_fractional_mv_step_comp = vp9_find_best_sub_pixel_comp_tree; } - cpi->mb.optimize = sf->optimize_coefficients == 1 && cpi->pass != 1; - - if (cpi->encode_breakout && oxcf->mode == MODE_REALTIME && - sf->encode_breakout_thresh > cpi->encode_breakout) - cpi->encode_breakout = sf->encode_breakout_thresh; + cpi->mb.optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1; if (sf->disable_split_mask == DISABLE_ALL_SPLIT) sf->adaptive_pred_interp_filter = 0; @@ -391,4 +426,8 @@ void vp9_set_speed_features(VP9_COMP *cpi) { if (!cpi->oxcf.frame_periodic_boost) { sf->max_delta_qindex = 0; } + + if (cpi->encode_breakout && oxcf->mode == REALTIME && + sf->encode_breakout_thresh > cpi->encode_breakout) + cpi->encode_breakout = sf->encode_breakout_thresh; } diff --git a/libvpx/vp9/encoder/vp9_speed_features.h b/libvpx/vp9/encoder/vp9_speed_features.h index 72f548a04..243139d7b 100644 --- a/libvpx/vp9/encoder/vp9_speed_features.h +++ b/libvpx/vp9/encoder/vp9_speed_features.h @@ -44,6 +44,11 @@ typedef enum { } SUBPEL_SEARCH_METHODS; typedef enum { + NO_MOTION_THRESHOLD = 0, + LOW_MOTION_THRESHOLD = 7 +} MOTION_THRESHOLD; + +typedef enum { LAST_FRAME_PARTITION_OFF = 0, LAST_FRAME_PARTITION_LOW_MOTION = 1, LAST_FRAME_PARTITION_ALL = 2 @@ -51,9 +56,8 @@ typedef enum { typedef enum { USE_FULL_RD = 0, - USE_LARGESTINTRA, - USE_LARGESTINTRA_MODELINTER, - USE_LARGESTALL + USE_LARGESTALL, + USE_TX_8X8 } TX_SIZE_SEARCH_METHOD; typedef enum { @@ -69,6 +73,8 @@ typedef enum { LPF_PICK_FROM_SUBIMAGE, // Estimate the level based on quantizer and frame type LPF_PICK_FROM_Q, + // Pick 0 to disable LPF if LPF was enabled last frame + LPF_PICK_MINIMAL_LPF } LPF_PICK_METHOD; typedef enum { @@ -129,14 
+135,17 @@ typedef enum { ONE_LOOP_REDUCED = 2 } FAST_COEFF_UPDATE; -typedef struct { - // Frame level coding parameter update - int frame_parameter_update; - +typedef struct MV_SPEED_FEATURES { // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc). SEARCH_METHODS search_method; - RECODE_LOOP_TYPE recode_loop; + // This parameter controls which step in the n-step process we start at. + // It's changed adaptively based on circumstances. + int reduce_first_step_size; + + // If this is set to 1, we limit the motion search range to 2 times the + // largest motion vector found in the last frame. + int auto_mv_step_size; // Subpel_search_method can only be subpel_tree which does a subpixel // logarithmic search that keeps stepping at 1/2 pixel units until @@ -150,17 +159,17 @@ typedef struct { // Control when to stop subpel search int subpel_force_stop; - // This parameter controls the number of steps we'll do in a diamond - // search. - int max_step_search_steps; + // This variable sets the step_param used in full pel motion search. + int fullpel_search_step_param; +} MV_SPEED_FEATURES; - // This parameter controls which step in the n-step process we start at. - // It's changed adaptively based on circumstances. - int reduce_first_step_size; +typedef struct SPEED_FEATURES { + MV_SPEED_FEATURES mv; - // If this is set to 1, we limit the motion search range to 2 times the - // largest motion vector found in the last frame. - int auto_mv_step_size; + // Frame level coding parameter update + int frame_parameter_update; + + RECODE_LOOP_TYPE recode_loop; // Trellis (dynamic programming) optimization of quantized values (+1, 0). int optimize_coefficients; @@ -176,7 +185,7 @@ typedef struct { // a log search that iterates 4 times (check around mv for last for best // error of combined predictor then check around mv for alt). If 0 we // we just use the best motion vector found for each frame by itself. 
- int comp_inter_joint_search_thresh; + BLOCK_SIZE comp_inter_joint_search_thresh; // This variable is used to cap the maximum number of times we skip testing a // mode to be evaluated. A high value means we will be faster. @@ -200,6 +209,10 @@ typedef struct { // partitioning. LAST_FRAME_PARTITION_METHOD use_lastframe_partitioning; + // The threshold is to determine how slow the motino is, it is used when + // use_lastframe_partitioning is set to LAST_FRAME_PARTITION_LOW_MOTION + MOTION_THRESHOLD lf_motion_threshold; + // Determine which method we use to determine transform size. We can choose // between options like full rd, largest for prediction size, largest // for intra and model coefs for the rest. @@ -270,6 +283,16 @@ typedef struct { // was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected. int adaptive_pred_interp_filter; + // Chessboard pattern prediction filter type search + int cb_pred_filter_search; + + int cb_partition_search; + + int motion_field_mode_search; + + // Fast quantization process path + int use_quant_fp; + // Search through variable block partition types in non-RD mode decision // encoding process for RTC. int partition_check; @@ -318,13 +341,9 @@ typedef struct { // This flag controls the use of non-RD mode decision. int use_nonrd_pick_mode; - // This variable sets the encode_breakout threshold. Currently, it is only - // enabled in real time mode. - int encode_breakout_thresh; - // A binary mask indicating if NEARESTMV, NEARMV, ZEROMV, NEWMV - // modes are disabled in order from LSB to MSB for each BLOCK_SIZE. - int disable_inter_mode_mask[BLOCK_SIZES]; + // modes are used in order from LSB to MSB for each BLOCK_SIZE. + int inter_mode_mask[BLOCK_SIZES]; // This feature controls whether we do the expensive context update and // calculation in the rd coefficient costing loop. @@ -343,8 +362,20 @@ typedef struct { // FIXED_PARTITION search type should be used. 
int search_type_check_frequency; - // The threshold used in SOURCE_VAR_BASED_PARTITION search type. - int source_var_thresh; + // When partition is pre-set, the inter prediction result from pick_inter_mode + // can be reused in final block encoding process. It is enabled only for real- + // time mode speed 6. + int reuse_inter_pred_sby; + + // This variable sets the encode_breakout threshold. Currently, it is only + // enabled in real time mode. + int encode_breakout_thresh; + + // In real time encoding, increase the threshold for NEWMV. + int elevate_newmv_thresh; + + // default interp filter choice + INTERP_FILTER default_interp_filter; } SPEED_FEATURES; struct VP9_COMP; diff --git a/libvpx/vp9/encoder/vp9_subexp.c b/libvpx/vp9/encoder/vp9_subexp.c index 9796d6476..530b5923b 100644 --- a/libvpx/vp9/encoder/vp9_subexp.c +++ b/libvpx/vp9/encoder/vp9_subexp.c @@ -16,7 +16,24 @@ #define vp9_cost_upd256 ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd))) -static int update_bits[255]; +static const int update_bits[255] = { + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 
11, 11, 11, 11, 0, +}; static int recenter_nonneg(int v, int m) { if (v > (m << 1)) @@ -61,18 +78,6 @@ static int remap_prob(int v, int m) { return i; } -static int count_term_subexp(int word) { - if (word < 16) - return 5; - if (word < 32) - return 6; - if (word < 64) - return 8; - if (word < 129) - return 10; - return 11; -} - static int prob_diff_update_cost(vp9_prob newp, vp9_prob oldp) { int delp = remap_prob(newp, oldp); return update_bits[delp] * 256; @@ -111,12 +116,6 @@ void vp9_write_prob_diff_update(vp9_writer *w, vp9_prob newp, vp9_prob oldp) { encode_term_subexp(w, delp); } -void vp9_compute_update_table() { - int i; - for (i = 0; i < 254; i++) - update_bits[i] = count_term_subexp(i); -} - int vp9_prob_diff_update_savings_search(const unsigned int *ct, vp9_prob oldp, vp9_prob *bestp, vp9_prob upd) { diff --git a/libvpx/vp9/encoder/vp9_subexp.h b/libvpx/vp9/encoder/vp9_subexp.h index 8e9c0c62a..8e02a1d0d 100644 --- a/libvpx/vp9/encoder/vp9_subexp.h +++ b/libvpx/vp9/encoder/vp9_subexp.h @@ -16,9 +16,6 @@ extern "C" { #endif -void vp9_compute_update_table(); - - void vp9_write_prob_diff_update(vp9_writer *w, vp9_prob newp, vp9_prob oldp); diff --git a/libvpx/vp9/encoder/vp9_svc_layercontext.c b/libvpx/vp9/encoder/vp9_svc_layercontext.c index c2b6263f0..bf949c456 100644 --- a/libvpx/vp9/encoder/vp9_svc_layercontext.c +++ b/libvpx/vp9/encoder/vp9_svc_layercontext.c @@ -10,14 +10,16 @@ #include <math.h> -#include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_svc_layercontext.h" +#include "vp9/encoder/vp9_extend.h" void vp9_init_layer_context(VP9_COMP *const cpi) { SVC *const svc = &cpi->svc; - const VP9_CONFIG *const oxcf = &cpi->oxcf; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; int layer; int layer_end; + int alt_ref_idx = svc->number_spatial_layers; svc->spatial_layer_id = 0; svc->temporal_layer_id = 0; @@ -31,8 +33,9 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { for (layer = 0; layer < 
layer_end; ++layer) { LAYER_CONTEXT *const lc = &svc->layer_context[layer]; RATE_CONTROL *const lrc = &lc->rc; + int i; lc->current_video_frame_in_layer = 0; - lrc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q; + lc->layer_size = 0; lrc->ni_av_qi = oxcf->worst_allowed_q; lrc->total_actual_bits = 0; lrc->total_target_vs_actual = 0; @@ -42,30 +45,45 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { lrc->ni_frames = 0; lrc->decimation_count = 0; lrc->decimation_factor = 0; - lrc->rate_correction_factor = 1.0; - lrc->key_frame_rate_correction_factor = 1.0; + + for (i = 0; i < RATE_FACTOR_LEVELS; ++i) { + lrc->rate_correction_factors[i] = 1.0; + } if (svc->number_temporal_layers > 1) { - lc->target_bandwidth = oxcf->ts_target_bitrate[layer] * 1000; + lc->target_bandwidth = oxcf->ts_target_bitrate[layer]; lrc->last_q[INTER_FRAME] = oxcf->worst_allowed_q; + lrc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q; } else { - lc->target_bandwidth = oxcf->ss_target_bitrate[layer] * 1000; - lrc->last_q[0] = oxcf->best_allowed_q; - lrc->last_q[1] = oxcf->best_allowed_q; - lrc->last_q[2] = oxcf->best_allowed_q; + lc->target_bandwidth = oxcf->ss_target_bitrate[layer]; + lrc->last_q[KEY_FRAME] = oxcf->best_allowed_q; + lrc->last_q[INTER_FRAME] = oxcf->best_allowed_q; + lrc->avg_frame_qindex[KEY_FRAME] = (oxcf->worst_allowed_q + + oxcf->best_allowed_q) / 2; + lrc->avg_frame_qindex[INTER_FRAME] = (oxcf->worst_allowed_q + + oxcf->best_allowed_q) / 2; + if (oxcf->ss_play_alternate[layer]) + lc->alt_ref_idx = alt_ref_idx++; + else + lc->alt_ref_idx = -1; + lc->gold_ref_idx = -1; } - lrc->buffer_level = vp9_rescale((int)(oxcf->starting_buffer_level), + lrc->buffer_level = vp9_rescale((int)(oxcf->starting_buffer_level_ms), lc->target_bandwidth, 1000); lrc->bits_off_target = lrc->buffer_level; } + + // Still have extra buffer for base layer golden frame + if (svc->number_spatial_layers > 1 && alt_ref_idx < REF_FRAMES) + svc->layer_context[0].gold_ref_idx = alt_ref_idx; } 
// Update the layer context from a change_config() call. void vp9_update_layer_context_change_config(VP9_COMP *const cpi, const int target_bandwidth) { SVC *const svc = &cpi->svc; - const VP9_CONFIG *const oxcf = &cpi->oxcf; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; const RATE_CONTROL *const rc = &cpi->rc; int layer; int layer_end; @@ -82,27 +100,27 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, RATE_CONTROL *const lrc = &lc->rc; if (svc->number_temporal_layers > 1) { - lc->target_bandwidth = oxcf->ts_target_bitrate[layer] * 1000; + lc->target_bandwidth = oxcf->ts_target_bitrate[layer]; } else { - lc->target_bandwidth = oxcf->ss_target_bitrate[layer] * 1000; + lc->target_bandwidth = oxcf->ss_target_bitrate[layer]; } bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; // Update buffer-related quantities. - lc->starting_buffer_level = - (int64_t)(oxcf->starting_buffer_level * bitrate_alloc); - lc->optimal_buffer_level = - (int64_t)(oxcf->optimal_buffer_level * bitrate_alloc); - lc->maximum_buffer_size = - (int64_t)(oxcf->maximum_buffer_size * bitrate_alloc); - lrc->bits_off_target = MIN(lrc->bits_off_target, lc->maximum_buffer_size); - lrc->buffer_level = MIN(lrc->buffer_level, lc->maximum_buffer_size); + lrc->starting_buffer_level = + (int64_t)(rc->starting_buffer_level * bitrate_alloc); + lrc->optimal_buffer_level = + (int64_t)(rc->optimal_buffer_level * bitrate_alloc); + lrc->maximum_buffer_size = + (int64_t)(rc->maximum_buffer_size * bitrate_alloc); + lrc->bits_off_target = MIN(lrc->bits_off_target, lrc->maximum_buffer_size); + lrc->buffer_level = MIN(lrc->buffer_level, lrc->maximum_buffer_size); // Update framerate-related quantities. 
if (svc->number_temporal_layers > 1) { lc->framerate = oxcf->framerate / oxcf->ts_rate_decimator[layer]; } else { lc->framerate = oxcf->framerate; } - lrc->av_per_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); lrc->max_frame_bandwidth = rc->max_frame_bandwidth; // Update qp-related quantities. lrc->worst_quality = rc->worst_quality; @@ -118,22 +136,21 @@ static LAYER_CONTEXT *get_layer_context(SVC *svc) { void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) { SVC *const svc = &cpi->svc; - const VP9_CONFIG *const oxcf = &cpi->oxcf; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; LAYER_CONTEXT *const lc = get_layer_context(svc); RATE_CONTROL *const lrc = &lc->rc; const int layer = svc->temporal_layer_id; lc->framerate = oxcf->framerate / oxcf->ts_rate_decimator[layer]; - lrc->av_per_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth; // Update the average layer frame size (non-cumulative per-frame-bw). 
if (layer == 0) { - lc->avg_frame_size = lrc->av_per_frame_bandwidth; + lc->avg_frame_size = lrc->avg_frame_bandwidth; } else { const double prev_layer_framerate = oxcf->framerate / oxcf->ts_rate_decimator[layer - 1]; - const int prev_layer_target_bandwidth = - oxcf->ts_target_bitrate[layer - 1] * 1000; + const int prev_layer_target_bandwidth = oxcf->ts_target_bitrate[layer - 1]; lc->avg_frame_size = (int)((lc->target_bandwidth - prev_layer_target_bandwidth) / (lc->framerate - prev_layer_framerate)); @@ -141,30 +158,17 @@ void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) { } void vp9_update_spatial_layer_framerate(VP9_COMP *const cpi, double framerate) { - const VP9_CONFIG *const oxcf = &cpi->oxcf; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; LAYER_CONTEXT *const lc = get_layer_context(&cpi->svc); RATE_CONTROL *const lrc = &lc->rc; lc->framerate = framerate; - lrc->av_per_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); - lrc->min_frame_bandwidth = (int)(lrc->av_per_frame_bandwidth * + lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->min_frame_bandwidth = (int)(lrc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100); - lrc->max_frame_bandwidth = (int)(((int64_t)lrc->av_per_frame_bandwidth * + lrc->max_frame_bandwidth = (int)(((int64_t)lrc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section) / 100); - lrc->max_gf_interval = 16; - - lrc->static_scene_max_gf_interval = cpi->key_frame_frequency >> 1; - - if (oxcf->play_alternate && oxcf->lag_in_frames) { - if (lrc->max_gf_interval > oxcf->lag_in_frames - 1) - lrc->max_gf_interval = oxcf->lag_in_frames - 1; - - if (lrc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1) - lrc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1; - } - - if (lrc->max_gf_interval > lrc->static_scene_max_gf_interval) - lrc->max_gf_interval = lrc->static_scene_max_gf_interval; + vp9_rc_set_gf_max_interval(cpi, lrc); } void vp9_restore_layer_context(VP9_COMP 
*const cpi) { @@ -175,10 +179,7 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) { cpi->rc = lc->rc; cpi->twopass = lc->twopass; cpi->oxcf.target_bandwidth = lc->target_bandwidth; - cpi->oxcf.starting_buffer_level = lc->starting_buffer_level; - cpi->oxcf.optimal_buffer_level = lc->optimal_buffer_level; - cpi->oxcf.maximum_buffer_size = lc->maximum_buffer_size; - cpi->output_framerate = lc->framerate; + cpi->alt_ref_source = lc->alt_ref_source; // Reset the frames_since_key and frames_to_key counters to their values // before the layer restore. Keep these defined for the stream (not layer). if (cpi->svc.number_temporal_layers > 1) { @@ -188,16 +189,13 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) { } void vp9_save_layer_context(VP9_COMP *const cpi) { - const VP9_CONFIG *const oxcf = &cpi->oxcf; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; LAYER_CONTEXT *const lc = get_layer_context(&cpi->svc); lc->rc = cpi->rc; lc->twopass = cpi->twopass; lc->target_bandwidth = (int)oxcf->target_bandwidth; - lc->starting_buffer_level = oxcf->starting_buffer_level; - lc->optimal_buffer_level = oxcf->optimal_buffer_level; - lc->maximum_buffer_size = oxcf->maximum_buffer_size; - lc->framerate = cpi->output_framerate; + lc->alt_ref_source = cpi->alt_ref_source; } void vp9_init_second_pass_spatial_svc(VP9_COMP *cpi) { @@ -205,7 +203,7 @@ void vp9_init_second_pass_spatial_svc(VP9_COMP *cpi) { int i; for (i = 0; i < svc->number_spatial_layers; ++i) { - struct twopass_rc *const twopass = &svc->layer_context[i].twopass; + TWO_PASS *const twopass = &svc->layer_context[i].twopass; svc->spatial_layer_id = i; vp9_init_second_pass(cpi); @@ -222,3 +220,145 @@ void vp9_inc_frame_in_layer(SVC *svc) { : &svc->layer_context[svc->spatial_layer_id]; ++lc->current_video_frame_in_layer; } + +int vp9_is_upper_layer_key_frame(const VP9_COMP *const cpi) { + return is_spatial_svc(cpi) && + cpi->svc.spatial_layer_id > 0 && + cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame; 
+} + +#if CONFIG_SPATIAL_SVC +int vp9_svc_lookahead_push(const VP9_COMP *const cpi, struct lookahead_ctx *ctx, + YV12_BUFFER_CONFIG *src, int64_t ts_start, + int64_t ts_end, unsigned int flags) { + struct lookahead_entry *buf; + int i, index; + + if (vp9_lookahead_push(ctx, src, ts_start, ts_end, flags)) + return 1; + + index = ctx->write_idx - 1; + if (index < 0) + index += ctx->max_sz; + + buf = ctx->buf + index; + + if (buf == NULL) + return 1; + + // Store svc parameters for each layer + for (i = 0; i < cpi->svc.number_spatial_layers; ++i) + buf->svc_params[i] = cpi->svc.layer_context[i].svc_params_received; + + return 0; +} + +static int copy_svc_params(VP9_COMP *const cpi, struct lookahead_entry *buf) { + int layer_id; + vpx_svc_parameters_t *layer_param; + LAYER_CONTEXT *lc; + + // Find the next layer to be encoded + for (layer_id = 0; layer_id < cpi->svc.number_spatial_layers; ++layer_id) { + if (buf->svc_params[layer_id].spatial_layer >=0) + break; + } + + if (layer_id == cpi->svc.number_spatial_layers) + return 1; + + layer_param = &buf->svc_params[layer_id]; + cpi->svc.spatial_layer_id = layer_param->spatial_layer; + cpi->svc.temporal_layer_id = layer_param->temporal_layer; + cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG; + + lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id]; + + cpi->lst_fb_idx = cpi->svc.spatial_layer_id; + + if (cpi->svc.spatial_layer_id < 1) + cpi->gld_fb_idx = lc->gold_ref_idx >= 0 ? 
+ lc->gold_ref_idx : cpi->lst_fb_idx; + else + cpi->gld_fb_idx = cpi->svc.spatial_layer_id - 1; + + if (lc->current_video_frame_in_layer == 0) { + if (cpi->svc.spatial_layer_id >= 2) { + cpi->alt_fb_idx = cpi->svc.spatial_layer_id - 2; + } else { + cpi->alt_fb_idx = cpi->lst_fb_idx; + cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_ALT_FLAG); + } + } else { + if (cpi->oxcf.ss_play_alternate[cpi->svc.spatial_layer_id]) { + cpi->alt_fb_idx = lc->alt_ref_idx; + if (!lc->has_alt_frame) + cpi->ref_frame_flags &= (~VP9_ALT_FLAG); + } else { + // Find a proper alt_fb_idx for layers that don't have alt ref frame + if (cpi->svc.spatial_layer_id == 0) { + cpi->alt_fb_idx = cpi->lst_fb_idx; + } else { + LAYER_CONTEXT *lc_lower = + &cpi->svc.layer_context[cpi->svc.spatial_layer_id - 1]; + + if (cpi->oxcf.ss_play_alternate[cpi->svc.spatial_layer_id - 1] && + lc_lower->alt_ref_source != NULL) + cpi->alt_fb_idx = lc_lower->alt_ref_idx; + else if (cpi->svc.spatial_layer_id >= 2) + cpi->alt_fb_idx = cpi->svc.spatial_layer_id - 2; + else + cpi->alt_fb_idx = cpi->lst_fb_idx; + } + } + } + + if (vp9_set_size_literal(cpi, layer_param->width, layer_param->height) != 0) + return VPX_CODEC_INVALID_PARAM; + + cpi->oxcf.worst_allowed_q = + vp9_quantizer_to_qindex(layer_param->max_quantizer); + cpi->oxcf.best_allowed_q = + vp9_quantizer_to_qindex(layer_param->min_quantizer); + + vp9_change_config(cpi, &cpi->oxcf); + + vp9_set_high_precision_mv(cpi, 1); + + cpi->alt_ref_source = get_layer_context(&cpi->svc)->alt_ref_source; + + return 0; +} + +struct lookahead_entry *vp9_svc_lookahead_peek(VP9_COMP *const cpi, + struct lookahead_ctx *ctx, + int index, int copy_params) { + struct lookahead_entry *buf = vp9_lookahead_peek(ctx, index); + + if (buf != NULL && copy_params != 0) { + if (copy_svc_params(cpi, buf) != 0) + return NULL; + } + return buf; +} + +struct lookahead_entry *vp9_svc_lookahead_pop(VP9_COMP *const cpi, + struct lookahead_ctx *ctx, + int drain) { + struct lookahead_entry *buf = 
NULL; + + if (ctx->sz && (drain || ctx->sz == ctx->max_sz - MAX_PRE_FRAMES)) { + buf = vp9_svc_lookahead_peek(cpi, ctx, 0, 1); + if (buf != NULL) { + // Only remove the buffer when pop the highest layer. Simply set the + // spatial_layer to -1 for lower layers. + buf->svc_params[cpi->svc.spatial_layer_id].spatial_layer = -1; + if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) { + vp9_lookahead_pop(ctx, drain); + } + } + } + + return buf; +} +#endif diff --git a/libvpx/vp9/encoder/vp9_svc_layercontext.h b/libvpx/vp9/encoder/vp9_svc_layercontext.h index 2abed3055..801449b6f 100644 --- a/libvpx/vp9/encoder/vp9_svc_layercontext.h +++ b/libvpx/vp9/encoder/vp9_svc_layercontext.h @@ -22,14 +22,18 @@ extern "C" { typedef struct { RATE_CONTROL rc; int target_bandwidth; - int64_t starting_buffer_level; - int64_t optimal_buffer_level; - int64_t maximum_buffer_size; double framerate; int avg_frame_size; - struct twopass_rc twopass; + TWO_PASS twopass; struct vpx_fixed_buf rc_twopass_stats_in; unsigned int current_video_frame_in_layer; + int is_key_frame; + vpx_svc_parameters_t svc_params_received; + struct lookahead_entry *alt_ref_source; + int alt_ref_idx; + int gold_ref_idx; + int has_alt_frame; + size_t layer_size; } LAYER_CONTEXT; typedef struct { @@ -37,6 +41,11 @@ typedef struct { int temporal_layer_id; int number_spatial_layers; int number_temporal_layers; + + // Store scaled source frames to be used for temporal filter to generate + // a alt ref frame. + YV12_BUFFER_CONFIG scaled_frames[MAX_LAG_BUFFERS]; + // Layer context used for rate control in one pass temporal CBR mode or // two pass spatial mode. Defined for temporal or spatial layers for now. // Does not support temporal combined with spatial RC. 
@@ -73,6 +82,26 @@ void vp9_init_second_pass_spatial_svc(struct VP9_COMP *cpi); // Increment number of video frames in layer void vp9_inc_frame_in_layer(SVC *svc); +// Check if current layer is key frame in spatial upper layer +int vp9_is_upper_layer_key_frame(const struct VP9_COMP *const cpi); + +// Copy the source image, flags and svc parameters into a new framebuffer +// with the expected stride/border +int vp9_svc_lookahead_push(const struct VP9_COMP *const cpi, + struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, + int64_t ts_start, int64_t ts_end, + unsigned int flags); + +// Get the next source buffer to encode +struct lookahead_entry *vp9_svc_lookahead_pop(struct VP9_COMP *const cpi, + struct lookahead_ctx *ctx, + int drain); + +// Get a future source buffer to encode +struct lookahead_entry *vp9_svc_lookahead_peek(struct VP9_COMP *const cpi, + struct lookahead_ctx *ctx, + int index, int copy_params); + #ifdef __cplusplus } // extern "C" #endif diff --git a/libvpx/vp9/encoder/vp9_temporal_filter.c b/libvpx/vp9/encoder/vp9_temporal_filter.c index 041027354..ce3b31138 100644 --- a/libvpx/vp9/encoder/vp9_temporal_filter.c +++ b/libvpx/vp9/encoder/vp9_temporal_filter.c @@ -19,7 +19,7 @@ #include "vp9/encoder/vp9_extend.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/encoder/vp9_mcomp.h" -#include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_quantize.h" #include "vp9/encoder/vp9_ratectrl.h" #include "vp9/encoder/vp9_segmentation.h" @@ -27,14 +27,15 @@ #include "vpx_ports/vpx_timer.h" #include "vpx_scale/vpx_scale.h" -#define ALT_REF_MC_ENABLED 1 // dis/enable MC in AltRef filtering +static int fixed_divide[512]; static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, uint8_t *y_mb_ptr, uint8_t *u_mb_ptr, uint8_t *v_mb_ptr, int stride, - int uv_block_size, + int uv_block_width, + int uv_block_height, int mv_row, int mv_col, uint8_t *pred, @@ -47,7 +48,7 @@ static void 
temporal_filter_predictors_mb_c(MACROBLOCKD *xd, enum mv_precision mv_precision_uv; int uv_stride; - if (uv_block_size == 8) { + if (uv_block_width == 8) { uv_stride = (stride + 1) >> 1; mv_precision_uv = MV_PRECISION_Q4; } else { @@ -64,26 +65,35 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, kernel, MV_PRECISION_Q3, x, y); vp9_build_inter_predictor(u_mb_ptr, uv_stride, - &pred[256], uv_block_size, + &pred[256], uv_block_width, &mv, scale, - uv_block_size, uv_block_size, + uv_block_width, uv_block_height, which_mv, kernel, mv_precision_uv, x, y); vp9_build_inter_predictor(v_mb_ptr, uv_stride, - &pred[512], uv_block_size, + &pred[512], uv_block_width, &mv, scale, - uv_block_size, uv_block_size, + uv_block_width, uv_block_height, which_mv, kernel, mv_precision_uv, x, y); } +void vp9_temporal_filter_init() { + int i; + + fixed_divide[0] = 0; + for (i = 1; i < 512; ++i) + fixed_divide[i] = 0x80000 / i; +} + void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, - unsigned int block_size, + unsigned int block_width, + unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, @@ -91,9 +101,10 @@ void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int i, j, k; int modifier; int byte = 0; + const int rounding = strength > 0 ? 1 << (strength - 1) : 0; - for (i = 0, k = 0; i < block_size; i++) { - for (j = 0; j < block_size; j++, k++) { + for (i = 0, k = 0; i < block_height; i++) { + for (j = 0; j < block_width; j++, k++) { int src_byte = frame1[byte]; int pixel_value = *frame2++; @@ -103,7 +114,7 @@ void vp9_temporal_filter_apply_c(uint8_t *frame1, // modifier = (int)roundf(coeff > 16 ? 
0 : 16-coeff); modifier *= modifier; modifier *= 3; - modifier += 1 << (strength - 1); + modifier += rounding; modifier >>= strength; if (modifier > 16) @@ -118,21 +129,22 @@ void vp9_temporal_filter_apply_c(uint8_t *frame1, byte++; } - byte += stride - block_size; + byte += stride - block_width; } } -#if ALT_REF_MC_ENABLED - static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, uint8_t *arf_frame_buf, uint8_t *frame_ptr_buf, int stride) { - MACROBLOCK *x = &cpi->mb; - MACROBLOCKD* const xd = &x->e_mbd; + MACROBLOCK *const x = &cpi->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; int step_param; int sadpb = x->sadperbit16; int bestsme = INT_MAX; + int distortion; + unsigned int sse; MV best_ref_mv1 = {0, 0}; MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ @@ -151,33 +163,22 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, xd->plane[0].pre[0].buf = frame_ptr_buf; xd->plane[0].pre[0].stride = stride; - // Further step/diamond searches as necessary - if (cpi->speed < 8) - step_param = cpi->sf.reduce_first_step_size + ((cpi->speed > 5) ? 1 : 0); - else - step_param = cpi->sf.reduce_first_step_size + 2; - step_param = MIN(step_param, (cpi->sf.max_step_search_steps - 2)); + step_param = mv_sf->reduce_first_step_size; + step_param = MIN(step_param, MAX_MVSEARCH_STEPS - 2); - /*cpi->sf.search_method == HEX*/ // Ignore mv costing by sending NULL pointer instead of cost arrays vp9_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1, &cpi->fn_ptr[BLOCK_16X16], 0, &best_ref_mv1, ref_mv); - // Try sub-pixel MC? 
- // if (bestsme > error_thresh && bestsme < INT_MAX) - { - int distortion; - unsigned int sse; - // Ignore mv costing by sending NULL pointer instead of cost array - bestsme = cpi->find_fractional_mv_step(x, ref_mv, - &best_ref_mv1, - cpi->common.allow_high_precision_mv, - x->errorperbit, - &cpi->fn_ptr[BLOCK_16X16], - 0, cpi->sf.subpel_iters_per_step, - NULL, NULL, - &distortion, &sse); - } + // Ignore mv costing by sending NULL pointer instead of cost array + bestsme = cpi->find_fractional_mv_step(x, ref_mv, + &best_ref_mv1, + cpi->common.allow_high_precision_mv, + x->errorperbit, + &cpi->fn_ptr[BLOCK_16X16], + 0, mv_sf->subpel_iters_per_step, + NULL, NULL, + &distortion, &sse, NULL, 0, 0); // Restore input state x->plane[0].src = src; @@ -185,7 +186,6 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, return bestsme; } -#endif static void temporal_filter_iterate_c(VP9_COMP *cpi, int frame_count, @@ -207,20 +207,17 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, uint8_t *dst1, *dst2; DECLARE_ALIGNED_ARRAY(16, uint8_t, predictor, 16 * 16 * 3); const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y; + const int mb_uv_width = 16 >> mbd->plane[1].subsampling_x; // Save input state uint8_t* input_buffer[MAX_MB_PLANE]; int i; - // TODO(aconverse): Add 4:2:2 support - assert(mbd->plane[1].subsampling_x == mbd->plane[1].subsampling_y); - for (i = 0; i < MAX_MB_PLANE; i++) input_buffer[i] = mbd->plane[i].pre[0].buf; for (mb_row = 0; mb_row < mb_rows; mb_row++) { -#if ALT_REF_MC_ENABLED - // Source frames are extended to 16 pixels. This is different than + // Source frames are extended to 16 pixels. This is different than // L/A/G reference frames that have a border of 32 (VP9ENCBORDERINPIXELS) // A 6/8 tap filter is used for motion search. This requires 2 pixels // before and 3 pixels after. 
So the largest Y mv on a border would @@ -234,7 +231,6 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, cpi->mb.mv_row_min = -((mb_row * 16) + (17 - 2 * VP9_INTERP_EXTEND)); cpi->mb.mv_row_max = ((cpi->common.mb_rows - 1 - mb_row) * 16) + (17 - 2 * VP9_INTERP_EXTEND); -#endif for (mb_col = 0; mb_col < mb_cols; mb_col++) { int i, j, k; @@ -243,13 +239,14 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, vpx_memset(accumulator, 0, 16 * 16 * 3 * sizeof(accumulator[0])); vpx_memset(count, 0, 16 * 16 * 3 * sizeof(count[0])); -#if ALT_REF_MC_ENABLED cpi->mb.mv_col_min = -((mb_col * 16) + (17 - 2 * VP9_INTERP_EXTEND)); cpi->mb.mv_col_max = ((cpi->common.mb_cols - 1 - mb_col) * 16) + (17 - 2 * VP9_INTERP_EXTEND); -#endif for (frame = 0; frame < frame_count; frame++) { + const int thresh_low = 10000; + const int thresh_high = 20000; + if (cpi->frames[frame] == NULL) continue; @@ -259,51 +256,45 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, if (frame == alt_ref_index) { filter_weight = 2; } else { - int err = 0; -#if ALT_REF_MC_ENABLED -#define THRESH_LOW 10000 -#define THRESH_HIGH 20000 - // Find best match in this frame by MC - err = temporal_filter_find_matching_mb_c - (cpi, - cpi->frames[alt_ref_index]->y_buffer + mb_y_offset, - cpi->frames[frame]->y_buffer + mb_y_offset, - cpi->frames[frame]->y_stride); -#endif + int err = temporal_filter_find_matching_mb_c(cpi, + cpi->frames[alt_ref_index]->y_buffer + mb_y_offset, + cpi->frames[frame]->y_buffer + mb_y_offset, + cpi->frames[frame]->y_stride); + // Assign higher weight to matching MB if it's error // score is lower. If not applying MC default behavior // is to weight all MBs equal. - filter_weight = err < THRESH_LOW - ? 2 : err < THRESH_HIGH ? 1 : 0; + filter_weight = err < thresh_low + ? 2 : err < thresh_high ? 
1 : 0; } if (filter_weight != 0) { // Construct the predictors - temporal_filter_predictors_mb_c - (mbd, - cpi->frames[frame]->y_buffer + mb_y_offset, - cpi->frames[frame]->u_buffer + mb_uv_offset, - cpi->frames[frame]->v_buffer + mb_uv_offset, - cpi->frames[frame]->y_stride, - mb_uv_height, - mbd->mi[0]->bmi[0].as_mv[0].as_mv.row, - mbd->mi[0]->bmi[0].as_mv[0].as_mv.col, - predictor, scale, - mb_col * 16, mb_row * 16); + temporal_filter_predictors_mb_c(mbd, + cpi->frames[frame]->y_buffer + mb_y_offset, + cpi->frames[frame]->u_buffer + mb_uv_offset, + cpi->frames[frame]->v_buffer + mb_uv_offset, + cpi->frames[frame]->y_stride, + mb_uv_width, mb_uv_height, + mbd->mi[0]->bmi[0].as_mv[0].as_mv.row, + mbd->mi[0]->bmi[0].as_mv[0].as_mv.col, + predictor, scale, + mb_col * 16, mb_row * 16); // Apply the filter (YUV) vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride, - predictor, 16, strength, filter_weight, + predictor, 16, 16, + strength, filter_weight, accumulator, count); - vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride, - predictor + 256, mb_uv_height, strength, + predictor + 256, + mb_uv_width, mb_uv_height, strength, filter_weight, accumulator + 256, count + 256); - vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride, - predictor + 512, mb_uv_height, strength, + predictor + 512, + mb_uv_width, mb_uv_height, strength, filter_weight, accumulator + 512, count + 512); } @@ -316,7 +307,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, for (i = 0, k = 0; i < 16; i++) { for (j = 0; j < 16; j++, k++) { unsigned int pval = accumulator[k] + (count[k] >> 1); - pval *= cpi->fixed_divide[count[k]]; + pval *= fixed_divide[count[k]]; pval >>= 19; dst1[byte] = (uint8_t)pval; @@ -324,7 +315,6 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, // move to next pixel byte++; } - byte += stride - 16; } @@ -333,34 +323,31 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, stride = cpi->alt_ref_buffer.uv_stride; byte = 
mb_uv_offset; for (i = 0, k = 256; i < mb_uv_height; i++) { - for (j = 0; j < mb_uv_height; j++, k++) { + for (j = 0; j < mb_uv_width; j++, k++) { int m = k + 256; // U unsigned int pval = accumulator[k] + (count[k] >> 1); - pval *= cpi->fixed_divide[count[k]]; + pval *= fixed_divide[count[k]]; pval >>= 19; dst1[byte] = (uint8_t)pval; // V pval = accumulator[m] + (count[m] >> 1); - pval *= cpi->fixed_divide[count[m]]; + pval *= fixed_divide[count[m]]; pval >>= 19; dst2[byte] = (uint8_t)pval; // move to next pixel byte++; } - - byte += stride - mb_uv_height; + byte += stride - mb_uv_width; } - mb_y_offset += 16; - mb_uv_offset += mb_uv_height; + mb_uv_offset += mb_uv_width; } - mb_y_offset += 16 * (f->y_stride - mb_cols); - mb_uv_offset += mb_uv_height * (f->uv_stride - mb_cols); + mb_uv_offset += mb_uv_height * f->uv_stride - mb_uv_width * mb_cols; } // Restore input state @@ -368,154 +355,32 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, mbd->plane[i].pre[0].buf = input_buffer[i]; } -void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) { - VP9_COMMON *const cm = &cpi->common; - - int frame = 0; - - int frames_to_blur_backward = 0; - int frames_to_blur_forward = 0; - int frames_to_blur = 0; - int start_frame = 0; - - int strength = cpi->active_arnr_strength; - int blur_type = cpi->oxcf.arnr_type; - int max_frames = cpi->active_arnr_frames; - - const int num_frames_backward = distance; - const int num_frames_forward = vp9_lookahead_depth(cpi->lookahead) - - (num_frames_backward + 1); - struct scale_factors sf; - - switch (blur_type) { - case 1: - // Backward Blur - frames_to_blur_backward = num_frames_backward; - - if (frames_to_blur_backward >= max_frames) - frames_to_blur_backward = max_frames - 1; - - frames_to_blur = frames_to_blur_backward + 1; - break; - - case 2: - // Forward Blur - frames_to_blur_forward = num_frames_forward; - - if (frames_to_blur_forward >= max_frames) - frames_to_blur_forward = max_frames - 1; - - frames_to_blur = 
frames_to_blur_forward + 1; - break; - - case 3: - default: - // Center Blur - frames_to_blur_forward = num_frames_forward; - frames_to_blur_backward = num_frames_backward; - - if (frames_to_blur_forward > frames_to_blur_backward) - frames_to_blur_forward = frames_to_blur_backward; - - if (frames_to_blur_backward > frames_to_blur_forward) - frames_to_blur_backward = frames_to_blur_forward; - - // When max_frames is even we have 1 more frame backward than forward - if (frames_to_blur_forward > (max_frames - 1) / 2) - frames_to_blur_forward = ((max_frames - 1) / 2); - - if (frames_to_blur_backward > (max_frames / 2)) - frames_to_blur_backward = (max_frames / 2); - - frames_to_blur = frames_to_blur_backward + frames_to_blur_forward + 1; - break; - } - - start_frame = distance + frames_to_blur_forward; - -#ifdef DEBUGFWG - // DEBUG FWG - printf( - "max:%d FBCK:%d FFWD:%d ftb:%d ftbbck:%d ftbfwd:%d sei:%d lasei:%d " - "start:%d", - max_frames, num_frames_backward, num_frames_forward, frames_to_blur, - frames_to_blur_backward, frames_to_blur_forward, cpi->source_encode_index, - cpi->last_alt_ref_sei, start_frame); -#endif - - // Setup scaling factors. Scaling on each of the arnr frames is not supported - vp9_setup_scale_factors_for_frame(&sf, - get_frame_new_buffer(cm)->y_crop_width, - get_frame_new_buffer(cm)->y_crop_height, - cm->width, cm->height); - - // Setup frame pointers, NULL indicates frame not included in filter - vp9_zero(cpi->frames); - for (frame = 0; frame < frames_to_blur; frame++) { - int which_buffer = start_frame - frame; - struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead, - which_buffer); - cpi->frames[frames_to_blur - 1 - frame] = &buf->img; - } +// Apply buffer limits and context specific adjustments to arnr filter. 
+static void adjust_arnr_filter(VP9_COMP *cpi, + int distance, int group_boost) { + const int frames_after_arf = + vp9_lookahead_depth(cpi->lookahead) - distance - 1; + int frames_fwd = (cpi->oxcf.arnr_max_frames - 1) >> 1; + int frames_bwd; + int q; - temporal_filter_iterate_c(cpi, frames_to_blur, frames_to_blur_backward, - strength, &sf); -} + // Define the forward and backwards filter limits for this arnr group. + if (frames_fwd > frames_after_arf) + frames_fwd = frames_after_arf; + if (frames_fwd > distance) + frames_fwd = distance; -void vp9_configure_arnr_filter(VP9_COMP *cpi, - const unsigned int frames_to_arnr, - const int group_boost) { - int half_gf_int; - int frames_after_arf; - int frames_bwd = cpi->oxcf.arnr_max_frames - 1; - int frames_fwd = cpi->oxcf.arnr_max_frames - 1; - int q; + frames_bwd = frames_fwd; - // Define the arnr filter width for this group of frames. We only - // filter frames that lie within a distance of half the GF interval - // from the ARF frame. We also have to trap cases where the filter - // extends beyond the end of the lookahead buffer. - // Note: frames_to_arnr parameter is the offset of the arnr - // frame from the current frame. - half_gf_int = cpi->rc.baseline_gf_interval >> 1; - frames_after_arf = vp9_lookahead_depth(cpi->lookahead) - - frames_to_arnr - 1; - - switch (cpi->oxcf.arnr_type) { - case 1: // Backward filter - frames_fwd = 0; - if (frames_bwd > half_gf_int) - frames_bwd = half_gf_int; - break; - - case 2: // Forward filter - if (frames_fwd > half_gf_int) - frames_fwd = half_gf_int; - if (frames_fwd > frames_after_arf) - frames_fwd = frames_after_arf; - frames_bwd = 0; - break; - - case 3: // Centered filter - default: - frames_fwd >>= 1; - if (frames_fwd > frames_after_arf) - frames_fwd = frames_after_arf; - if (frames_fwd > half_gf_int) - frames_fwd = half_gf_int; - - frames_bwd = frames_fwd; - - // For even length filter there is one more frame backward - // than forward: e.g. 
len=6 ==> bbbAff, len=7 ==> bbbAfff. - if (frames_bwd < half_gf_int) - frames_bwd += (cpi->oxcf.arnr_max_frames + 1) & 0x1; - break; - } + // For even length filter there is one more frame backward + // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff. + if (frames_bwd < distance) + frames_bwd += (cpi->oxcf.arnr_max_frames + 1) & 0x1; + // Set the baseline active filter size. cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd; - // Adjust the strength based on active max q + // Adjust the strength based on active max q. if (cpi->common.current_video_frame > 1) q = ((int)vp9_convert_qindex_to_q( cpi->rc.avg_frame_qindex[INTER_FRAME])); @@ -538,4 +403,79 @@ void vp9_configure_arnr_filter(VP9_COMP *cpi, if (cpi->active_arnr_strength > (group_boost / 300)) { cpi->active_arnr_strength = (group_boost / 300); } + + // Adjustments for second level arf in multi arf case. + if (cpi->oxcf.pass == 2 && cpi->multi_arf_allowed) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + if (gf_group->rf_level[gf_group->index] != GF_ARF_STD) { + cpi->active_arnr_strength >>= 1; + } + } +} + +void vp9_temporal_filter(VP9_COMP *cpi, int distance) { + VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + int frame; + int frames_to_blur; + int start_frame; + int strength; + int frames_to_blur_backward; + int frames_to_blur_forward; + struct scale_factors sf; + + // Apply context specific adjustments to the arnr filter parameters. + adjust_arnr_filter(cpi, distance, rc->gfu_boost); + strength = cpi->active_arnr_strength; + frames_to_blur = cpi->active_arnr_frames; + frames_to_blur_backward = (frames_to_blur / 2); + frames_to_blur_forward = ((frames_to_blur - 1) / 2); + start_frame = distance + frames_to_blur_forward; + + // Setup frame pointers, NULL indicates frame not included in filter. 
+ vp9_zero(cpi->frames); + for (frame = 0; frame < frames_to_blur; ++frame) { + const int which_buffer = start_frame - frame; + struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead, + which_buffer); + cpi->frames[frames_to_blur - 1 - frame] = &buf->img; + } + + // Setup scaling factors. Scaling on each of the arnr frames is not supported + if (is_spatial_svc(cpi)) { + // In spatial svc the scaling factors might be less then 1/2. So we will use + // non-normative scaling. + int frame_used = 0; + vp9_setup_scale_factors_for_frame(&sf, + get_frame_new_buffer(cm)->y_crop_width, + get_frame_new_buffer(cm)->y_crop_height, + get_frame_new_buffer(cm)->y_crop_width, + get_frame_new_buffer(cm)->y_crop_height); + + for (frame = 0; frame < frames_to_blur; ++frame) { + if (cm->mi_cols * MI_SIZE != cpi->frames[frame]->y_width || + cm->mi_rows * MI_SIZE != cpi->frames[frame]->y_height) { + if (vp9_realloc_frame_buffer(&cpi->svc.scaled_frames[frame_used], + cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, + VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, + NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to reallocate alt_ref_buffer"); + + cpi->frames[frame] = + vp9_scale_if_required(cm, cpi->frames[frame], + &cpi->svc.scaled_frames[frame_used]); + ++frame_used; + } + } + } else { + vp9_setup_scale_factors_for_frame(&sf, + get_frame_new_buffer(cm)->y_crop_width, + get_frame_new_buffer(cm)->y_crop_height, + cm->width, cm->height); + } + + temporal_filter_iterate_c(cpi, frames_to_blur, frames_to_blur_backward, + strength, &sf); } diff --git a/libvpx/vp9/encoder/vp9_temporal_filter.h b/libvpx/vp9/encoder/vp9_temporal_filter.h index 3028d7884..a971e0ae3 100644 --- a/libvpx/vp9/encoder/vp9_temporal_filter.h +++ b/libvpx/vp9/encoder/vp9_temporal_filter.h @@ -15,10 +15,8 @@ extern "C" { #endif -void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance); -void vp9_configure_arnr_filter(VP9_COMP *cpi, - const unsigned int frames_to_arnr, - const int 
group_boost); +void vp9_temporal_filter_init(); +void vp9_temporal_filter(VP9_COMP *cpi, int distance); #ifdef __cplusplus } // extern "C" diff --git a/libvpx/vp9/encoder/vp9_tokenize.c b/libvpx/vp9/encoder/vp9_tokenize.c index 291ccb37e..6068b85a0 100644 --- a/libvpx/vp9/encoder/vp9_tokenize.c +++ b/libvpx/vp9/encoder/vp9_tokenize.c @@ -20,7 +20,7 @@ #include "vp9/common/vp9_seg_common.h" #include "vp9/encoder/vp9_cost.h" -#include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_tokenize.h" static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2]; @@ -55,15 +55,6 @@ const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)] = { -CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 7 = CAT_FIVE }; -static const vp9_prob Pcat1[] = { 159}; -static const vp9_prob Pcat2[] = { 165, 145}; -static const vp9_prob Pcat3[] = { 173, 148, 140}; -static const vp9_prob Pcat4[] = { 176, 155, 140, 135}; -static const vp9_prob Pcat5[] = { 180, 157, 141, 134, 130}; -static const vp9_prob Pcat6[] = { - 254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129 -}; - static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28]; static void init_bit_tree(vp9_tree_index *p, int n) { @@ -87,18 +78,18 @@ static void init_bit_trees() { } const vp9_extra_bit vp9_extra_bits[ENTROPY_TOKENS] = { - {0, 0, 0, 0}, // ZERO_TOKEN - {0, 0, 0, 1}, // ONE_TOKEN - {0, 0, 0, 2}, // TWO_TOKEN - {0, 0, 0, 3}, // THREE_TOKEN - {0, 0, 0, 4}, // FOUR_TOKEN - {cat1, Pcat1, 1, 5}, // CATEGORY1_TOKEN - {cat2, Pcat2, 2, 7}, // CATEGORY2_TOKEN - {cat3, Pcat3, 3, 11}, // CATEGORY3_TOKEN - {cat4, Pcat4, 4, 19}, // CATEGORY4_TOKEN - {cat5, Pcat5, 5, 35}, // CATEGORY5_TOKEN - {cat6, Pcat6, 14, 67}, // CATEGORY6_TOKEN - {0, 0, 0, 0} // EOB_TOKEN + {0, 0, 0, 0}, // ZERO_TOKEN + {0, 0, 0, 1}, // ONE_TOKEN + {0, 0, 0, 2}, // TWO_TOKEN + {0, 0, 0, 3}, // THREE_TOKEN + {0, 0, 0, 4}, // FOUR_TOKEN + {cat1, vp9_cat1_prob, 1, CAT1_MIN_VAL}, // CATEGORY1_TOKEN + {cat2, 
vp9_cat2_prob, 2, CAT2_MIN_VAL}, // CATEGORY2_TOKEN + {cat3, vp9_cat3_prob, 3, CAT3_MIN_VAL}, // CATEGORY3_TOKEN + {cat4, vp9_cat4_prob, 4, CAT4_MIN_VAL}, // CATEGORY4_TOKEN + {cat5, vp9_cat5_prob, 5, CAT5_MIN_VAL}, // CATEGORY5_TOKEN + {cat6, vp9_cat6_prob, 14, CAT6_MIN_VAL}, // CATEGORY6_TOKEN + {0, 0, 0, 0} // EOB_TOKEN }; struct vp9_token vp9_coef_encodings[ENTROPY_TOKENS]; @@ -232,7 +223,6 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize, cpi->common.fc.coef_probs[tx_size][type][ref]; unsigned int (*const eob_branch)[COEFF_CONTEXTS] = cpi->common.counts.eob_branch[tx_size][type][ref]; - const uint8_t *const band = get_band_translate(tx_size); const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size); @@ -289,14 +279,17 @@ struct is_skippable_args { MACROBLOCK *x; int *skippable; }; - static void is_skippable(int plane, int block, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *argv) { struct is_skippable_args *args = argv; + (void)plane_bsize; + (void)tx_size; args->skippable[0] &= (!args->x->plane[plane].eobs[block]); } +// TODO(yaowu): rewrite and optimize this function to remove the usage of +// vp9_foreach_transform_block() and simplify is_skippable(). 
int vp9_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { int result = 1; struct is_skippable_args args = {x, &result}; diff --git a/libvpx/vp9/encoder/vp9_variance.c b/libvpx/vp9/encoder/vp9_variance.c index 71867a938..eb5ae2e41 100644 --- a/libvpx/vp9/encoder/vp9_variance.c +++ b/libvpx/vp9/encoder/vp9_variance.c @@ -18,63 +18,34 @@ #include "vp9/encoder/vp9_variance.h" -void variance(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - int w, - int h, - unsigned int *sse, - int *sum) { +void variance(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + int w, int h, unsigned int *sse, int *sum) { int i, j; - int diff; *sum = 0; *sse = 0; for (i = 0; i < h; i++) { for (j = 0; j < w; j++) { - diff = src_ptr[j] - ref_ptr[j]; + const int diff = a[j] - b[j]; *sum += diff; *sse += diff * diff; } - src_ptr += source_stride; - ref_ptr += recon_stride; + a += a_stride; + b += b_stride; } } -/**************************************************************************** - * - * ROUTINE : filter_block2d_bil_first_pass - * - * INPUTS : uint8_t *src_ptr : Pointer to source block. - * uint32_t src_pixels_per_line : Stride of input block. - * uint32_t pixel_step : Offset between filter input - * samples (see notes). - * uint32_t output_height : Input block height. - * uint32_t output_width : Input block width. - * int32_t *vp9_filter : Array of 2 bi-linear filter - * taps. - * - * OUTPUTS : int32_t *output_ptr : Pointer to filtered block. - * - * RETURNS : void - * - * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in - * either horizontal or vertical direction to produce the - * filtered output block. Used to implement first-pass - * of 2-D separable filter. - * - * SPECIAL NOTES : Produces int32_t output to retain precision for next pass. - * Two filter taps should sum to VP9_FILTER_WEIGHT. 
- * pixel_step defines whether the filter is applied - * horizontally (pixel_step=1) or vertically (pixel_step= - * stride). - * It defines the offset required to move from one input - * to the next. - * - ****************************************************************************/ +// Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal +// or vertical direction to produce the filtered output block. Used to implement +// first-pass of 2-D separable filter. +// +// Produces int32_t output to retain precision for next pass. Two filter taps +// should sum to VP9_FILTER_WEIGHT. pixel_step defines whether the filter is +// applied horizontally (pixel_step=1) or vertically (pixel_step=stride). It +// defines the offset required to move from one input to the next. static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr, uint16_t *output_ptr, unsigned int src_pixels_per_line, @@ -99,38 +70,14 @@ static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr, } } -/**************************************************************************** - * - * ROUTINE : filter_block2d_bil_second_pass - * - * INPUTS : int32_t *src_ptr : Pointer to source block. - * uint32_t src_pixels_per_line : Stride of input block. - * uint32_t pixel_step : Offset between filter input - * samples (see notes). - * uint32_t output_height : Input block height. - * uint32_t output_width : Input block width. - * int32_t *vp9_filter : Array of 2 bi-linear filter - * taps. - * - * OUTPUTS : uint16_t *output_ptr : Pointer to filtered block. - * - * RETURNS : void - * - * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in - * either horizontal or vertical direction to produce the - * filtered output block. Used to implement second-pass - * of 2-D separable filter. - * - * SPECIAL NOTES : Requires 32-bit input as produced by - * filter_block2d_bil_first_pass. - * Two filter taps should sum to VP9_FILTER_WEIGHT. 
- * pixel_step defines whether the filter is applied - * horizontally (pixel_step=1) or vertically (pixel_step= - * stride). - * It defines the offset required to move from one input - * to the next. - * - ****************************************************************************/ +// Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal +// or vertical direction to produce the filtered output block. Used to implement +// second-pass of 2-D separable filter. +// +// Requires 32-bit input as produced by filter_block2d_bil_first_pass. Two +// filter taps should sum to VP9_FILTER_WEIGHT. pixel_step defines whether the +// filter is applied horizontally (pixel_step=1) or vertically (pixel_step= +// stride). It defines the offset required to move from one input to the next. static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr, uint8_t *output_ptr, unsigned int src_pixels_per_line, @@ -156,949 +103,154 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr, unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) { unsigned int i, sum = 0; - for (i = 0; i < 256; i++) { - sum += (src_ptr[i] * src_ptr[i]); - } + for (i = 0; i < 256; i++) + sum += src_ptr[i] * src_ptr[i]; return sum; } -unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32, &var, &avg); - *sse = var; - return (var - (((int64_t)avg * avg) >> 11)); -} - -unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering - uint8_t temp2[68 * 64]; - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); 
- - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 33, 64, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 32, 64, vfilter); - - return vp9_variance64x32(temp2, 64, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse, - const uint8_t *second_pred) { - uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering - uint8_t temp2[68 * 64]; - DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64); // compound pred buffer - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 33, 64, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 32, 64, vfilter); - vp9_comp_avg_pred(temp3, second_pred, 64, 32, temp2, 64); - return vp9_variance64x32(temp3, 64, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 32, 64, &var, &avg); - *sse = var; - return (var - (((int64_t)avg * avg) >> 11)); -} - -unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering - uint8_t temp2[68 * 64]; - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 65, 32, hfilter); - var_filter_block2d_bil_second_pass(fdata3, 
temp2, 32, 32, 64, 32, vfilter); - - return vp9_variance32x64(temp2, 32, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse, - const uint8_t *second_pred) { - uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering - uint8_t temp2[68 * 64]; - DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 64); // compound pred buffer - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 65, 32, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 64, 32, vfilter); - vp9_comp_avg_pred(temp3, second_pred, 32, 64, temp2, 32); - return vp9_variance32x64(temp3, 32, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16, &var, &avg); - *sse = var; - return (var - (((int64_t)avg * avg) >> 9)); -} - -unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering - uint8_t temp2[36 * 32]; - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 17, 32, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 16, 32, vfilter); - - return vp9_variance32x16(temp2, 32, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int 
vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse, - const uint8_t *second_pred) { - uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering - uint8_t temp2[36 * 32]; - DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 16); // compound pred buffer - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 17, 32, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 16, 32, vfilter); - vp9_comp_avg_pred(temp3, second_pred, 32, 16, temp2, 32); - return vp9_variance32x16(temp3, 32, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 32, &var, &avg); - *sse = var; - return (var - (((int64_t)avg * avg) >> 9)); -} - -unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering - uint8_t temp2[36 * 32]; - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 33, 16, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 32, 16, vfilter); - - return vp9_variance16x32(temp2, 16, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int 
dst_pixels_per_line, - unsigned int *sse, - const uint8_t *second_pred) { - uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering - uint8_t temp2[36 * 32]; - DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 32); // compound pred buffer - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 33, 16, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 32, 16, vfilter); - vp9_comp_avg_pred(temp3, second_pred, 16, 32, temp2, 16); - return vp9_variance16x32(temp3, 16, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64, &var, &avg); - *sse = var; - return (var - (((int64_t)avg * avg) >> 12)); -} - -unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32, &var, &avg); - *sse = var; - return (var - (((int64_t)avg * avg) >> 10)); -} - -void vp9_get_sse_sum_16x16_c(const uint8_t *src_ptr, int source_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse, int *sum) { - variance(src_ptr, source_stride, ref_ptr, ref_stride, 16, 16, sse, sum); -} - -unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 8)); -} - -unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, 
- int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); -} - -unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); -} - -void vp9_get_sse_sum_8x8_c(const uint8_t *src_ptr, int source_stride, +#define VAR(W, H) \ +unsigned int vp9_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + unsigned int *sse) { \ + int sum; \ + variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ +} + +#define SUBPIX_VAR(W, H) \ +unsigned int vp9_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, \ + int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ +\ + var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \ + BILINEAR_FILTERS_2TAP(xoffset)); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + BILINEAR_FILTERS_2TAP(yoffset)); \ +\ + return vp9_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \ +} + +#define SUBPIX_AVG_VAR(W, H) \ +unsigned int vp9_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, \ + int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, \ + unsigned int *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, H * W); \ +\ + var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \ + BILINEAR_FILTERS_2TAP(xoffset)); \ + 
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + BILINEAR_FILTERS_2TAP(yoffset)); \ +\ + vp9_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ +\ + return vp9_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \ +} + +void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum) { - variance(src_ptr, source_stride, ref_ptr, ref_stride, 8, 8, sse, sum); -} - -unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 6)); -} - -unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 4, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 5)); -} - -unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 8, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 5)); + variance(src_ptr, source_stride, ref_ptr, ref_stride, 16, 16, sse, sum); } -unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 4)); +void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance(src_ptr, 
source_stride, ref_ptr, ref_stride, 8, 8, sse, sum); } - -unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, +unsigned int vp9_mse16x16_c(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg); - *sse = var; - return var; -} - -unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg); - *sse = var; - return var; + int sum; + variance(src, src_stride, ref, ref_stride, 16, 16, sse, &sum); + return *sse; } -unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, +unsigned int vp9_mse16x8_c(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg); - *sse = var; - return var; -} - -unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg); - *sse = var; - return var; -} - - -unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - uint8_t temp2[20 * 16]; - const int16_t *hfilter, *vfilter; - uint16_t fdata3[5 * 4]; // Temp data buffer used in filtering - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - // First filter 1d Horizontal - var_filter_block2d_bil_first_pass(src_ptr, fdata3, 
src_pixels_per_line, - 1, 5, 4, hfilter); - - // Now filter Verticaly - var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, vfilter); - - return vp9_variance4x4(temp2, 4, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse, - const uint8_t *second_pred) { - uint8_t temp2[20 * 16]; - const int16_t *hfilter, *vfilter; - DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 4 * 4); // compound pred buffer - uint16_t fdata3[5 * 4]; // Temp data buffer used in filtering - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - // First filter 1d Horizontal - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 5, 4, hfilter); - - // Now filter Verticaly - var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, vfilter); - vp9_comp_avg_pred(temp3, second_pred, 4, 4, temp2, 4); - return vp9_variance4x4(temp3, 4, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - uint16_t fdata3[9 * 8]; // Temp data buffer used in filtering - uint8_t temp2[20 * 16]; - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 9, 8, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 8, 8, vfilter); - - return vp9_variance8x8(temp2, 8, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse, - const uint8_t 
*second_pred) { - uint16_t fdata3[9 * 8]; // Temp data buffer used in filtering - uint8_t temp2[20 * 16]; - DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 8); // compound pred buffer - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 9, 8, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 8, 8, vfilter); - vp9_comp_avg_pred(temp3, second_pred, 8, 8, temp2, 8); - return vp9_variance8x8(temp3, 8, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - uint16_t fdata3[17 * 16]; // Temp data buffer used in filtering - uint8_t temp2[20 * 16]; - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 17, 16, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 16, 16, vfilter); - - return vp9_variance16x16(temp2, 16, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse, - const uint8_t *second_pred) { - uint16_t fdata3[17 * 16]; - uint8_t temp2[20 * 16]; - DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 16); // compound pred buffer - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 17, 16, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 16, 16, vfilter); - - vp9_comp_avg_pred(temp3, 
second_pred, 16, 16, temp2, 16); - return vp9_variance16x16(temp3, 16, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering - uint8_t temp2[68 * 64]; - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 65, 64, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 64, 64, vfilter); - - return vp9_variance64x64(temp2, 64, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse, - const uint8_t *second_pred) { - uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering - uint8_t temp2[68 * 64]; - DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64); // compound pred buffer - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 65, 64, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 64, 64, vfilter); - vp9_comp_avg_pred(temp3, second_pred, 64, 64, temp2, 64); - return vp9_variance64x64(temp3, 64, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering - uint8_t temp2[36 * 32]; - const int16_t *hfilter, *vfilter; - - hfilter = 
BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 33, 32, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 32, 32, vfilter); - - return vp9_variance32x32(temp2, 32, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse, - const uint8_t *second_pred) { - uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering - uint8_t temp2[36 * 32]; - DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 32); // compound pred buffer - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 33, 32, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 32, 32, vfilter); - vp9_comp_avg_pred(temp3, second_pred, 32, 32, temp2, 32); - return vp9_variance32x32(temp3, 32, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_variance_halfpixvar16x16_h_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 8, 0, - ref_ptr, recon_stride, sse); -} - -unsigned int vp9_variance_halfpixvar32x32_h_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 0, - ref_ptr, recon_stride, sse); -} - -unsigned int vp9_variance_halfpixvar64x64_h_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance64x64_c(src_ptr, source_stride, 8, 0, - ref_ptr, recon_stride, sse); -} - -unsigned int 
vp9_variance_halfpixvar16x16_v_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 8, - ref_ptr, recon_stride, sse); -} - -unsigned int vp9_variance_halfpixvar32x32_v_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 0, 8, - ref_ptr, recon_stride, sse); -} - -unsigned int vp9_variance_halfpixvar64x64_v_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance64x64_c(src_ptr, source_stride, 0, 8, - ref_ptr, recon_stride, sse); -} - -unsigned int vp9_variance_halfpixvar16x16_hv_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 8, 8, - ref_ptr, recon_stride, sse); -} - -unsigned int vp9_variance_halfpixvar32x32_hv_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 8, - ref_ptr, recon_stride, sse); -} - -unsigned int vp9_variance_halfpixvar64x64_hv_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance64x64_c(src_ptr, source_stride, 8, 8, - ref_ptr, recon_stride, sse); -} - -unsigned int vp9_sub_pixel_mse16x16_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - vp9_sub_pixel_variance16x16_c(src_ptr, src_pixels_per_line, - xoffset, yoffset, dst_ptr, - dst_pixels_per_line, sse); + int sum; + variance(src, src_stride, ref, ref_stride, 16, 8, sse, &sum); return *sse; } 
-unsigned int vp9_sub_pixel_mse32x32_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - vp9_sub_pixel_variance32x32_c(src_ptr, src_pixels_per_line, - xoffset, yoffset, dst_ptr, - dst_pixels_per_line, sse); +unsigned int vp9_mse8x16_c(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance(src, src_stride, ref, ref_stride, 8, 16, sse, &sum); return *sse; } -unsigned int vp9_sub_pixel_mse64x64_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - vp9_sub_pixel_variance64x64_c(src_ptr, src_pixels_per_line, - xoffset, yoffset, dst_ptr, - dst_pixels_per_line, sse); +unsigned int vp9_mse8x8_c(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance(src, src_stride, ref, ref_stride, 8, 8, sse, &sum); return *sse; } -unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - uint16_t fdata3[16 * 9]; // Temp data buffer used in filtering - uint8_t temp2[20 * 16]; - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); +VAR(4, 4) +SUBPIX_VAR(4, 4) +SUBPIX_AVG_VAR(4, 4) - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 9, 16, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 8, 16, vfilter); +VAR(4, 8) +SUBPIX_VAR(4, 8) +SUBPIX_AVG_VAR(4, 8) - return vp9_variance16x8(temp2, 16, dst_ptr, dst_pixels_per_line, sse); -} +VAR(8, 4) +SUBPIX_VAR(8, 4) +SUBPIX_AVG_VAR(8, 4) -unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int 
yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse, - const uint8_t *second_pred) { - uint16_t fdata3[16 * 9]; // Temp data buffer used in filtering - uint8_t temp2[20 * 16]; - DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 8); // compound pred buffer - const int16_t *hfilter, *vfilter; +VAR(8, 8) +SUBPIX_VAR(8, 8) +SUBPIX_AVG_VAR(8, 8) - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); +VAR(8, 16) +SUBPIX_VAR(8, 16) +SUBPIX_AVG_VAR(8, 16) - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 9, 16, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 8, 16, vfilter); - vp9_comp_avg_pred(temp3, second_pred, 16, 8, temp2, 16); - return vp9_variance16x8(temp3, 16, dst_ptr, dst_pixels_per_line, sse); -} +VAR(16, 8) +SUBPIX_VAR(16, 8) +SUBPIX_AVG_VAR(16, 8) -unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - uint16_t fdata3[9 * 16]; // Temp data buffer used in filtering - uint8_t temp2[20 * 16]; - const int16_t *hfilter, *vfilter; +VAR(16, 16) +SUBPIX_VAR(16, 16) +SUBPIX_AVG_VAR(16, 16) - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 17, 8, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 16, 8, vfilter); - - return vp9_variance8x16(temp2, 8, dst_ptr, dst_pixels_per_line, sse); -} +VAR(16, 32) +SUBPIX_VAR(16, 32) +SUBPIX_AVG_VAR(16, 32) -unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse, - const uint8_t *second_pred) { - uint16_t fdata3[9 * 16]; // Temp data buffer used in filtering - uint8_t temp2[20 * 16]; - 
DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 16); // compound pred buffer - const int16_t *hfilter, *vfilter; +VAR(32, 16) +SUBPIX_VAR(32, 16) +SUBPIX_AVG_VAR(32, 16) - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); +VAR(32, 32) +SUBPIX_VAR(32, 32) +SUBPIX_AVG_VAR(32, 32) - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 17, 8, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 16, 8, vfilter); - vp9_comp_avg_pred(temp3, second_pred, 8, 16, temp2, 8); - return vp9_variance8x16(temp3, 8, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - uint16_t fdata3[8 * 5]; // Temp data buffer used in filtering - uint8_t temp2[20 * 16]; - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 5, 8, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 4, 8, vfilter); - - return vp9_variance8x4(temp2, 8, dst_ptr, dst_pixels_per_line, sse); -} +VAR(32, 64) +SUBPIX_VAR(32, 64) +SUBPIX_AVG_VAR(32, 64) -unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse, - const uint8_t *second_pred) { - uint16_t fdata3[8 * 5]; // Temp data buffer used in filtering - uint8_t temp2[20 * 16]; - DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 4); // compound pred buffer - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 5, 8, hfilter); - 
var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 4, 8, vfilter); - vp9_comp_avg_pred(temp3, second_pred, 8, 4, temp2, 8); - return vp9_variance8x4(temp3, 8, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - uint16_t fdata3[5 * 8]; // Temp data buffer used in filtering - // FIXME(jingning,rbultje): this temp2 buffer probably doesn't need to be - // of this big? same issue appears in all other block size settings. - uint8_t temp2[20 * 16]; - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 9, 4, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 8, 4, vfilter); - - return vp9_variance4x8(temp2, 4, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse, - const uint8_t *second_pred) { - uint16_t fdata3[5 * 8]; // Temp data buffer used in filtering - uint8_t temp2[20 * 16]; - DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 4 * 8); // compound pred buffer - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 9, 4, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 8, 4, vfilter); - vp9_comp_avg_pred(temp3, second_pred, 4, 8, temp2, 4); - return vp9_variance4x8(temp3, 4, dst_ptr, dst_pixels_per_line, sse); -} +VAR(64, 32) +SUBPIX_VAR(64, 32) +SUBPIX_AVG_VAR(64, 32) +VAR(64, 64) +SUBPIX_VAR(64, 64) +SUBPIX_AVG_VAR(64, 64) void vp9_comp_avg_pred(uint8_t *comp_pred, 
const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride) { @@ -1106,9 +258,8 @@ void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width, for (i = 0; i < height; i++) { for (j = 0; j < width; j++) { - int tmp; - tmp = pred[j] + ref[j]; - comp_pred[j] = (tmp + 1) >> 1; + const int tmp = pred[j] + ref[j]; + comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); } comp_pred += width; pred += width; diff --git a/libvpx/vp9/encoder/vp9_variance.h b/libvpx/vp9/encoder/vp9_variance.h index 62e20dc00..4a194b72c 100644 --- a/libvpx/vp9/encoder/vp9_variance.h +++ b/libvpx/vp9/encoder/vp9_variance.h @@ -17,27 +17,21 @@ extern "C" { #endif -void variance(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - int w, - int h, - unsigned int *sse, - int *sum); +void variance(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + int w, int h, + unsigned int *sse, int *sum); typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, - int ref_stride, - unsigned int max_sad); + int ref_stride); typedef unsigned int(*vp9_sad_avg_fn_t)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - unsigned int max_sad); + const uint8_t *second_pred); typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr, int source_stride, @@ -45,12 +39,6 @@ typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr, int ref_stride, unsigned int *sad_array); -typedef void (*vp9_sad_multi1_fn_t)(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int *sad_array); - typedef void (*vp9_sad_multi_d_fn_t)(const uint8_t *src_ptr, int source_stride, const uint8_t* const ref_ptr[], @@ -79,24 +67,14 @@ typedef unsigned int (*vp9_subp_avg_variance_fn_t)(const uint8_t *src_ptr, unsigned int *sse, const uint8_t *second_pred); -typedef unsigned int (*vp9_getmbss_fn_t)(const short *); - 
-typedef unsigned int (*vp9_get16x16prederror_fn_t)(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int ref_stride); - typedef struct vp9_variance_vtable { vp9_sad_fn_t sdf; vp9_sad_avg_fn_t sdaf; vp9_variance_fn_t vf; vp9_subpixvariance_fn_t svf; vp9_subp_avg_variance_fn_t svaf; - vp9_variance_fn_t svf_halfpix_h; - vp9_variance_fn_t svf_halfpix_v; - vp9_variance_fn_t svf_halfpix_hv; vp9_sad_multi_fn_t sdx3f; - vp9_sad_multi1_fn_t sdx8f; + vp9_sad_multi_fn_t sdx8f; vp9_sad_multi_d_fn_t sdx4df; } vp9_variance_fn_ptr_t; diff --git a/libvpx/vp9/encoder/vp9_write_bit_buffer.c b/libvpx/vp9/encoder/vp9_write_bit_buffer.c index 962d0ca56..6d55e84e8 100644 --- a/libvpx/vp9/encoder/vp9_write_bit_buffer.c +++ b/libvpx/vp9/encoder/vp9_write_bit_buffer.c @@ -8,9 +8,10 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <limits.h> #include "vp9/encoder/vp9_write_bit_buffer.h" -size_t vp9_rb_bytes_written(struct vp9_write_bit_buffer *wb) { +size_t vp9_wb_bytes_written(const struct vp9_write_bit_buffer *wb) { return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0); } diff --git a/libvpx/vp9/encoder/vp9_write_bit_buffer.h b/libvpx/vp9/encoder/vp9_write_bit_buffer.h index 073608d7f..59f9bbe30 100644 --- a/libvpx/vp9/encoder/vp9_write_bit_buffer.h +++ b/libvpx/vp9/encoder/vp9_write_bit_buffer.h @@ -11,8 +11,6 @@ #ifndef VP9_ENCODER_VP9_WRITE_BIT_BUFFER_H_ #define VP9_ENCODER_VP9_WRITE_BIT_BUFFER_H_ -#include <limits.h> - #include "vpx/vpx_integer.h" #ifdef __cplusplus @@ -24,7 +22,7 @@ struct vp9_write_bit_buffer { size_t bit_offset; }; -size_t vp9_rb_bytes_written(struct vp9_write_bit_buffer *wb); +size_t vp9_wb_bytes_written(const struct vp9_write_bit_buffer *wb); void vp9_wb_write_bit(struct vp9_write_bit_buffer *wb, int bit); diff --git a/libvpx/vp9/encoder/vp9_writer.c b/libvpx/vp9/encoder/vp9_writer.c index 8398fc07a..ff461f218 100644 --- a/libvpx/vp9/encoder/vp9_writer.c +++ b/libvpx/vp9/encoder/vp9_writer.c @@ 
-15,7 +15,6 @@ void vp9_start_encode(vp9_writer *br, uint8_t *source) { br->lowvalue = 0; br->range = 255; - br->value = 0; br->count = -24; br->buffer = source; br->pos = 0; diff --git a/libvpx/vp9/encoder/vp9_writer.h b/libvpx/vp9/encoder/vp9_writer.h index 7f4fa1ef2..9d161f95c 100644 --- a/libvpx/vp9/encoder/vp9_writer.h +++ b/libvpx/vp9/encoder/vp9_writer.h @@ -22,20 +22,15 @@ extern "C" { typedef struct { unsigned int lowvalue; unsigned int range; - unsigned int value; int count; unsigned int pos; uint8_t *buffer; - - // Variables used to track bit costs without outputing to the bitstream - unsigned int measure_cost; - uint64_t bit_counter; } vp9_writer; void vp9_start_encode(vp9_writer *bc, uint8_t *buffer); void vp9_stop_encode(vp9_writer *bc); -static void vp9_write(vp9_writer *br, int bit, int probability) { +static INLINE void vp9_write(vp9_writer *br, int bit, int probability) { unsigned int split; int count = br->count; unsigned int range = br->range; @@ -83,11 +78,11 @@ static void vp9_write(vp9_writer *br, int bit, int probability) { br->range = range; } -static void vp9_write_bit(vp9_writer *w, int bit) { +static INLINE void vp9_write_bit(vp9_writer *w, int bit) { vp9_write(w, bit, 128); // vp9_prob_half } -static void vp9_write_literal(vp9_writer *w, int data, int bits) { +static INLINE void vp9_write_literal(vp9_writer *w, int data, int bits) { int bit; for (bit = bits - 1; bit >= 0; bit--) diff --git a/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c b/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c index 2d59775ce..42fdbbdc5 100644 --- a/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c +++ b/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c @@ -12,6 +12,9 @@ #include "vp9/common/vp9_idct.h" // for cospi constants #include "vpx_ports/mem.h" +#define pair_set_epi32(a, b) \ + _mm_set_epi32(b, a, b, a) + #if FDCT32x32_HIGH_PRECISION static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) { __m128i buf0, buf1; diff --git a/libvpx/vp9/encoder/x86/vp9_dct_avx2.c 
b/libvpx/vp9/encoder/x86/vp9_dct_avx2.c index b5269ed03..3a19f5274 100644 --- a/libvpx/vp9/encoder/x86/vp9_dct_avx2.c +++ b/libvpx/vp9/encoder/x86/vp9_dct_avx2.c @@ -12,2572 +12,6 @@ #include "vp9/common/vp9_idct.h" // for cospi constants #include "vpx_ports/mem.h" -void vp9_fdct4x4_avx2(const int16_t *input, int16_t *output, int stride) { - // The 2D transform is done with two passes which are actually pretty - // similar. In the first one, we transform the columns and transpose - // the results. In the second one, we transform the rows. To achieve that, - // as the first pass results are transposed, we transpose the columns (that - // is the transposed rows) and transpose the results (so that it goes back - // in normal/row positions). - int pass; - // Constants - // When we use them, in one case, they are all the same. In all others - // it's a pair of them that we need to repeat four times. This is done - // by constructing the 32 bit constant corresponding to that pair. - const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); - const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); - const __m128i kOne = _mm_set1_epi16(1); - __m128i in0, in1, in2, in3; - // Load inputs. 
- { - in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); - in3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); - // x = x << 4 - in0 = _mm_slli_epi16(in0, 4); - in1 = _mm_slli_epi16(in1, 4); - in2 = _mm_slli_epi16(in2, 4); - in3 = _mm_slli_epi16(in3, 4); - // if (i == 0 && input[0]) input[0] += 1; - { - // The mask will only contain whether the first value is zero, all - // other comparison will fail as something shifted by 4 (above << 4) - // can never be equal to one. To increment in the non-zero case, we - // add the mask and one for the first element: - // - if zero, mask = -1, v = v - 1 + 1 = v - // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 - __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a); - in0 = _mm_add_epi16(in0, mask); - in0 = _mm_add_epi16(in0, k__nonzero_bias_b); - } - } - // Do the two transform/transpose passes - for (pass = 0; pass < 2; ++pass) { - // Transform 1/2: Add/subtract - const __m128i r0 = _mm_add_epi16(in0, in3); - const __m128i r1 = _mm_add_epi16(in1, in2); - const __m128i r2 = _mm_sub_epi16(in1, in2); - const __m128i r3 = _mm_sub_epi16(in0, in3); - // Transform 1/2: Interleave to do the multiply by constants which gets us - // into 32 bits. 
- const __m128i t0 = _mm_unpacklo_epi16(r0, r1); - const __m128i t2 = _mm_unpacklo_epi16(r2, r3); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); - const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); - const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - // Combine and transpose - const __m128i res0 = _mm_packs_epi32(w0, w2); - const __m128i res1 = _mm_packs_epi32(w4, w6); - // 00 01 02 03 20 21 22 23 - // 10 11 12 13 30 31 32 33 - const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); - const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - in2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - // 00 10 20 30 01 11 21 31 in0 contains 0 followed by 1 - // 02 12 22 32 03 13 23 33 in2 contains 2 followed by 3 - if (0 == pass) { - // Extract values in the high part for second pass as transform code - // only uses the first four values. - in1 = _mm_unpackhi_epi64(in0, in0); - in3 = _mm_unpackhi_epi64(in2, in2); - } else { - // Post-condition output and store it (v + 1) >> 2, taking advantage - // of the fact 1/3 are stored just after 0/2. 
- __m128i out01 = _mm_add_epi16(in0, kOne); - __m128i out23 = _mm_add_epi16(in2, kOne); - out01 = _mm_srai_epi16(out01, 2); - out23 = _mm_srai_epi16(out23, 2); - _mm_storeu_si128((__m128i *)(output + 0 * 4), out01); - _mm_storeu_si128((__m128i *)(output + 2 * 4), out23); - } - } -} - -static INLINE void load_buffer_4x4_avx2(const int16_t *input, __m128i *in, - int stride) { - const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); - const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); - __m128i mask; - - in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); - in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); - - in[0] = _mm_slli_epi16(in[0], 4); - in[1] = _mm_slli_epi16(in[1], 4); - in[2] = _mm_slli_epi16(in[2], 4); - in[3] = _mm_slli_epi16(in[3], 4); - - mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a); - in[0] = _mm_add_epi16(in[0], mask); - in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b); -} - -static INLINE void write_buffer_4x4_avx2(int16_t *output, __m128i *res) { - const __m128i kOne = _mm_set1_epi16(1); - __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]); - __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]); - __m128i out01 = _mm_add_epi16(in01, kOne); - __m128i out23 = _mm_add_epi16(in23, kOne); - out01 = _mm_srai_epi16(out01, 2); - out23 = _mm_srai_epi16(out23, 2); - _mm_store_si128((__m128i *)(output + 0 * 8), out01); - _mm_store_si128((__m128i *)(output + 1 * 8), out23); -} - -static INLINE void transpose_4x4_avx2(__m128i *res) { - // Combine and transpose - // 00 01 02 03 20 21 22 23 - // 10 11 12 13 30 31 32 33 - const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); - const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); - - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); - res[2] = 
_mm_unpackhi_epi32(tr0_0, tr0_1); - - // 00 10 20 30 01 11 21 31 - // 02 12 22 32 03 13 23 33 - // only use the first 4 16-bit integers - res[1] = _mm_unpackhi_epi64(res[0], res[0]); - res[3] = _mm_unpackhi_epi64(res[2], res[2]); -} - -void fdct4_avx2(__m128i *in) { - const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - __m128i u[4], v[4]; - u[0]=_mm_unpacklo_epi16(in[0], in[1]); - u[1]=_mm_unpacklo_epi16(in[3], in[2]); - - v[0] = _mm_add_epi16(u[0], u[1]); - v[1] = _mm_sub_epi16(u[0], u[1]); - - u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0 - u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2 - u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1 - u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3 - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u[0], u[1]); - in[1] = _mm_packs_epi32(u[2], u[3]); - transpose_4x4_avx2(in); -} - -void fadst4_avx2(__m128i *in) { - const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9); - const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9); - const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9); - const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9); - const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9); - const __m128i kZero = _mm_set1_epi16(0); - const __m128i 
k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u[8], v[8]; - __m128i in7 = _mm_add_epi16(in[0], in[1]); - - u[0] = _mm_unpacklo_epi16(in[0], in[1]); - u[1] = _mm_unpacklo_epi16(in[2], in[3]); - u[2] = _mm_unpacklo_epi16(in7, kZero); - u[3] = _mm_unpacklo_epi16(in[2], kZero); - u[4] = _mm_unpacklo_epi16(in[3], kZero); - - v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2 - v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5 - v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1 - v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3 - v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6 - v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4 - v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03); - - u[0] = _mm_add_epi32(v[0], v[1]); - u[1] = _mm_sub_epi32(v[2], v[6]); - u[2] = _mm_add_epi32(v[3], v[4]); - u[3] = _mm_sub_epi32(u[2], u[0]); - u[4] = _mm_slli_epi32(v[5], 2); - u[5] = _mm_sub_epi32(u[4], v[5]); - u[6] = _mm_add_epi32(u[3], u[5]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u[0], u[2]); - in[1] = _mm_packs_epi32(u[1], u[3]); - transpose_4x4_avx2(in); -} - -void vp9_fht4x4_avx2(const int16_t *input, int16_t *output, - int stride, int tx_type) { - __m128i in[4]; - - switch (tx_type) { - case DCT_DCT: - vp9_fdct4x4_avx2(input, output, stride); - break; - case ADST_DCT: - load_buffer_4x4_avx2(input, in, stride); - fadst4_avx2(in); - fdct4_avx2(in); - write_buffer_4x4_avx2(output, in); - break; - case DCT_ADST: - load_buffer_4x4_avx2(input, in, stride); - fdct4_avx2(in); - fadst4_avx2(in); - write_buffer_4x4_avx2(output, in); - break; - 
case ADST_ADST: - load_buffer_4x4_avx2(input, in, stride); - fadst4_avx2(in); - fadst4_avx2(in); - write_buffer_4x4_avx2(output, in); - break; - default: - assert(0); - break; - } -} - -void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) { - int pass; - // Constants - // When we use them, in one case, they are all the same. In all others - // it's a pair of them that we need to repeat four times. This is done - // by constructing the 32 bit constant corresponding to that pair. - const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - // Load input - __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); - __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); - __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); - __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); - __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); - __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); - __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); - __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); - // Pre-condition input (shift by two) - in0 = _mm_slli_epi16(in0, 2); - in1 = _mm_slli_epi16(in1, 2); - in2 = _mm_slli_epi16(in2, 2); - in3 = _mm_slli_epi16(in3, 2); - in4 = _mm_slli_epi16(in4, 2); - in5 = _mm_slli_epi16(in5, 2); - 
in6 = _mm_slli_epi16(in6, 2); - in7 = _mm_slli_epi16(in7, 2); - - // We do two passes, first the columns, then the rows. The results of the - // first pass are transposed so that the same column code can be reused. The - // results of the second pass are also transposed so that the rows (processed - // as columns) are put back in row positions. - for (pass = 0; pass < 2; pass++) { - // To store results of each pass before the transpose. - __m128i res0, res1, res2, res3, res4, res5, res6, res7; - // Add/subtract - const __m128i q0 = _mm_add_epi16(in0, in7); - const __m128i q1 = _mm_add_epi16(in1, in6); - const __m128i q2 = _mm_add_epi16(in2, in5); - const __m128i q3 = _mm_add_epi16(in3, in4); - const __m128i q4 = _mm_sub_epi16(in3, in4); - const __m128i q5 = _mm_sub_epi16(in2, in5); - const __m128i q6 = _mm_sub_epi16(in1, in6); - const __m128i q7 = _mm_sub_epi16(in0, in7); - // Work on first four results - { - // Add/subtract - const __m128i r0 = _mm_add_epi16(q0, q3); - const __m128i r1 = _mm_add_epi16(q1, q2); - const __m128i r2 = _mm_sub_epi16(q1, q2); - const __m128i r3 = _mm_sub_epi16(q0, q3); - // Interleave to do the multiply by constants which gets us into 32bits - const __m128i t0 = _mm_unpacklo_epi16(r0, r1); - const __m128i t1 = _mm_unpackhi_epi16(r0, r1); - const __m128i t2 = _mm_unpacklo_epi16(r2, r3); - const __m128i t3 = _mm_unpackhi_epi16(r2, r3); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); - const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); - const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); - const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); - const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); - const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, 
k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - // Combine - res0 = _mm_packs_epi32(w0, w1); - res4 = _mm_packs_epi32(w2, w3); - res2 = _mm_packs_epi32(w4, w5); - res6 = _mm_packs_epi32(w6, w7); - } - // Work on next four results - { - // Interleave to do the multiply by constants which gets us into 32bits - const __m128i d0 = _mm_unpacklo_epi16(q6, q5); - const __m128i d1 = _mm_unpackhi_epi16(q6, q5); - const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); - const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); - const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); - const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); - const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); - const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); - const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); - const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); - const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); - const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); - const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); - // Combine - const __m128i r0 = 
_mm_packs_epi32(s0, s1); - const __m128i r1 = _mm_packs_epi32(s2, s3); - // Add/subtract - const __m128i x0 = _mm_add_epi16(q4, r0); - const __m128i x1 = _mm_sub_epi16(q4, r0); - const __m128i x2 = _mm_sub_epi16(q7, r1); - const __m128i x3 = _mm_add_epi16(q7, r1); - // Interleave to do the multiply by constants which gets us into 32bits - const __m128i t0 = _mm_unpacklo_epi16(x0, x3); - const __m128i t1 = _mm_unpackhi_epi16(x0, x3); - const __m128i t2 = _mm_unpacklo_epi16(x1, x2); - const __m128i t3 = _mm_unpackhi_epi16(x1, x2); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); - const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); - const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); - const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); - const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); - const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - // 
Combine - res1 = _mm_packs_epi32(w0, w1); - res7 = _mm_packs_epi32(w2, w3); - res5 = _mm_packs_epi32(w4, w5); - res3 = _mm_packs_epi32(w6, w7); - } - // Transpose the 8x8. - { - // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // 40 41 42 43 44 45 46 47 - // 50 51 52 53 54 55 56 57 - // 60 61 62 63 64 65 66 67 - // 70 71 72 73 74 75 76 77 - const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); - const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); - const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); - const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); - const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); - const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); - const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); - const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 54 54 55 55 56 56 57 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 21 36 - // 44 54 64 74 45 55 61 76 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); - in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); - in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); - in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); - in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); - in5 
= _mm_unpackhi_epi64(tr1_1, tr1_5); - in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); - in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 - } - } - // Post-condition output and store it - { - // Post-condition (division by two) - // division of two 16 bits signed numbers using shifts - // n / 2 = (n - (n >> 15)) >> 1 - const __m128i sign_in0 = _mm_srai_epi16(in0, 15); - const __m128i sign_in1 = _mm_srai_epi16(in1, 15); - const __m128i sign_in2 = _mm_srai_epi16(in2, 15); - const __m128i sign_in3 = _mm_srai_epi16(in3, 15); - const __m128i sign_in4 = _mm_srai_epi16(in4, 15); - const __m128i sign_in5 = _mm_srai_epi16(in5, 15); - const __m128i sign_in6 = _mm_srai_epi16(in6, 15); - const __m128i sign_in7 = _mm_srai_epi16(in7, 15); - in0 = _mm_sub_epi16(in0, sign_in0); - in1 = _mm_sub_epi16(in1, sign_in1); - in2 = _mm_sub_epi16(in2, sign_in2); - in3 = _mm_sub_epi16(in3, sign_in3); - in4 = _mm_sub_epi16(in4, sign_in4); - in5 = _mm_sub_epi16(in5, sign_in5); - in6 = _mm_sub_epi16(in6, sign_in6); - in7 = _mm_sub_epi16(in7, sign_in7); - in0 = _mm_srai_epi16(in0, 1); - in1 = _mm_srai_epi16(in1, 1); - in2 = _mm_srai_epi16(in2, 1); - in3 = _mm_srai_epi16(in3, 1); - in4 = _mm_srai_epi16(in4, 1); - in5 = _mm_srai_epi16(in5, 1); - in6 = _mm_srai_epi16(in6, 1); - in7 = _mm_srai_epi16(in7, 1); - // store results - _mm_store_si128((__m128i *)(output + 0 * 8), in0); - _mm_store_si128((__m128i *)(output + 1 * 8), in1); - _mm_store_si128((__m128i *)(output + 2 * 8), in2); - _mm_store_si128((__m128i *)(output + 3 * 8), in3); - _mm_store_si128((__m128i *)(output + 4 * 8), in4); - _mm_store_si128((__m128i *)(output + 5 * 8), in5); - _mm_store_si128((__m128i *)(output + 6 * 8), in6); - _mm_store_si128((__m128i *)(output + 7 * 8), in7); - } -} - -// load 8x8 array -static INLINE void 
load_buffer_8x8_avx2(const int16_t *input, __m128i *in, - int stride) { - in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); - in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); - in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); - in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); - in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); - in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); - in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); - in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride)); - - in[0] = _mm_slli_epi16(in[0], 2); - in[1] = _mm_slli_epi16(in[1], 2); - in[2] = _mm_slli_epi16(in[2], 2); - in[3] = _mm_slli_epi16(in[3], 2); - in[4] = _mm_slli_epi16(in[4], 2); - in[5] = _mm_slli_epi16(in[5], 2); - in[6] = _mm_slli_epi16(in[6], 2); - in[7] = _mm_slli_epi16(in[7], 2); -} - -// right shift and rounding -static INLINE void right_shift_8x8_avx2(__m128i *res, int const bit) { - const __m128i kOne = _mm_set1_epi16(1); - const int bit_m02 = bit - 2; - __m128i sign0 = _mm_srai_epi16(res[0], 15); - __m128i sign1 = _mm_srai_epi16(res[1], 15); - __m128i sign2 = _mm_srai_epi16(res[2], 15); - __m128i sign3 = _mm_srai_epi16(res[3], 15); - __m128i sign4 = _mm_srai_epi16(res[4], 15); - __m128i sign5 = _mm_srai_epi16(res[5], 15); - __m128i sign6 = _mm_srai_epi16(res[6], 15); - __m128i sign7 = _mm_srai_epi16(res[7], 15); - - if (bit_m02 >= 0) { - __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02); - res[0] = _mm_add_epi16(res[0], k_const_rounding); - res[1] = _mm_add_epi16(res[1], k_const_rounding); - res[2] = _mm_add_epi16(res[2], k_const_rounding); - res[3] = _mm_add_epi16(res[3], k_const_rounding); - res[4] = _mm_add_epi16(res[4], k_const_rounding); - res[5] = _mm_add_epi16(res[5], k_const_rounding); - res[6] = _mm_add_epi16(res[6], k_const_rounding); - res[7] = _mm_add_epi16(res[7], k_const_rounding); - } - - res[0] = _mm_sub_epi16(res[0], sign0); - res[1] = 
_mm_sub_epi16(res[1], sign1); - res[2] = _mm_sub_epi16(res[2], sign2); - res[3] = _mm_sub_epi16(res[3], sign3); - res[4] = _mm_sub_epi16(res[4], sign4); - res[5] = _mm_sub_epi16(res[5], sign5); - res[6] = _mm_sub_epi16(res[6], sign6); - res[7] = _mm_sub_epi16(res[7], sign7); - - res[0] = _mm_srai_epi16(res[0], bit); - res[1] = _mm_srai_epi16(res[1], bit); - res[2] = _mm_srai_epi16(res[2], bit); - res[3] = _mm_srai_epi16(res[3], bit); - res[4] = _mm_srai_epi16(res[4], bit); - res[5] = _mm_srai_epi16(res[5], bit); - res[6] = _mm_srai_epi16(res[6], bit); - res[7] = _mm_srai_epi16(res[7], bit); -} - -// write 8x8 array -static INLINE void write_buffer_8x8_avx2(int16_t *output, __m128i *res, int stride) { - _mm_store_si128((__m128i *)(output + 0 * stride), res[0]); - _mm_store_si128((__m128i *)(output + 1 * stride), res[1]); - _mm_store_si128((__m128i *)(output + 2 * stride), res[2]); - _mm_store_si128((__m128i *)(output + 3 * stride), res[3]); - _mm_store_si128((__m128i *)(output + 4 * stride), res[4]); - _mm_store_si128((__m128i *)(output + 5 * stride), res[5]); - _mm_store_si128((__m128i *)(output + 6 * stride), res[6]); - _mm_store_si128((__m128i *)(output + 7 * stride), res[7]); -} - -// perform in-place transpose -static INLINE void array_transpose_8x8_avx2(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 44 54 45 55 46 56 47 57 - // 64 74 65 75 66 76 
67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 25 35 - // 44 54 64 74 45 55 65 75 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 -} - -void fdct8_avx2(__m128i *in) { - // constants - const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__DCT_CONST_ROUNDING = 
_mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u0, u1, u2, u3, u4, u5, u6, u7; - __m128i v0, v1, v2, v3, v4, v5, v6, v7; - __m128i s0, s1, s2, s3, s4, s5, s6, s7; - - // stage 1 - s0 = _mm_add_epi16(in[0], in[7]); - s1 = _mm_add_epi16(in[1], in[6]); - s2 = _mm_add_epi16(in[2], in[5]); - s3 = _mm_add_epi16(in[3], in[4]); - s4 = _mm_sub_epi16(in[3], in[4]); - s5 = _mm_sub_epi16(in[2], in[5]); - s6 = _mm_sub_epi16(in[1], in[6]); - s7 = _mm_sub_epi16(in[0], in[7]); - - u0 = _mm_add_epi16(s0, s3); - u1 = _mm_add_epi16(s1, s2); - u2 = _mm_sub_epi16(s1, s2); - u3 = _mm_sub_epi16(s0, s3); - // interleave and perform butterfly multiplication/addition - v0 = _mm_unpacklo_epi16(u0, u1); - v1 = _mm_unpackhi_epi16(u0, u1); - v2 = _mm_unpacklo_epi16(u2, u3); - v3 = _mm_unpackhi_epi16(u2, u3); - - u0 = _mm_madd_epi16(v0, k__cospi_p16_p16); - u1 = _mm_madd_epi16(v1, k__cospi_p16_p16); - u2 = _mm_madd_epi16(v0, k__cospi_p16_m16); - u3 = _mm_madd_epi16(v1, k__cospi_p16_m16); - u4 = _mm_madd_epi16(v2, k__cospi_p24_p08); - u5 = _mm_madd_epi16(v3, k__cospi_p24_p08); - u6 = _mm_madd_epi16(v2, k__cospi_m08_p24); - u7 = _mm_madd_epi16(v3, k__cospi_m08_p24); - - // shift and rounding - v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u0, u1); - in[2] = _mm_packs_epi32(u4, u5); - in[4] = 
_mm_packs_epi32(u2, u3); - in[6] = _mm_packs_epi32(u6, u7); - - // stage 2 - // interleave and perform butterfly multiplication/addition - u0 = _mm_unpacklo_epi16(s6, s5); - u1 = _mm_unpackhi_epi16(s6, s5); - v0 = _mm_madd_epi16(u0, k__cospi_p16_m16); - v1 = _mm_madd_epi16(u1, k__cospi_p16_m16); - v2 = _mm_madd_epi16(u0, k__cospi_p16_p16); - v3 = _mm_madd_epi16(u1, k__cospi_p16_p16); - - // shift and rounding - u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); - u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); - u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); - u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); - - v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); - v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); - v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); - v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); - - u0 = _mm_packs_epi32(v0, v1); - u1 = _mm_packs_epi32(v2, v3); - - // stage 3 - s0 = _mm_add_epi16(s4, u0); - s1 = _mm_sub_epi16(s4, u0); - s2 = _mm_sub_epi16(s7, u1); - s3 = _mm_add_epi16(s7, u1); - - // stage 4 - u0 = _mm_unpacklo_epi16(s0, s3); - u1 = _mm_unpackhi_epi16(s0, s3); - u2 = _mm_unpacklo_epi16(s1, s2); - u3 = _mm_unpackhi_epi16(s1, s2); - - v0 = _mm_madd_epi16(u0, k__cospi_p28_p04); - v1 = _mm_madd_epi16(u1, k__cospi_p28_p04); - v2 = _mm_madd_epi16(u2, k__cospi_p12_p20); - v3 = _mm_madd_epi16(u3, k__cospi_p12_p20); - v4 = _mm_madd_epi16(u2, k__cospi_m20_p12); - v5 = _mm_madd_epi16(u3, k__cospi_m20_p12); - v6 = _mm_madd_epi16(u0, k__cospi_m04_p28); - v7 = _mm_madd_epi16(u1, k__cospi_m04_p28); - - // shift and rounding - u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); - u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); - u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); - u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); - u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); - u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); - u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); - u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); - - v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); - v1 = _mm_srai_epi32(u1, 
DCT_CONST_BITS); - v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); - v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); - v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); - v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); - v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); - v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); - - in[1] = _mm_packs_epi32(v0, v1); - in[3] = _mm_packs_epi32(v4, v5); - in[5] = _mm_packs_epi32(v2, v3); - in[7] = _mm_packs_epi32(v6, v7); - - // transpose - array_transpose_8x8_avx2(in, in); -} - -void fadst8_avx2(__m128i *in) { - // Constants - const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); - const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__const_0 = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; - __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; - __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; - __m128i s0, s1, s2, s3, s4, s5, s6, s7; - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - - // properly aligned for 
butterfly input - in0 = in[7]; - in1 = in[0]; - in2 = in[5]; - in3 = in[2]; - in4 = in[3]; - in5 = in[4]; - in6 = in[1]; - in7 = in[6]; - - // column transformation - // stage 1 - // interleave and multiply/add into 32-bit integer - s0 = _mm_unpacklo_epi16(in0, in1); - s1 = _mm_unpackhi_epi16(in0, in1); - s2 = _mm_unpacklo_epi16(in2, in3); - s3 = _mm_unpackhi_epi16(in2, in3); - s4 = _mm_unpacklo_epi16(in4, in5); - s5 = _mm_unpackhi_epi16(in4, in5); - s6 = _mm_unpacklo_epi16(in6, in7); - s7 = _mm_unpackhi_epi16(in6, in7); - - u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); - u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); - u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); - u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); - u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); - u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); - u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); - u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); - u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); - u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); - u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); - u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); - u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); - u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); - u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); - u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); - - // addition - w0 = _mm_add_epi32(u0, u8); - w1 = _mm_add_epi32(u1, u9); - w2 = _mm_add_epi32(u2, u10); - w3 = _mm_add_epi32(u3, u11); - w4 = _mm_add_epi32(u4, u12); - w5 = _mm_add_epi32(u5, u13); - w6 = _mm_add_epi32(u6, u14); - w7 = _mm_add_epi32(u7, u15); - w8 = _mm_sub_epi32(u0, u8); - w9 = _mm_sub_epi32(u1, u9); - w10 = _mm_sub_epi32(u2, u10); - w11 = _mm_sub_epi32(u3, u11); - w12 = _mm_sub_epi32(u4, u12); - w13 = _mm_sub_epi32(u5, u13); - w14 = _mm_sub_epi32(u6, u14); - w15 = _mm_sub_epi32(u7, u15); - - // shift and rounding - v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); - v4 = 
_mm_add_epi32(w4, k__DCT_CONST_ROUNDING); - v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); - v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); - v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); - v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); - v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); - v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); - v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); - v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); - v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); - v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); - v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); - u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); - u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); - u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); - u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); - u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); - u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); - u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); - - // back to 16-bit and pack 8 integers into __m128i - in[0] = _mm_packs_epi32(u0, u1); - in[1] = _mm_packs_epi32(u2, u3); - in[2] = _mm_packs_epi32(u4, u5); - in[3] = _mm_packs_epi32(u6, u7); - in[4] = _mm_packs_epi32(u8, u9); - in[5] = _mm_packs_epi32(u10, u11); - in[6] = _mm_packs_epi32(u12, u13); - in[7] = _mm_packs_epi32(u14, u15); - - // stage 2 - s0 = _mm_add_epi16(in[0], in[2]); - s1 = _mm_add_epi16(in[1], in[3]); - s2 = _mm_sub_epi16(in[0], in[2]); - s3 = _mm_sub_epi16(in[1], in[3]); - u0 = _mm_unpacklo_epi16(in[4], in[5]); - u1 = _mm_unpackhi_epi16(in[4], in[5]); - u2 = _mm_unpacklo_epi16(in[6], in[7]); - u3 = _mm_unpackhi_epi16(in[6], in[7]); - - v0 = _mm_madd_epi16(u0, 
k__cospi_p08_p24); - v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); - v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); - v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); - v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); - v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); - v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); - v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); - - w0 = _mm_add_epi32(v0, v4); - w1 = _mm_add_epi32(v1, v5); - w2 = _mm_add_epi32(v2, v6); - w3 = _mm_add_epi32(v3, v7); - w4 = _mm_sub_epi32(v0, v4); - w5 = _mm_sub_epi32(v1, v5); - w6 = _mm_sub_epi32(v2, v6); - w7 = _mm_sub_epi32(v3, v7); - - v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); - v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); - v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); - v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); - v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - - // back to 16-bit intergers - s4 = _mm_packs_epi32(u0, u1); - s5 = _mm_packs_epi32(u2, u3); - s6 = _mm_packs_epi32(u4, u5); - s7 = _mm_packs_epi32(u6, u7); - - // stage 3 - u0 = _mm_unpacklo_epi16(s2, s3); - u1 = _mm_unpackhi_epi16(s2, s3); - u2 = _mm_unpacklo_epi16(s6, s7); - u3 = _mm_unpackhi_epi16(s6, s7); - - v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); - v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); - v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); - v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); - v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); - v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); - v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); - v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); - - u0 
= _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); - u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); - u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); - u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); - u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); - u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); - u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); - u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); - - v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); - v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); - v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); - v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); - v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); - v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); - v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); - v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); - - s2 = _mm_packs_epi32(v0, v1); - s3 = _mm_packs_epi32(v2, v3); - s6 = _mm_packs_epi32(v4, v5); - s7 = _mm_packs_epi32(v6, v7); - - // FIXME(jingning): do subtract using bit inversion? - in[0] = s0; - in[1] = _mm_sub_epi16(k__const_0, s4); - in[2] = s6; - in[3] = _mm_sub_epi16(k__const_0, s2); - in[4] = s3; - in[5] = _mm_sub_epi16(k__const_0, s7); - in[6] = s5; - in[7] = _mm_sub_epi16(k__const_0, s1); - - // transpose - array_transpose_8x8_avx2(in, in); -} - -void vp9_fht8x8_avx2(const int16_t *input, int16_t *output, - int stride, int tx_type) { - __m128i in[8]; - - switch (tx_type) { - case DCT_DCT: - vp9_fdct8x8_avx2(input, output, stride); - break; - case ADST_DCT: - load_buffer_8x8_avx2(input, in, stride); - fadst8_avx2(in); - fdct8_avx2(in); - right_shift_8x8_avx2(in, 1); - write_buffer_8x8_avx2(output, in, 8); - break; - case DCT_ADST: - load_buffer_8x8_avx2(input, in, stride); - fdct8_avx2(in); - fadst8_avx2(in); - right_shift_8x8_avx2(in, 1); - write_buffer_8x8_avx2(output, in, 8); - break; - case ADST_ADST: - load_buffer_8x8_avx2(input, in, stride); - fadst8_avx2(in); - fadst8_avx2(in); - right_shift_8x8_avx2(in, 1); - write_buffer_8x8_avx2(output, in, 8); - break; - default: - assert(0); - break; - } -} - -void 
vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) { - // The 2D transform is done with two passes which are actually pretty - // similar. In the first one, we transform the columns and transpose - // the results. In the second one, we transform the rows. To achieve that, - // as the first pass results are transposed, we transpose the columns (that - // is the transposed rows) and transpose the results (so that it goes back - // in normal/row positions). - int pass; - // We need an intermediate buffer between passes. - DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256); - const int16_t *in = input; - int16_t *out = intermediate; - // Constants - // When we use them, in one case, they are all the same. In all others - // it's a pair of them that we need to repeat four times. This is done - // by constructing the 32 bit constant corresponding to that pair. - const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); - const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); - const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); - const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); - const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); - const __m128i k__cospi_p06_p26 = 
pair_set_epi16(cospi_6_64, cospi_26_64); - const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); - const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kOne = _mm_set1_epi16(1); - // Do the two transform/transpose passes - for (pass = 0; pass < 2; ++pass) { - // We process eight columns (transposed rows in second pass) at a time. - int column_start; - for (column_start = 0; column_start < 16; column_start += 8) { - __m128i in00, in01, in02, in03, in04, in05, in06, in07; - __m128i in08, in09, in10, in11, in12, in13, in14, in15; - __m128i input0, input1, input2, input3, input4, input5, input6, input7; - __m128i step1_0, step1_1, step1_2, step1_3; - __m128i step1_4, step1_5, step1_6, step1_7; - __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; - __m128i step3_0, step3_1, step3_2, step3_3; - __m128i step3_4, step3_5, step3_6, step3_7; - __m128i res00, res01, res02, res03, res04, res05, res06, res07; - __m128i res08, res09, res10, res11, res12, res13, res14, res15; - // Load and pre-condition input. 
- if (0 == pass) { - in00 = _mm_load_si128((const __m128i *)(in + 0 * stride)); - in01 = _mm_load_si128((const __m128i *)(in + 1 * stride)); - in02 = _mm_load_si128((const __m128i *)(in + 2 * stride)); - in03 = _mm_load_si128((const __m128i *)(in + 3 * stride)); - in04 = _mm_load_si128((const __m128i *)(in + 4 * stride)); - in05 = _mm_load_si128((const __m128i *)(in + 5 * stride)); - in06 = _mm_load_si128((const __m128i *)(in + 6 * stride)); - in07 = _mm_load_si128((const __m128i *)(in + 7 * stride)); - in08 = _mm_load_si128((const __m128i *)(in + 8 * stride)); - in09 = _mm_load_si128((const __m128i *)(in + 9 * stride)); - in10 = _mm_load_si128((const __m128i *)(in + 10 * stride)); - in11 = _mm_load_si128((const __m128i *)(in + 11 * stride)); - in12 = _mm_load_si128((const __m128i *)(in + 12 * stride)); - in13 = _mm_load_si128((const __m128i *)(in + 13 * stride)); - in14 = _mm_load_si128((const __m128i *)(in + 14 * stride)); - in15 = _mm_load_si128((const __m128i *)(in + 15 * stride)); - // x = x << 2 - in00 = _mm_slli_epi16(in00, 2); - in01 = _mm_slli_epi16(in01, 2); - in02 = _mm_slli_epi16(in02, 2); - in03 = _mm_slli_epi16(in03, 2); - in04 = _mm_slli_epi16(in04, 2); - in05 = _mm_slli_epi16(in05, 2); - in06 = _mm_slli_epi16(in06, 2); - in07 = _mm_slli_epi16(in07, 2); - in08 = _mm_slli_epi16(in08, 2); - in09 = _mm_slli_epi16(in09, 2); - in10 = _mm_slli_epi16(in10, 2); - in11 = _mm_slli_epi16(in11, 2); - in12 = _mm_slli_epi16(in12, 2); - in13 = _mm_slli_epi16(in13, 2); - in14 = _mm_slli_epi16(in14, 2); - in15 = _mm_slli_epi16(in15, 2); - } else { - in00 = _mm_load_si128((const __m128i *)(in + 0 * 16)); - in01 = _mm_load_si128((const __m128i *)(in + 1 * 16)); - in02 = _mm_load_si128((const __m128i *)(in + 2 * 16)); - in03 = _mm_load_si128((const __m128i *)(in + 3 * 16)); - in04 = _mm_load_si128((const __m128i *)(in + 4 * 16)); - in05 = _mm_load_si128((const __m128i *)(in + 5 * 16)); - in06 = _mm_load_si128((const __m128i *)(in + 6 * 16)); - in07 = 
_mm_load_si128((const __m128i *)(in + 7 * 16)); - in08 = _mm_load_si128((const __m128i *)(in + 8 * 16)); - in09 = _mm_load_si128((const __m128i *)(in + 9 * 16)); - in10 = _mm_load_si128((const __m128i *)(in + 10 * 16)); - in11 = _mm_load_si128((const __m128i *)(in + 11 * 16)); - in12 = _mm_load_si128((const __m128i *)(in + 12 * 16)); - in13 = _mm_load_si128((const __m128i *)(in + 13 * 16)); - in14 = _mm_load_si128((const __m128i *)(in + 14 * 16)); - in15 = _mm_load_si128((const __m128i *)(in + 15 * 16)); - // x = (x + 1) >> 2 - in00 = _mm_add_epi16(in00, kOne); - in01 = _mm_add_epi16(in01, kOne); - in02 = _mm_add_epi16(in02, kOne); - in03 = _mm_add_epi16(in03, kOne); - in04 = _mm_add_epi16(in04, kOne); - in05 = _mm_add_epi16(in05, kOne); - in06 = _mm_add_epi16(in06, kOne); - in07 = _mm_add_epi16(in07, kOne); - in08 = _mm_add_epi16(in08, kOne); - in09 = _mm_add_epi16(in09, kOne); - in10 = _mm_add_epi16(in10, kOne); - in11 = _mm_add_epi16(in11, kOne); - in12 = _mm_add_epi16(in12, kOne); - in13 = _mm_add_epi16(in13, kOne); - in14 = _mm_add_epi16(in14, kOne); - in15 = _mm_add_epi16(in15, kOne); - in00 = _mm_srai_epi16(in00, 2); - in01 = _mm_srai_epi16(in01, 2); - in02 = _mm_srai_epi16(in02, 2); - in03 = _mm_srai_epi16(in03, 2); - in04 = _mm_srai_epi16(in04, 2); - in05 = _mm_srai_epi16(in05, 2); - in06 = _mm_srai_epi16(in06, 2); - in07 = _mm_srai_epi16(in07, 2); - in08 = _mm_srai_epi16(in08, 2); - in09 = _mm_srai_epi16(in09, 2); - in10 = _mm_srai_epi16(in10, 2); - in11 = _mm_srai_epi16(in11, 2); - in12 = _mm_srai_epi16(in12, 2); - in13 = _mm_srai_epi16(in13, 2); - in14 = _mm_srai_epi16(in14, 2); - in15 = _mm_srai_epi16(in15, 2); - } - in += 8; - // Calculate input for the first 8 results. 
- { - input0 = _mm_add_epi16(in00, in15); - input1 = _mm_add_epi16(in01, in14); - input2 = _mm_add_epi16(in02, in13); - input3 = _mm_add_epi16(in03, in12); - input4 = _mm_add_epi16(in04, in11); - input5 = _mm_add_epi16(in05, in10); - input6 = _mm_add_epi16(in06, in09); - input7 = _mm_add_epi16(in07, in08); - } - // Calculate input for the next 8 results. - { - step1_0 = _mm_sub_epi16(in07, in08); - step1_1 = _mm_sub_epi16(in06, in09); - step1_2 = _mm_sub_epi16(in05, in10); - step1_3 = _mm_sub_epi16(in04, in11); - step1_4 = _mm_sub_epi16(in03, in12); - step1_5 = _mm_sub_epi16(in02, in13); - step1_6 = _mm_sub_epi16(in01, in14); - step1_7 = _mm_sub_epi16(in00, in15); - } - // Work on the first eight values; fdct8(input, even_results); - { - // Add/subtract - const __m128i q0 = _mm_add_epi16(input0, input7); - const __m128i q1 = _mm_add_epi16(input1, input6); - const __m128i q2 = _mm_add_epi16(input2, input5); - const __m128i q3 = _mm_add_epi16(input3, input4); - const __m128i q4 = _mm_sub_epi16(input3, input4); - const __m128i q5 = _mm_sub_epi16(input2, input5); - const __m128i q6 = _mm_sub_epi16(input1, input6); - const __m128i q7 = _mm_sub_epi16(input0, input7); - // Work on first four results - { - // Add/subtract - const __m128i r0 = _mm_add_epi16(q0, q3); - const __m128i r1 = _mm_add_epi16(q1, q2); - const __m128i r2 = _mm_sub_epi16(q1, q2); - const __m128i r3 = _mm_sub_epi16(q0, q3); - // Interleave to do the multiply by constants which gets us - // into 32 bits. 
- const __m128i t0 = _mm_unpacklo_epi16(r0, r1); - const __m128i t1 = _mm_unpackhi_epi16(r0, r1); - const __m128i t2 = _mm_unpacklo_epi16(r2, r3); - const __m128i t3 = _mm_unpackhi_epi16(r2, r3); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); - const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); - const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); - const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); - const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); - const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - // Combine - res00 = _mm_packs_epi32(w0, w1); - res08 = _mm_packs_epi32(w2, w3); - res04 = _mm_packs_epi32(w4, w5); - res12 = _mm_packs_epi32(w6, w7); - } - // Work on next four results - { - // Interleave to do the multiply by constants which gets us - // into 32 bits. 
- const __m128i d0 = _mm_unpacklo_epi16(q6, q5); - const __m128i d1 = _mm_unpackhi_epi16(q6, q5); - const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); - const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); - const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); - const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); - const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); - const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); - const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); - const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); - const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); - const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); - const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); - // Combine - const __m128i r0 = _mm_packs_epi32(s0, s1); - const __m128i r1 = _mm_packs_epi32(s2, s3); - // Add/subtract - const __m128i x0 = _mm_add_epi16(q4, r0); - const __m128i x1 = _mm_sub_epi16(q4, r0); - const __m128i x2 = _mm_sub_epi16(q7, r1); - const __m128i x3 = _mm_add_epi16(q7, r1); - // Interleave to do the multiply by constants which gets us - // into 32 bits. 
- const __m128i t0 = _mm_unpacklo_epi16(x0, x3); - const __m128i t1 = _mm_unpackhi_epi16(x0, x3); - const __m128i t2 = _mm_unpacklo_epi16(x1, x2); - const __m128i t3 = _mm_unpackhi_epi16(x1, x2); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); - const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); - const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); - const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); - const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); - const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - // Combine - res02 = _mm_packs_epi32(w0, w1); - res14 = _mm_packs_epi32(w2, w3); - res10 = _mm_packs_epi32(w4, w5); - res06 = _mm_packs_epi32(w6, w7); - } - } - // Work on the next eight values; step1 -> odd_results - { - // step 2 - { - const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); - const __m128i t1 = 
_mm_unpackhi_epi16(step1_5, step1_2); - const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); - const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16); - const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16); - const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - // Combine - step2_2 = _mm_packs_epi32(w0, w1); - step2_3 = _mm_packs_epi32(w2, w3); - } - { - const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); - const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); - const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); - const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); - const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16); - const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - // Combine - step2_5 = 
_mm_packs_epi32(w0, w1); - step2_4 = _mm_packs_epi32(w2, w3); - } - // step 3 - { - step3_0 = _mm_add_epi16(step1_0, step2_3); - step3_1 = _mm_add_epi16(step1_1, step2_2); - step3_2 = _mm_sub_epi16(step1_1, step2_2); - step3_3 = _mm_sub_epi16(step1_0, step2_3); - step3_4 = _mm_sub_epi16(step1_7, step2_4); - step3_5 = _mm_sub_epi16(step1_6, step2_5); - step3_6 = _mm_add_epi16(step1_6, step2_5); - step3_7 = _mm_add_epi16(step1_7, step2_4); - } - // step 4 - { - const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); - const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); - const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); - const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24); - const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08); - const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - // Combine - step2_1 = _mm_packs_epi32(w0, w1); - step2_2 = _mm_packs_epi32(w2, w3); - } - { - const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); - const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); - const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); - const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08); - const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24); - const __m128i u3 = _mm_madd_epi16(t3, 
k__cospi_m08_p24); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - // Combine - step2_6 = _mm_packs_epi32(w0, w1); - step2_5 = _mm_packs_epi32(w2, w3); - } - // step 5 - { - step1_0 = _mm_add_epi16(step3_0, step2_1); - step1_1 = _mm_sub_epi16(step3_0, step2_1); - step1_2 = _mm_sub_epi16(step3_3, step2_2); - step1_3 = _mm_add_epi16(step3_3, step2_2); - step1_4 = _mm_add_epi16(step3_4, step2_5); - step1_5 = _mm_sub_epi16(step3_4, step2_5); - step1_6 = _mm_sub_epi16(step3_7, step2_6); - step1_7 = _mm_add_epi16(step3_7, step2_6); - } - // step 6 - { - const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); - const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); - const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); - const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02); - const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18); - const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - // 
Combine - res01 = _mm_packs_epi32(w0, w1); - res09 = _mm_packs_epi32(w2, w3); - } - { - const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); - const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); - const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); - const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10); - const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26); - const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - // Combine - res05 = _mm_packs_epi32(w0, w1); - res13 = _mm_packs_epi32(w2, w3); - } - { - const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); - const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); - const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); - const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22); - const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06); - const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = 
_mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - // Combine - res11 = _mm_packs_epi32(w0, w1); - res03 = _mm_packs_epi32(w2, w3); - } - { - const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); - const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); - const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); - const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30); - const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14); - const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - // Combine - res15 = _mm_packs_epi32(w0, w1); - res07 = _mm_packs_epi32(w2, w3); - } - } - // Transpose the results, do it as two 8x8 transposes. 
- { - // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // 40 41 42 43 44 45 46 47 - // 50 51 52 53 54 55 56 57 - // 60 61 62 63 64 65 66 67 - // 70 71 72 73 74 75 76 77 - const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01); - const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03); - const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01); - const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03); - const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05); - const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07); - const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05); - const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 54 54 55 55 56 56 57 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 21 36 - // 44 54 64 74 45 55 61 76 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); - const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); - const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); - const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); - const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); - const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); - const __m128i tr2_6 = 
_mm_unpacklo_epi64(tr1_3, tr1_7); - const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 - _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0); - _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1); - _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2); - _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3); - _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4); - _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5); - _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6); - _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7); - } - { - // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // 40 41 42 43 44 45 46 47 - // 50 51 52 53 54 55 56 57 - // 60 61 62 63 64 65 66 67 - // 70 71 72 73 74 75 76 77 - const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09); - const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11); - const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09); - const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11); - const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13); - const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15); - const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13); - const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 54 54 55 55 56 56 57 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i 
tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 21 36 - // 44 54 64 74 45 55 61 76 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); - const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); - const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); - const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); - const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); - const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); - const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); - const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 - // Store results - _mm_store_si128((__m128i *)(out + 8 + 0 * 16), tr2_0); - _mm_store_si128((__m128i *)(out + 8 + 1 * 16), tr2_1); - _mm_store_si128((__m128i *)(out + 8 + 2 * 16), tr2_2); - _mm_store_si128((__m128i *)(out + 8 + 3 * 16), tr2_3); - _mm_store_si128((__m128i *)(out + 8 + 4 * 16), tr2_4); - _mm_store_si128((__m128i *)(out + 8 + 5 * 16), tr2_5); - _mm_store_si128((__m128i *)(out + 8 + 6 * 16), tr2_6); - _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7); - } - out += 8*16; - } - // Setup in/out for next pass. 
- in = intermediate; - out = output; - } -} - -static INLINE void load_buffer_16x16_avx2(const int16_t* input, __m128i *in0, - __m128i *in1, int stride) { - // load first 8 columns - load_buffer_8x8_avx2(input, in0, stride); - load_buffer_8x8_avx2(input + 8 * stride, in0 + 8, stride); - - input += 8; - // load second 8 columns - load_buffer_8x8_avx2(input, in1, stride); - load_buffer_8x8_avx2(input + 8 * stride, in1 + 8, stride); -} - -static INLINE void write_buffer_16x16_avx2(int16_t *output, __m128i *in0, - __m128i *in1, int stride) { - // write first 8 columns - write_buffer_8x8_avx2(output, in0, stride); - write_buffer_8x8_avx2(output + 8 * stride, in0 + 8, stride); - // write second 8 columns - output += 8; - write_buffer_8x8_avx2(output, in1, stride); - write_buffer_8x8_avx2(output + 8 * stride, in1 + 8, stride); -} - -static INLINE void array_transpose_16x16_avx2(__m128i *res0, __m128i *res1) { - __m128i tbuf[8]; - array_transpose_8x8_avx2(res0, res0); - array_transpose_8x8_avx2(res1, tbuf); - array_transpose_8x8_avx2(res0 + 8, res1); - array_transpose_8x8_avx2(res1 + 8, res1 + 8); - - res0[8] = tbuf[0]; - res0[9] = tbuf[1]; - res0[10] = tbuf[2]; - res0[11] = tbuf[3]; - res0[12] = tbuf[4]; - res0[13] = tbuf[5]; - res0[14] = tbuf[6]; - res0[15] = tbuf[7]; -} - -static INLINE void right_shift_16x16_avx2(__m128i *res0, __m128i *res1) { - // perform rounding operations - right_shift_8x8_avx2(res0, 2); - right_shift_8x8_avx2(res0 + 8, 2); - right_shift_8x8_avx2(res1, 2); - right_shift_8x8_avx2(res1 + 8, 2); -} - -void fdct16_8col_avx2(__m128i *in) { - // perform 16x16 1-D DCT for 8 columns - __m128i i[8], s[8], p[8], t[8], u[16], v[16]; - const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m24_m08 = 
pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); - const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); - const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); - const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); - const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); - const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); - const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); - const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - // stage 1 - i[0] = _mm_add_epi16(in[0], in[15]); - i[1] = _mm_add_epi16(in[1], in[14]); - i[2] = _mm_add_epi16(in[2], in[13]); - i[3] = _mm_add_epi16(in[3], in[12]); - i[4] = _mm_add_epi16(in[4], in[11]); - i[5] = _mm_add_epi16(in[5], in[10]); - i[6] = _mm_add_epi16(in[6], in[9]); - i[7] = _mm_add_epi16(in[7], in[8]); - - s[0] = _mm_sub_epi16(in[7], in[8]); - s[1] = _mm_sub_epi16(in[6], in[9]); - s[2] = _mm_sub_epi16(in[5], in[10]); - s[3] = _mm_sub_epi16(in[4], in[11]); - s[4] = _mm_sub_epi16(in[3], in[12]); - s[5] = _mm_sub_epi16(in[2], in[13]); - s[6] = _mm_sub_epi16(in[1], in[14]); - s[7] = _mm_sub_epi16(in[0], in[15]); - - p[0] = _mm_add_epi16(i[0], i[7]); - p[1] = _mm_add_epi16(i[1], i[6]); - p[2] = _mm_add_epi16(i[2], i[5]); - p[3] = _mm_add_epi16(i[3], i[4]); - p[4] = _mm_sub_epi16(i[3], i[4]); - p[5] = _mm_sub_epi16(i[2], i[5]); - p[6] = 
_mm_sub_epi16(i[1], i[6]); - p[7] = _mm_sub_epi16(i[0], i[7]); - - u[0] = _mm_add_epi16(p[0], p[3]); - u[1] = _mm_add_epi16(p[1], p[2]); - u[2] = _mm_sub_epi16(p[1], p[2]); - u[3] = _mm_sub_epi16(p[0], p[3]); - - v[0] = _mm_unpacklo_epi16(u[0], u[1]); - v[1] = _mm_unpackhi_epi16(u[0], u[1]); - v[2] = _mm_unpacklo_epi16(u[2], u[3]); - v[3] = _mm_unpackhi_epi16(u[2], u[3]); - - u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); - u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16); - u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16); - u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16); - u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08); - u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08); - u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24); - u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u[0], u[1]); - in[4] = _mm_packs_epi32(u[4], u[5]); - in[8] = _mm_packs_epi32(u[2], u[3]); - in[12] = _mm_packs_epi32(u[6], u[7]); - - u[0] = _mm_unpacklo_epi16(p[5], p[6]); - u[1] = _mm_unpackhi_epi16(p[5], p[6]); - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[3] = _mm_madd_epi16(u[1], 
k__cospi_p16_p16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - - u[0] = _mm_packs_epi32(v[0], v[1]); - u[1] = _mm_packs_epi32(v[2], v[3]); - - t[0] = _mm_add_epi16(p[4], u[0]); - t[1] = _mm_sub_epi16(p[4], u[0]); - t[2] = _mm_sub_epi16(p[7], u[1]); - t[3] = _mm_add_epi16(p[7], u[1]); - - u[0] = _mm_unpacklo_epi16(t[0], t[3]); - u[1] = _mm_unpackhi_epi16(t[0], t[3]); - u[2] = _mm_unpacklo_epi16(t[1], t[2]); - u[3] = _mm_unpackhi_epi16(t[1], t[2]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04); - v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04); - v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20); - v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20); - v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12); - v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12); - v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28); - v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], 
DCT_CONST_BITS); - - in[2] = _mm_packs_epi32(v[0], v[1]); - in[6] = _mm_packs_epi32(v[4], v[5]); - in[10] = _mm_packs_epi32(v[2], v[3]); - in[14] = _mm_packs_epi32(v[6], v[7]); - - // stage 2 - u[0] = _mm_unpacklo_epi16(s[2], s[5]); - u[1] = _mm_unpackhi_epi16(s[2], s[5]); - u[2] = _mm_unpacklo_epi16(s[3], s[4]); - u[3] = _mm_unpackhi_epi16(s[3], s[4]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - t[2] = _mm_packs_epi32(v[0], v[1]); - t[3] = _mm_packs_epi32(v[2], v[3]); - t[4] = _mm_packs_epi32(v[4], v[5]); - t[5] = _mm_packs_epi32(v[6], v[7]); - - // stage 3 - p[0] = _mm_add_epi16(s[0], t[3]); - p[1] = _mm_add_epi16(s[1], t[2]); - p[2] = _mm_sub_epi16(s[1], t[2]); - p[3] = _mm_sub_epi16(s[0], t[3]); - p[4] = _mm_sub_epi16(s[7], t[4]); - p[5] = _mm_sub_epi16(s[6], t[5]); - p[6] = _mm_add_epi16(s[6], t[5]); - p[7] = 
_mm_add_epi16(s[7], t[4]); - - // stage 4 - u[0] = _mm_unpacklo_epi16(p[1], p[6]); - u[1] = _mm_unpackhi_epi16(p[1], p[6]); - u[2] = _mm_unpacklo_epi16(p[2], p[5]); - u[3] = _mm_unpackhi_epi16(p[2], p[5]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24); - v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24); - v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08); - v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08); - v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24); - v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24); - v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08); - v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - t[1] = _mm_packs_epi32(v[0], v[1]); - t[2] = _mm_packs_epi32(v[2], v[3]); - t[5] = _mm_packs_epi32(v[4], v[5]); - t[6] = _mm_packs_epi32(v[6], v[7]); - - // stage 5 - s[0] = _mm_add_epi16(p[0], t[1]); - s[1] = _mm_sub_epi16(p[0], t[1]); - s[2] = _mm_sub_epi16(p[3], t[2]); - s[3] = _mm_add_epi16(p[3], t[2]); - s[4] = _mm_add_epi16(p[4], t[5]); - s[5] = _mm_sub_epi16(p[4], t[5]); - s[6] = _mm_sub_epi16(p[7], t[6]); - s[7] = _mm_add_epi16(p[7], t[6]); - - // stage 6 - u[0] = _mm_unpacklo_epi16(s[0], s[7]); - u[1] = _mm_unpackhi_epi16(s[0], s[7]); - u[2] = _mm_unpacklo_epi16(s[1], s[6]); 
- u[3] = _mm_unpackhi_epi16(s[1], s[6]); - u[4] = _mm_unpacklo_epi16(s[2], s[5]); - u[5] = _mm_unpackhi_epi16(s[2], s[5]); - u[6] = _mm_unpacklo_epi16(s[3], s[4]); - u[7] = _mm_unpackhi_epi16(s[3], s[4]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02); - v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02); - v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18); - v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18); - v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10); - v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10); - v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26); - v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26); - v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06); - v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06); - v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22); - v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22); - v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14); - v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14); - v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30); - v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], 
DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - in[1] = _mm_packs_epi32(v[0], v[1]); - in[9] = _mm_packs_epi32(v[2], v[3]); - in[5] = _mm_packs_epi32(v[4], v[5]); - in[13] = _mm_packs_epi32(v[6], v[7]); - in[3] = _mm_packs_epi32(v[8], v[9]); - in[11] = _mm_packs_epi32(v[10], v[11]); - in[7] = _mm_packs_epi32(v[12], v[13]); - in[15] = _mm_packs_epi32(v[14], v[15]); -} - -void fadst16_8col_avx2(__m128i *in) { - // perform 16x16 1-D ADST for 8 columns - __m128i s[16], x[16], u[32], v[32]; - const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, 
-cospi_21_64); - const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); - const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); - const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kZero = _mm_set1_epi16(0); - - u[0] = _mm_unpacklo_epi16(in[15], in[0]); - u[1] = _mm_unpackhi_epi16(in[15], in[0]); - u[2] = _mm_unpacklo_epi16(in[13], in[2]); - u[3] = _mm_unpackhi_epi16(in[13], in[2]); - u[4] = _mm_unpacklo_epi16(in[11], in[4]); - u[5] = _mm_unpackhi_epi16(in[11], in[4]); - u[6] = _mm_unpacklo_epi16(in[9], in[6]); - u[7] = _mm_unpackhi_epi16(in[9], in[6]); - u[8] = _mm_unpacklo_epi16(in[7], in[8]); - u[9] = _mm_unpackhi_epi16(in[7], in[8]); - u[10] = _mm_unpacklo_epi16(in[5], in[10]); - u[11] = _mm_unpackhi_epi16(in[5], in[10]); - u[12] = _mm_unpacklo_epi16(in[3], in[12]); - u[13] 
= _mm_unpackhi_epi16(in[3], in[12]); - u[14] = _mm_unpacklo_epi16(in[1], in[14]); - u[15] = _mm_unpackhi_epi16(in[1], in[14]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); - v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); - v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); - v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); - v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); - v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); - v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); - v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); - v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); - v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); - v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); - v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); - v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); - v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); - v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); - v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); - v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); - v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); - v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); - v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); - v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); - v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); - v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); - v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); - v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); - v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); - v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); - v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); - v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); - v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); - v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); - v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); - - u[0] = _mm_add_epi32(v[0], v[16]); - u[1] = _mm_add_epi32(v[1], v[17]); - u[2] = _mm_add_epi32(v[2], v[18]); - u[3] = _mm_add_epi32(v[3], v[19]); - u[4] = _mm_add_epi32(v[4], v[20]); - u[5] = _mm_add_epi32(v[5], v[21]); - u[6] = _mm_add_epi32(v[6], v[22]); - u[7] 
= _mm_add_epi32(v[7], v[23]); - u[8] = _mm_add_epi32(v[8], v[24]); - u[9] = _mm_add_epi32(v[9], v[25]); - u[10] = _mm_add_epi32(v[10], v[26]); - u[11] = _mm_add_epi32(v[11], v[27]); - u[12] = _mm_add_epi32(v[12], v[28]); - u[13] = _mm_add_epi32(v[13], v[29]); - u[14] = _mm_add_epi32(v[14], v[30]); - u[15] = _mm_add_epi32(v[15], v[31]); - u[16] = _mm_sub_epi32(v[0], v[16]); - u[17] = _mm_sub_epi32(v[1], v[17]); - u[18] = _mm_sub_epi32(v[2], v[18]); - u[19] = _mm_sub_epi32(v[3], v[19]); - u[20] = _mm_sub_epi32(v[4], v[20]); - u[21] = _mm_sub_epi32(v[5], v[21]); - u[22] = _mm_sub_epi32(v[6], v[22]); - u[23] = _mm_sub_epi32(v[7], v[23]); - u[24] = _mm_sub_epi32(v[8], v[24]); - u[25] = _mm_sub_epi32(v[9], v[25]); - u[26] = _mm_sub_epi32(v[10], v[26]); - u[27] = _mm_sub_epi32(v[11], v[27]); - u[28] = _mm_sub_epi32(v[12], v[28]); - u[29] = _mm_sub_epi32(v[13], v[29]); - u[30] = _mm_sub_epi32(v[14], v[30]); - u[31] = _mm_sub_epi32(v[15], v[31]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); - v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); - v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); - v[19] = 
_mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); - v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); - v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); - v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); - v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); - v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); - v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); - v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); - v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); - v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); - v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); - v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); - v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); - u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); - u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); - u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); - u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); - u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); - u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); - u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); - u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); - u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); - u[26] = _mm_srai_epi32(v[26], 
DCT_CONST_BITS); - u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); - u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); - u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); - u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); - u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); - - s[0] = _mm_packs_epi32(u[0], u[1]); - s[1] = _mm_packs_epi32(u[2], u[3]); - s[2] = _mm_packs_epi32(u[4], u[5]); - s[3] = _mm_packs_epi32(u[6], u[7]); - s[4] = _mm_packs_epi32(u[8], u[9]); - s[5] = _mm_packs_epi32(u[10], u[11]); - s[6] = _mm_packs_epi32(u[12], u[13]); - s[7] = _mm_packs_epi32(u[14], u[15]); - s[8] = _mm_packs_epi32(u[16], u[17]); - s[9] = _mm_packs_epi32(u[18], u[19]); - s[10] = _mm_packs_epi32(u[20], u[21]); - s[11] = _mm_packs_epi32(u[22], u[23]); - s[12] = _mm_packs_epi32(u[24], u[25]); - s[13] = _mm_packs_epi32(u[26], u[27]); - s[14] = _mm_packs_epi32(u[28], u[29]); - s[15] = _mm_packs_epi32(u[30], u[31]); - - // stage 2 - u[0] = _mm_unpacklo_epi16(s[8], s[9]); - u[1] = _mm_unpackhi_epi16(s[8], s[9]); - u[2] = _mm_unpacklo_epi16(s[10], s[11]); - u[3] = _mm_unpackhi_epi16(s[10], s[11]); - u[4] = _mm_unpacklo_epi16(s[12], s[13]); - u[5] = _mm_unpackhi_epi16(s[12], s[13]); - u[6] = _mm_unpacklo_epi16(s[14], s[15]); - u[7] = _mm_unpackhi_epi16(s[14], s[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); - v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); - v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); - v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); - v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); - v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); - v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); - v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); - v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); - v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); - v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); - v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); - v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); - v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); - v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); - 
v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); - - u[0] = _mm_add_epi32(v[0], v[8]); - u[1] = _mm_add_epi32(v[1], v[9]); - u[2] = _mm_add_epi32(v[2], v[10]); - u[3] = _mm_add_epi32(v[3], v[11]); - u[4] = _mm_add_epi32(v[4], v[12]); - u[5] = _mm_add_epi32(v[5], v[13]); - u[6] = _mm_add_epi32(v[6], v[14]); - u[7] = _mm_add_epi32(v[7], v[15]); - u[8] = _mm_sub_epi32(v[0], v[8]); - u[9] = _mm_sub_epi32(v[1], v[9]); - u[10] = _mm_sub_epi32(v[2], v[10]); - u[11] = _mm_sub_epi32(v[3], v[11]); - u[12] = _mm_sub_epi32(v[4], v[12]); - u[13] = _mm_sub_epi32(v[5], v[13]); - u[14] = _mm_sub_epi32(v[6], v[14]); - u[15] = _mm_sub_epi32(v[7], v[15]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = 
_mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - - x[0] = _mm_add_epi16(s[0], s[4]); - x[1] = _mm_add_epi16(s[1], s[5]); - x[2] = _mm_add_epi16(s[2], s[6]); - x[3] = _mm_add_epi16(s[3], s[7]); - x[4] = _mm_sub_epi16(s[0], s[4]); - x[5] = _mm_sub_epi16(s[1], s[5]); - x[6] = _mm_sub_epi16(s[2], s[6]); - x[7] = _mm_sub_epi16(s[3], s[7]); - x[8] = _mm_packs_epi32(u[0], u[1]); - x[9] = _mm_packs_epi32(u[2], u[3]); - x[10] = _mm_packs_epi32(u[4], u[5]); - x[11] = _mm_packs_epi32(u[6], u[7]); - x[12] = _mm_packs_epi32(u[8], u[9]); - x[13] = _mm_packs_epi32(u[10], u[11]); - x[14] = _mm_packs_epi32(u[12], u[13]); - x[15] = _mm_packs_epi32(u[14], u[15]); - - // stage 3 - u[0] = _mm_unpacklo_epi16(x[4], x[5]); - u[1] = _mm_unpackhi_epi16(x[4], x[5]); - u[2] = _mm_unpacklo_epi16(x[6], x[7]); - u[3] = _mm_unpackhi_epi16(x[6], x[7]); - u[4] = _mm_unpacklo_epi16(x[12], x[13]); - u[5] = _mm_unpackhi_epi16(x[12], x[13]); - u[6] = _mm_unpacklo_epi16(x[14], x[15]); - u[7] = _mm_unpackhi_epi16(x[14], x[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); - v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); - v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); - v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); - v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); - v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); - v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); - v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); - v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); - v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); - v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); - v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); - v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); - v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); - v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); - v[15] = 
_mm_madd_epi16(u[7], k__cospi_p08_p24); - - u[0] = _mm_add_epi32(v[0], v[4]); - u[1] = _mm_add_epi32(v[1], v[5]); - u[2] = _mm_add_epi32(v[2], v[6]); - u[3] = _mm_add_epi32(v[3], v[7]); - u[4] = _mm_sub_epi32(v[0], v[4]); - u[5] = _mm_sub_epi32(v[1], v[5]); - u[6] = _mm_sub_epi32(v[2], v[6]); - u[7] = _mm_sub_epi32(v[3], v[7]); - u[8] = _mm_add_epi32(v[8], v[12]); - u[9] = _mm_add_epi32(v[9], v[13]); - u[10] = _mm_add_epi32(v[10], v[14]); - u[11] = _mm_add_epi32(v[11], v[15]); - u[12] = _mm_sub_epi32(v[8], v[12]); - u[13] = _mm_sub_epi32(v[9], v[13]); - u[14] = _mm_sub_epi32(v[10], v[14]); - u[15] = _mm_sub_epi32(v[11], v[15]); - - u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = 
_mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[0] = _mm_add_epi16(x[0], x[2]); - s[1] = _mm_add_epi16(x[1], x[3]); - s[2] = _mm_sub_epi16(x[0], x[2]); - s[3] = _mm_sub_epi16(x[1], x[3]); - s[4] = _mm_packs_epi32(v[0], v[1]); - s[5] = _mm_packs_epi32(v[2], v[3]); - s[6] = _mm_packs_epi32(v[4], v[5]); - s[7] = _mm_packs_epi32(v[6], v[7]); - s[8] = _mm_add_epi16(x[8], x[10]); - s[9] = _mm_add_epi16(x[9], x[11]); - s[10] = _mm_sub_epi16(x[8], x[10]); - s[11] = _mm_sub_epi16(x[9], x[11]); - s[12] = _mm_packs_epi32(v[8], v[9]); - s[13] = _mm_packs_epi32(v[10], v[11]); - s[14] = _mm_packs_epi32(v[12], v[13]); - s[15] = _mm_packs_epi32(v[14], v[15]); - - // stage 4 - u[0] = _mm_unpacklo_epi16(s[2], s[3]); - u[1] = _mm_unpackhi_epi16(s[2], s[3]); - u[2] = _mm_unpacklo_epi16(s[6], s[7]); - u[3] = _mm_unpackhi_epi16(s[6], s[7]); - u[4] = _mm_unpacklo_epi16(s[10], s[11]); - u[5] = _mm_unpackhi_epi16(s[10], s[11]); - u[6] = _mm_unpacklo_epi16(s[14], s[15]); - u[7] = _mm_unpackhi_epi16(s[14], s[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); - v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); - v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); - v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); - v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); - v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); - v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); - v[15] = 
_mm_madd_epi16(u[7], k__cospi_p16_m16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - in[0] = s[0]; - in[1] = _mm_sub_epi16(kZero, s[8]); - in[2] = s[12]; - in[3] = _mm_sub_epi16(kZero, s[4]); - in[4] = _mm_packs_epi32(v[4], v[5]); - in[5] = _mm_packs_epi32(v[12], v[13]); - in[6] = _mm_packs_epi32(v[8], v[9]); - in[7] = _mm_packs_epi32(v[0], v[1]); - in[8] = _mm_packs_epi32(v[2], v[3]); - in[9] = 
_mm_packs_epi32(v[10], v[11]); - in[10] = _mm_packs_epi32(v[14], v[15]); - in[11] = _mm_packs_epi32(v[6], v[7]); - in[12] = s[5]; - in[13] = _mm_sub_epi16(kZero, s[13]); - in[14] = s[9]; - in[15] = _mm_sub_epi16(kZero, s[1]); -} - -void fdct16_avx2(__m128i *in0, __m128i *in1) { - fdct16_8col_avx2(in0); - fdct16_8col_avx2(in1); - array_transpose_16x16_avx2(in0, in1); -} - -void fadst16_avx2(__m128i *in0, __m128i *in1) { - fadst16_8col_avx2(in0); - fadst16_8col_avx2(in1); - array_transpose_16x16_avx2(in0, in1); -} - -void vp9_fht16x16_avx2(const int16_t *input, int16_t *output, - int stride, int tx_type) { - __m128i in0[16], in1[16]; - - switch (tx_type) { - case DCT_DCT: - vp9_fdct16x16_avx2(input, output, stride); - break; - case ADST_DCT: - load_buffer_16x16_avx2(input, in0, in1, stride); - fadst16_avx2(in0, in1); - right_shift_16x16_avx2(in0, in1); - fdct16_avx2(in0, in1); - write_buffer_16x16_avx2(output, in0, in1, 16); - break; - case DCT_ADST: - load_buffer_16x16_avx2(input, in0, in1, stride); - fdct16_avx2(in0, in1); - right_shift_16x16_avx2(in0, in1); - fadst16_avx2(in0, in1); - write_buffer_16x16_avx2(output, in0, in1, 16); - break; - case ADST_ADST: - load_buffer_16x16_avx2(input, in0, in1, stride); - fadst16_avx2(in0, in1); - right_shift_16x16_avx2(in0, in1); - fadst16_avx2(in0, in1); - write_buffer_16x16_avx2(output, in0, in1, 16); - break; - default: - assert(0); - break; - } -} #define FDCT32x32_2D_AVX2 vp9_fdct32x32_rd_avx2 #define FDCT32x32_HIGH_PRECISION 0 diff --git a/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm b/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm new file mode 100644 index 000000000..f71181c5e --- /dev/null +++ b/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm @@ -0,0 +1,70 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. 
An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro TRANSFORM_COLS 0 + paddw m0, m1 + movq m4, m0 + psubw m3, m2 + psubw m4, m3 + psraw m4, 1 + movq m5, m4 + psubw m5, m1 ;b1 + psubw m4, m2 ;c1 + psubw m0, m4 + paddw m3, m5 + ; m0 a0 + SWAP 1, 4 ; m1 c1 + SWAP 2, 3 ; m2 d1 + SWAP 3, 5 ; m3 b1 +%endmacro + +%macro TRANSPOSE_4X4 0 + movq m4, m0 + movq m5, m2 + punpcklwd m4, m1 + punpckhwd m0, m1 + punpcklwd m5, m3 + punpckhwd m2, m3 + movq m1, m4 + movq m3, m0 + punpckldq m1, m5 + punpckhdq m4, m5 + punpckldq m3, m2 + punpckhdq m0, m2 + SWAP 2, 3, 0, 1, 4 +%endmacro + +INIT_MMX mmx +cglobal fwht4x4, 3, 4, 8, input, output, stride + lea r3q, [inputq + strideq*4] + movq m0, [inputq] ;a1 + movq m1, [inputq + strideq*2] ;b1 + movq m2, [r3q] ;c1 + movq m3, [r3q + strideq*2] ;d1 + + TRANSFORM_COLS + TRANSPOSE_4X4 + TRANSFORM_COLS + TRANSPOSE_4X4 + + psllw m0, 2 + psllw m1, 2 + psllw m2, 2 + psllw m3, 2 + + movq [outputq], m0 + movq [outputq + 8], m1 + movq [outputq + 16], m2 + movq [outputq + 24], m3 + + RET diff --git a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c index 686582238..487deef42 100644 --- a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c +++ b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c @@ -12,6 +12,35 @@ #include "vp9/common/vp9_idct.h" // for cospi constants #include "vpx_ports/mem.h" +void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride) { + __m128i in0, in1; + __m128i tmp; + const __m128i zero = _mm_setzero_si128(); + in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *) + (input + 2 * stride))); + in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *) + (input + 3 * 
stride))); + + tmp = _mm_add_epi16(in0, in1); + in0 = _mm_unpacklo_epi16(zero, tmp); + in1 = _mm_unpackhi_epi16(zero, tmp); + in0 = _mm_srai_epi32(in0, 16); + in1 = _mm_srai_epi32(in1, 16); + + tmp = _mm_add_epi32(in0, in1); + in0 = _mm_unpacklo_epi32(tmp, zero); + in1 = _mm_unpackhi_epi32(tmp, zero); + + tmp = _mm_add_epi32(in0, in1); + in0 = _mm_srli_si128(tmp, 8); + + in1 = _mm_add_epi32(tmp, in0); + in0 = _mm_slli_epi32(in1, 1); + _mm_store_si128((__m128i *)(output), in0); +} + void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) { // This 2D transform implements 4 vertical 1D transforms followed // by 4 horizontal 1D transforms. The multiplies and adds are as given @@ -377,6 +406,46 @@ void vp9_fht4x4_sse2(const int16_t *input, int16_t *output, } } +void vp9_fdct8x8_1_sse2(const int16_t *input, int16_t *output, int stride) { + __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); + __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); + __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); + __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); + __m128i u0, u1, sum; + + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + + in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); + + sum = _mm_add_epi16(u0, u1); + + in0 = _mm_add_epi16(in0, in1); + in2 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, in0); + + u0 = _mm_setzero_si128(); + sum = _mm_add_epi16(sum, in2); + + in0 = _mm_unpacklo_epi16(u0, sum); + in1 = _mm_unpackhi_epi16(u0, sum); + in0 = _mm_srai_epi32(in0, 16); + in1 = _mm_srai_epi32(in1, 16); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_unpacklo_epi32(sum, u0); + in1 = _mm_unpackhi_epi32(sum, u0); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_srli_si128(sum, 
8); + + in1 = _mm_add_epi32(sum, in0); + _mm_store_si128((__m128i *)(output), in1); +} + void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) { int pass; // Constants @@ -1168,6 +1237,74 @@ void vp9_fht8x8_sse2(const int16_t *input, int16_t *output, } } +void vp9_fdct16x16_1_sse2(const int16_t *input, int16_t *output, int stride) { + __m128i in0, in1, in2, in3; + __m128i u0, u1; + __m128i sum = _mm_setzero_si128(); + int i; + + for (i = 0; i < 2; ++i) { + input += 8 * i; + in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); + in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); + + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); + + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 8 * stride)); + in1 = _mm_load_si128((const __m128i *)(input + 9 * stride)); + in2 = _mm_load_si128((const __m128i *)(input + 10 * stride)); + in3 = _mm_load_si128((const __m128i *)(input + 11 * stride)); + + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 12 * stride)); + in1 = _mm_load_si128((const __m128i *)(input + 13 * stride)); + in2 = _mm_load_si128((const __m128i *)(input + 14 * stride)); + in3 = _mm_load_si128((const __m128i *)(input + 15 * stride)); + + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, 
u0); + + sum = _mm_add_epi16(sum, u1); + } + + u0 = _mm_setzero_si128(); + in0 = _mm_unpacklo_epi16(u0, sum); + in1 = _mm_unpackhi_epi16(u0, sum); + in0 = _mm_srai_epi32(in0, 16); + in1 = _mm_srai_epi32(in1, 16); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_unpacklo_epi32(sum, u0); + in1 = _mm_unpackhi_epi32(sum, u0); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_srli_si128(sum, 8); + + in1 = _mm_add_epi32(sum, in0); + in1 = _mm_srai_epi32(in1, 1); + _mm_store_si128((__m128i *)(output), in1); +} + void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose @@ -1187,7 +1324,7 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); @@ -1513,8 +1650,8 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24); - const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08); - const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08); + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p24_p08); + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p24_p08); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, 
k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); @@ -1535,8 +1672,8 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08); - const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24); - const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24); + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p08_m24); + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p08_m24); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); @@ -1554,10 +1691,10 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { { step1_0 = _mm_add_epi16(step3_0, step2_1); step1_1 = _mm_sub_epi16(step3_0, step2_1); - step1_2 = _mm_sub_epi16(step3_3, step2_2); - step1_3 = _mm_add_epi16(step3_3, step2_2); - step1_4 = _mm_add_epi16(step3_4, step2_5); - step1_5 = _mm_sub_epi16(step3_4, step2_5); + step1_2 = _mm_add_epi16(step3_3, step2_2); + step1_3 = _mm_sub_epi16(step3_3, step2_2); + step1_4 = _mm_sub_epi16(step3_4, step2_5); + step1_5 = _mm_add_epi16(step3_4, step2_5); step1_6 = _mm_sub_epi16(step3_7, step2_6); step1_7 = _mm_add_epi16(step3_7, step2_6); } @@ -1848,7 +1985,7 @@ void fdct16_8col(__m128i *in) { const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); const __m128i k__cospi_m04_p28 = 
pair_set_epi16(-cospi_4_64, cospi_28_64); @@ -2052,10 +2189,10 @@ void fdct16_8col(__m128i *in) { v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24); v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24); - v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08); - v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08); - v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24); - v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24); + v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08); + v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08); + v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24); + v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24); v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08); v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08); @@ -2085,10 +2222,10 @@ void fdct16_8col(__m128i *in) { // stage 5 s[0] = _mm_add_epi16(p[0], t[1]); s[1] = _mm_sub_epi16(p[0], t[1]); - s[2] = _mm_sub_epi16(p[3], t[2]); - s[3] = _mm_add_epi16(p[3], t[2]); - s[4] = _mm_add_epi16(p[4], t[5]); - s[5] = _mm_sub_epi16(p[4], t[5]); + s[2] = _mm_add_epi16(p[3], t[2]); + s[3] = _mm_sub_epi16(p[3], t[2]); + s[4] = _mm_sub_epi16(p[4], t[5]); + s[5] = _mm_add_epi16(p[4], t[5]); s[6] = _mm_sub_epi16(p[7], t[6]); s[7] = _mm_add_epi16(p[7], t[6]); @@ -2680,6 +2817,77 @@ void vp9_fht16x16_sse2(const int16_t *input, int16_t *output, } } +void vp9_fdct32x32_1_sse2(const int16_t *input, int16_t *output, int stride) { + __m128i in0, in1, in2, in3; + __m128i u0, u1; + __m128i sum = _mm_setzero_si128(); + int i; + + for (i = 0; i < 8; ++i) { + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); + + input += stride; + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i 
*)(input + 24)); + + input += stride; + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); + + input += stride; + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); + + input += stride; + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + sum = _mm_add_epi16(sum, u1); + } + + u0 = _mm_setzero_si128(); + in0 = _mm_unpacklo_epi16(u0, sum); + in1 = _mm_unpackhi_epi16(u0, sum); + in0 = _mm_srai_epi32(in0, 16); + in1 = _mm_srai_epi32(in1, 16); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_unpacklo_epi32(sum, u0); + in1 = _mm_unpackhi_epi32(sum, u0); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_srli_si128(sum, 8); + + in1 = _mm_add_epi32(sum, in0); + in1 = _mm_srai_epi32(in1, 3); + _mm_store_si128((__m128i *)(output), in1); +} + #define FDCT32x32_2D vp9_fdct32x32_rd_sse2 #define FDCT32x32_HIGH_PRECISION 0 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" diff --git a/libvpx/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm b/libvpx/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm new file mode 100644 index 000000000..28458dcdd --- /dev/null +++ b/libvpx/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm @@ -0,0 +1,182 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. 
An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +%include "third_party/x86inc/x86inc.asm" + +; This file provides SSSE3 version of the forward transformation. Part +; of the macro definitions are originally derived from the ffmpeg project. +; The current version applies to x86 64-bit only. + +SECTION_RODATA + +pw_11585x2: times 8 dw 23170 +pd_8192: times 4 dd 8192 + +%macro TRANSFORM_COEFFS 2 +pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2 +pw_%2_m%1: dw %2, -%1, %2, -%1, %2, -%1, %2, -%1 +%endmacro + +TRANSFORM_COEFFS 11585, 11585 +TRANSFORM_COEFFS 15137, 6270 +TRANSFORM_COEFFS 16069, 3196 +TRANSFORM_COEFFS 9102, 13623 + +SECTION .text + +%if ARCH_X86_64 +%macro SUM_SUB 3 + psubw m%3, m%1, m%2 + paddw m%1, m%2 + SWAP %2, %3 +%endmacro + +; butterfly operation +%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2 + pmaddwd m%1, m%3, %5 + pmaddwd m%2, m%3, %6 + paddd m%1, %4 + paddd m%2, %4 + psrad m%1, 14 + psrad m%2, 14 +%endmacro + +%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 + punpckhwd m%6, m%2, m%1 + MUL_ADD_2X %7, %6, %6, %5, [pw_%4_%3], [pw_%3_m%4] + punpcklwd m%2, m%1 + MUL_ADD_2X %1, %2, %2, %5, [pw_%4_%3], [pw_%3_m%4] + packssdw m%1, m%7 + packssdw m%2, m%6 +%endmacro + +; matrix transpose +%macro INTERLEAVE_2X 4 + punpckh%1 m%4, m%2, m%3 + punpckl%1 m%2, m%3 + SWAP %3, %4 +%endmacro + +%macro TRANSPOSE8X8 9 + INTERLEAVE_2X wd, %1, %2, %9 + INTERLEAVE_2X wd, %3, %4, %9 + INTERLEAVE_2X wd, %5, %6, %9 + INTERLEAVE_2X wd, %7, %8, %9 + + INTERLEAVE_2X dq, %1, %3, %9 + INTERLEAVE_2X dq, %2, %4, %9 + INTERLEAVE_2X dq, %5, %7, %9 + INTERLEAVE_2X dq, %6, %8, %9 + + INTERLEAVE_2X qdq, %1, %5, %9 + INTERLEAVE_2X qdq, %3, %7, %9 + INTERLEAVE_2X qdq, %2, %6, %9 + INTERLEAVE_2X qdq, %4, %8, %9 + + SWAP %2, %5 + SWAP %4, %7 +%endmacro + +; 1D forward 8x8 DCT transform +%macro FDCT8_1D 1 + SUM_SUB 0, 
7, 9 + SUM_SUB 1, 6, 9 + SUM_SUB 2, 5, 9 + SUM_SUB 3, 4, 9 + + SUM_SUB 0, 3, 9 + SUM_SUB 1, 2, 9 + SUM_SUB 6, 5, 9 +%if %1 == 0 + SUM_SUB 0, 1, 9 +%endif + + BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 + + pmulhrsw m6, m12 + pmulhrsw m5, m12 +%if %1 == 0 + pmulhrsw m0, m12 + pmulhrsw m1, m12 +%else + BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 + SWAP 0, 1 +%endif + + SUM_SUB 4, 5, 9 + SUM_SUB 7, 6, 9 + BUTTERFLY_4X 4, 7, 3196, 16069, m8, 9, 10 + BUTTERFLY_4X 5, 6, 13623, 9102, m8, 9, 10 + SWAP 1, 4 + SWAP 3, 6 +%endmacro + +%macro DIVIDE_ROUND_2X 4 ; dst1, dst2, tmp1, tmp2 + psraw m%3, m%1, 15 + psraw m%4, m%2, 15 + psubw m%1, m%3 + psubw m%2, m%4 + psraw m%1, 1 + psraw m%2, 1 +%endmacro + +INIT_XMM ssse3 +cglobal fdct8x8, 3, 5, 13, input, output, stride + + mova m8, [pd_8192] + mova m12, [pw_11585x2] + pxor m11, m11 + + lea r3, [2 * strideq] + lea r4, [4 * strideq] + mova m0, [inputq] + mova m1, [inputq + r3] + lea inputq, [inputq + r4] + mova m2, [inputq] + mova m3, [inputq + r3] + lea inputq, [inputq + r4] + mova m4, [inputq] + mova m5, [inputq + r3] + lea inputq, [inputq + r4] + mova m6, [inputq] + mova m7, [inputq + r3] + + ; left shift by 2 to increase forward transformation precision + psllw m0, 2 + psllw m1, 2 + psllw m2, 2 + psllw m3, 2 + psllw m4, 2 + psllw m5, 2 + psllw m6, 2 + psllw m7, 2 + + ; column transform + FDCT8_1D 0 + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + + FDCT8_1D 1 + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + + DIVIDE_ROUND_2X 0, 1, 9, 10 + DIVIDE_ROUND_2X 2, 3, 9, 10 + DIVIDE_ROUND_2X 4, 5, 9, 10 + DIVIDE_ROUND_2X 6, 7, 9, 10 + + mova [outputq + 0], m0 + mova [outputq + 16], m1 + mova [outputq + 32], m2 + mova [outputq + 48], m3 + mova [outputq + 64], m4 + mova [outputq + 80], m5 + mova [outputq + 96], m6 + mova [outputq + 112], m7 + + RET +%endif diff --git a/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c b/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c new file mode 100644 index 000000000..c67490fad --- /dev/null +++ 
b/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Usee of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <immintrin.h> // AVX2 +#include "vpx/vpx_integer.h" + + +int64_t vp9_block_error_avx2(const int16_t *coeff, + const int16_t *dqcoeff, + intptr_t block_size, + int64_t *ssz) { + __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg; + __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi; + __m256i sse_reg_64hi, ssz_reg_64hi; + __m128i sse_reg128, ssz_reg128; + int64_t sse; + int i; + const __m256i zero_reg = _mm256_set1_epi16(0); + + // init sse and ssz registerd to zero + sse_reg = _mm256_set1_epi16(0); + ssz_reg = _mm256_set1_epi16(0); + + for (i = 0 ; i < block_size ; i+= 16) { + // load 32 bytes from coeff and dqcoeff + coeff_reg = _mm256_loadu_si256((const __m256i *)(coeff + i)); + dqcoeff_reg = _mm256_loadu_si256((const __m256i *)(dqcoeff + i)); + // dqcoeff - coeff + dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg); + // madd (dqcoeff - coeff) + dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg); + // madd coeff + coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg); + // expand each double word of madd (dqcoeff - coeff) to quad word + exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg); + exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg); + // expand each double word of madd (coeff) to quad word + exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg); + exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg); + // add each quad word of madd (dqcoeff - coeff) and madd (coeff) + sse_reg = _mm256_add_epi64(sse_reg, 
exp_dqcoeff_lo); + ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo); + sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi); + ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi); + } + // save the higher 64 bit of each 128 bit lane + sse_reg_64hi = _mm256_srli_si256(sse_reg, 8); + ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8); + // add the higher 64 bit to the low 64 bit + sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi); + ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi); + + // add each 64 bit from each of the 128 bit lane of the 256 bit + sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg), + _mm256_extractf128_si256(sse_reg, 1)); + + ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg), + _mm256_extractf128_si256(ssz_reg, 1)); + + // store the results + _mm_storel_epi64((__m128i*)(&sse), sse_reg128); + + _mm_storel_epi64((__m128i*)(ssz), ssz_reg128); + return sse; +} diff --git a/libvpx/vp9/encoder/x86/vp9_mcomp_x86.h b/libvpx/vp9/encoder/x86/vp9_mcomp_x86.h deleted file mode 100644 index c15039ad8..000000000 --- a/libvpx/vp9/encoder/x86/vp9_mcomp_x86.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#ifndef VP9_ENCODER_X86_VP9_MCOMP_X86_H_ -#define VP9_ENCODER_X86_VP9_MCOMP_X86_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#if HAVE_SSE3 -#if !CONFIG_RUNTIME_CPU_DETECT - -#undef vp9_search_full_search -#define vp9_search_full_search vp9_full_search_sadx3 - -#undef vp9_search_refining_search -#define vp9_search_refining_search vp9_refining_search_sadx4 - -#undef vp9_search_diamond_search -#define vp9_search_diamond_search vp9_diamond_search_sadx4 - -#endif -#endif - -#if HAVE_SSE4_1 -#if !CONFIG_RUNTIME_CPU_DETECT - -#undef vp9_search_full_search -#define vp9_search_full_search vp9_full_search_sadx8 - -#endif -#endif - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VP9_ENCODER_X86_VP9_MCOMP_X86_H_ - diff --git a/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm b/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm index 48ccef8cc..508e1d4f5 100644 --- a/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm +++ b/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm @@ -217,3 +217,186 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ INIT_XMM ssse3 QUANTIZE_FN b, 7 QUANTIZE_FN b_32x32, 7 + +%macro QUANTIZE_FP 2 +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ + shift, qcoeff, dqcoeff, dequant, zbin_oq, \ + eob, scan, iscan + cmp dword skipm, 0 + jne .blank + + ; actual quantize loop - setup pointers, rounders, etc. 
+ movifnidn coeffq, coeffmp + movifnidn ncoeffq, ncoeffmp + mov r2, dequantmp + movifnidn zbinq, zbinmp + movifnidn roundq, roundmp + movifnidn quantq, quantmp + mova m1, [roundq] ; m1 = round + mova m2, [quantq] ; m2 = quant +%ifidn %1, fp_32x32 + pcmpeqw m5, m5 + psrlw m5, 15 + paddw m1, m5 + psrlw m1, 1 ; m1 = (m1 + 1) / 2 +%endif + mova m3, [r2q] ; m3 = dequant + mov r3, qcoeffmp + mov r4, dqcoeffmp + mov r5, iscanmp +%ifidn %1, fp_32x32 + psllw m2, 1 +%endif + pxor m5, m5 ; m5 = dedicated zero + DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob + lea coeffq, [ coeffq+ncoeffq*2] + lea iscanq, [ iscanq+ncoeffq*2] + lea qcoeffq, [ qcoeffq+ncoeffq*2] + lea dqcoeffq, [dqcoeffq+ncoeffq*2] + neg ncoeffq + + ; get DC and first 15 AC coeffs + mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] + mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + pcmpeqw m7, m7 + + paddsw m6, m1 ; m6 += round + punpckhqdq m1, m1 + paddsw m11, m1 ; m11 += round + pmulhw m8, m6, m2 ; m8 = m6*q>>16 + punpckhqdq m2, m2 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + psignw m8, m9 ; m8 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + mova [qcoeffq+ncoeffq*2+ 0], m8 + mova [qcoeffq+ncoeffq*2+16], m13 +%ifidn %1, fp_32x32 + pabsw m8, m8 + pabsw m13, m13 +%endif + pmullw m8, m3 ; dqc[i] = qc[i] * q + punpckhqdq m3, m3 + pmullw m13, m3 ; dqc[i] = qc[i] * q +%ifidn %1, fp_32x32 + psrlw m8, 1 + psrlw m13, 1 + psignw m8, m9 + psignw m13, m10 + psrlw m0, m3, 2 +%endif + mova [dqcoeffq+ncoeffq*2+ 0], m8 + mova [dqcoeffq+ncoeffq*2+16], m13 + pcmpeqw m8, m5 ; m8 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m7 ; m11 = scan[i] + 1 + pandn m8, m6 ; m8 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m13 + add ncoeffq, mmsize + jz .accumulate_eob + +.ac_only_loop: + 
mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] + mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) +%ifidn %1, fp_32x32 + pcmpgtw m7, m6, m0 + pcmpgtw m12, m11, m0 + pmovmskb r6, m7 + pmovmskb r2, m12 + + or r6, r2 + jz .skip_iter +%endif + pcmpeqw m7, m7 + + paddsw m6, m1 ; m6 += round + paddsw m11, m1 ; m11 += round + pmulhw m14, m6, m2 ; m14 = m6*q>>16 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + psignw m14, m9 ; m14 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + mova [qcoeffq+ncoeffq*2+ 0], m14 + mova [qcoeffq+ncoeffq*2+16], m13 +%ifidn %1, fp_32x32 + pabsw m14, m14 + pabsw m13, m13 +%endif + pmullw m14, m3 ; dqc[i] = qc[i] * q + pmullw m13, m3 ; dqc[i] = qc[i] * q +%ifidn %1, fp_32x32 + psrlw m14, 1 + psrlw m13, 1 + psignw m14, m9 + psignw m13, m10 +%endif + mova [dqcoeffq+ncoeffq*2+ 0], m14 + mova [dqcoeffq+ncoeffq*2+16], m13 + pcmpeqw m14, m5 ; m14 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m7 ; m11 = scan[i] + 1 + pandn m14, m6 ; m14 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m14 + pmaxsw m8, m13 + add ncoeffq, mmsize + jl .ac_only_loop + +%ifidn %1, fp_32x32 + jmp .accumulate_eob +.skip_iter: + mova [qcoeffq+ncoeffq*2+ 0], m5 + mova [qcoeffq+ncoeffq*2+16], m5 + mova [dqcoeffq+ncoeffq*2+ 0], m5 + mova [dqcoeffq+ncoeffq*2+16], m5 + add ncoeffq, mmsize + jl .ac_only_loop +%endif + +.accumulate_eob: + ; horizontally accumulate/max eobs and write into [eob] memory pointer + mov r2, eobmp + pshufd m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0x1 + pmaxsw m8, m7 + pextrw r6, m8, 0 + mov [r2], r6 + RET + + ; skip-block, i.e. 
just write all zeroes +.blank: + mov r0, dqcoeffmp + movifnidn ncoeffq, ncoeffmp + mov r2, qcoeffmp + mov r3, eobmp + DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob + lea dqcoeffq, [dqcoeffq+ncoeffq*2] + lea qcoeffq, [ qcoeffq+ncoeffq*2] + neg ncoeffq + pxor m7, m7 +.blank_loop: + mova [dqcoeffq+ncoeffq*2+ 0], m7 + mova [dqcoeffq+ncoeffq*2+16], m7 + mova [qcoeffq+ncoeffq*2+ 0], m7 + mova [qcoeffq+ncoeffq*2+16], m7 + add ncoeffq, mmsize + jl .blank_loop + mov word [eobq], 0 + RET +%endmacro + +INIT_XMM ssse3 +QUANTIZE_FP fp, 7 +QUANTIZE_FP fp_32x32, 7 diff --git a/libvpx/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c b/libvpx/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c index f31b176e5..1feed6256 100644 --- a/libvpx/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c +++ b/libvpx/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c @@ -31,7 +31,7 @@ void vp9_sad32x32x4d_avx2(uint8_t *src, sum_ref3 = _mm256_set1_epi16(0); for (i = 0; i < 32 ; i++) { // load src and all refs - src_reg = _mm256_load_si256((__m256i *)(src)); + src_reg = _mm256_loadu_si256((__m256i *)(src)); ref0_reg = _mm256_loadu_si256((__m256i *) (ref0)); ref1_reg = _mm256_loadu_si256((__m256i *) (ref1)); ref2_reg = _mm256_loadu_si256((__m256i *) (ref2)); @@ -103,8 +103,8 @@ void vp9_sad64x64x4d_avx2(uint8_t *src, sum_ref3 = _mm256_set1_epi16(0); for (i = 0; i < 64 ; i++) { // load 64 bytes from src and all refs - src_reg = _mm256_load_si256((__m256i *)(src)); - srcnext_reg = _mm256_load_si256((__m256i *)(src + 32)); + src_reg = _mm256_loadu_si256((__m256i *)(src)); + srcnext_reg = _mm256_loadu_si256((__m256i *)(src + 32)); ref0_reg = _mm256_loadu_si256((__m256i *) (ref0)); ref0next_reg = _mm256_loadu_si256((__m256i *) (ref0 + 32)); ref1_reg = _mm256_loadu_si256((__m256i *) (ref1)); diff --git a/libvpx/vp9/encoder/x86/vp9_ssim_opt.asm b/libvpx/vp9/encoder/x86/vp9_ssim_opt_x86_64.asm index 455d10d2c..455d10d2c 100644 --- a/libvpx/vp9/encoder/x86/vp9_ssim_opt.asm +++ b/libvpx/vp9/encoder/x86/vp9_ssim_opt_x86_64.asm diff --git 
a/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c b/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c index 34ed1867f..9aa4da962 100644 --- a/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c +++ b/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c @@ -67,7 +67,7 @@ DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { #define LOAD_SRC_DST \ /* load source and destination */ \ src_reg = _mm256_loadu_si256((__m256i const *) (src)); \ - dst_reg = _mm256_load_si256((__m256i const *) (dst)); + dst_reg = _mm256_loadu_si256((__m256i const *) (dst)); #define AVG_NEXT_SRC(src_reg, size_stride) \ src_next_reg = _mm256_loadu_si256((__m256i const *) \ diff --git a/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm b/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm deleted file mode 100644 index 2ecc23e55..000000000 --- a/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm +++ /dev/null @@ -1,337 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - -%include "vpx_ports/x86_abi_support.asm" - -;void vp9_half_horiz_vert_variance16x_h_sse2 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp9_half_horiz_vert_variance16x_h_sse2) PRIVATE -sym(vp9_half_horiz_vert_variance16x_h_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - pxor xmm6, xmm6 ; error accumulator - pxor xmm7, xmm7 ; sse eaccumulator - mov rsi, arg(0) ;ref_ptr ; - - mov rdi, arg(2) ;src_ptr ; - movsxd rcx, dword ptr arg(4) ;Height ; - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - movsxd rdx, dword ptr arg(3) ;src_pixels_per_line - - pxor xmm0, xmm0 ; - - movdqu xmm5, XMMWORD PTR [rsi] - movdqu xmm3, XMMWORD PTR [rsi+1] - pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1 - - lea rsi, [rsi + rax] - -.half_horiz_vert_variance16x_h_1: - movdqu xmm1, XMMWORD PTR [rsi] ; - movdqu xmm2, XMMWORD PTR [rsi+1] ; - pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1 - - pavgb xmm5, xmm1 ; xmm = vertical average of the above - - movdqa xmm4, xmm5 - punpcklbw xmm5, xmm0 ; xmm5 = words of above - punpckhbw xmm4, xmm0 - - movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7 - punpcklbw xmm3, xmm0 ; xmm3 = words of above - psubw xmm5, xmm3 ; xmm5 -= xmm3 - - movq xmm3, QWORD PTR [rdi+8] - punpcklbw xmm3, xmm0 - psubw xmm4, xmm3 - - paddw xmm6, xmm5 ; xmm6 += accumulated column differences - paddw xmm6, xmm4 - pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 - pmaddwd xmm4, xmm4 - paddd xmm7, xmm5 ; xmm7 += accumulated square column differences - paddd xmm7, xmm4 - - movdqa xmm5, xmm1 ; save xmm1 for use on the next row - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - - sub rcx, 1 ; - jnz .half_horiz_vert_variance16x_h_1 ; - - pxor xmm1, xmm1 - pxor xmm5, xmm5 - - punpcklwd xmm0, xmm6 - punpckhwd xmm1, xmm6 - psrad xmm0, 16 - psrad xmm1, 16 
- paddd xmm0, xmm1 - movdqa xmm1, xmm0 - - movdqa xmm6, xmm7 - punpckldq xmm6, xmm5 - punpckhdq xmm7, xmm5 - paddd xmm6, xmm7 - - punpckldq xmm0, xmm5 - punpckhdq xmm1, xmm5 - paddd xmm0, xmm1 - - movdqa xmm7, xmm6 - movdqa xmm1, xmm0 - - psrldq xmm7, 8 - psrldq xmm1, 8 - - paddd xmm6, xmm7 - paddd xmm0, xmm1 - - mov rsi, arg(5) ;[Sum] - mov rdi, arg(6) ;[SSE] - - movd [rsi], xmm0 - movd [rdi], xmm6 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_half_vert_variance16x_h_sse2 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp9_half_vert_variance16x_h_sse2) PRIVATE -sym(vp9_half_vert_variance16x_h_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - pxor xmm6, xmm6 ; error accumulator - pxor xmm7, xmm7 ; sse eaccumulator - mov rsi, arg(0) ;ref_ptr - - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - movsxd rdx, dword ptr arg(3) ;src_pixels_per_line - - movdqu xmm5, XMMWORD PTR [rsi] - lea rsi, [rsi + rax ] - pxor xmm0, xmm0 - -.half_vert_variance16x_h_1: - movdqu xmm3, XMMWORD PTR [rsi] - - pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) - movdqa xmm4, xmm5 - punpcklbw xmm5, xmm0 - punpckhbw xmm4, xmm0 - - movq xmm2, QWORD PTR [rdi] - punpcklbw xmm2, xmm0 - psubw xmm5, xmm2 - movq xmm2, QWORD PTR [rdi+8] - punpcklbw xmm2, xmm0 - psubw xmm4, xmm2 - - paddw xmm6, xmm5 ; xmm6 += accumulated column differences - paddw xmm6, xmm4 - pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 - pmaddwd xmm4, xmm4 - paddd xmm7, xmm5 ; xmm7 += accumulated square column differences - paddd xmm7, xmm4 - - movdqa xmm5, xmm3 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - - sub rcx, 1 - jnz .half_vert_variance16x_h_1 - - pxor xmm1, xmm1 - pxor xmm5, xmm5 - - 
punpcklwd xmm0, xmm6 - punpckhwd xmm1, xmm6 - psrad xmm0, 16 - psrad xmm1, 16 - paddd xmm0, xmm1 - movdqa xmm1, xmm0 - - movdqa xmm6, xmm7 - punpckldq xmm6, xmm5 - punpckhdq xmm7, xmm5 - paddd xmm6, xmm7 - - punpckldq xmm0, xmm5 - punpckhdq xmm1, xmm5 - paddd xmm0, xmm1 - - movdqa xmm7, xmm6 - movdqa xmm1, xmm0 - - psrldq xmm7, 8 - psrldq xmm1, 8 - - paddd xmm6, xmm7 - paddd xmm0, xmm1 - - mov rsi, arg(5) ;[Sum] - mov rdi, arg(6) ;[SSE] - - movd [rsi], xmm0 - movd [rdi], xmm6 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_half_horiz_variance16x_h_sse2 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp9_half_horiz_variance16x_h_sse2) PRIVATE -sym(vp9_half_horiz_variance16x_h_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - pxor xmm6, xmm6 ; error accumulator - pxor xmm7, xmm7 ; sse eaccumulator - mov rsi, arg(0) ;ref_ptr ; - - mov rdi, arg(2) ;src_ptr ; - movsxd rcx, dword ptr arg(4) ;Height ; - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - movsxd rdx, dword ptr arg(3) ;src_pixels_per_line - - pxor xmm0, xmm0 ; - -.half_horiz_variance16x_h_1: - movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15 - movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16 - - pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) - movdqa xmm1, xmm5 - punpcklbw xmm5, xmm0 ; xmm5 = words of above - punpckhbw xmm1, xmm0 - - movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7 - punpcklbw xmm3, xmm0 ; xmm3 = words of above - movq xmm2, QWORD PTR [rdi+8] - punpcklbw xmm2, xmm0 - - psubw xmm5, xmm3 ; xmm5 -= xmm3 - psubw xmm1, xmm2 - paddw xmm6, xmm5 ; xmm6 += accumulated column differences - paddw xmm6, xmm1 - pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 - pmaddwd xmm1, xmm1 - paddd xmm7, xmm5 ; xmm7 += accumulated square 
column differences - paddd xmm7, xmm1 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - - sub rcx, 1 ; - jnz .half_horiz_variance16x_h_1 ; - - pxor xmm1, xmm1 - pxor xmm5, xmm5 - - punpcklwd xmm0, xmm6 - punpckhwd xmm1, xmm6 - psrad xmm0, 16 - psrad xmm1, 16 - paddd xmm0, xmm1 - movdqa xmm1, xmm0 - - movdqa xmm6, xmm7 - punpckldq xmm6, xmm5 - punpckhdq xmm7, xmm5 - paddd xmm6, xmm7 - - punpckldq xmm0, xmm5 - punpckhdq xmm1, xmm5 - paddd xmm0, xmm1 - - movdqa xmm7, xmm6 - movdqa xmm1, xmm0 - - psrldq xmm7, 8 - psrldq xmm1, 8 - - paddd xmm6, xmm7 - paddd xmm0, xmm1 - - mov rsi, arg(5) ;[Sum] - mov rdi, arg(6) ;[SSE] - - movd [rsi], xmm0 - movd [rdi], xmm6 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm b/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm index d2d13b383..21aaa9383 100644 --- a/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm +++ b/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm @@ -15,41 +15,45 @@ ; (unsigned char *frame1, | 0 ; unsigned int stride, | 1 ; unsigned char *frame2, | 2 -; unsigned int block_size, | 3 -; int strength, | 4 -; int filter_weight, | 5 -; unsigned int *accumulator, | 6 -; unsigned short *count) | 7 +; unsigned int block_width, | 3 +; unsigned int block_height, | 4 +; int strength, | 5 +; int filter_weight, | 6 +; unsigned int *accumulator, | 7 +; unsigned short *count) | 8 global sym(vp9_temporal_filter_apply_sse2) PRIVATE sym(vp9_temporal_filter_apply_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 + SHADOW_ARGS_TO_STACK 9 SAVE_XMM 7 GET_GOT rbx push rsi push rdi ALIGN_STACK 16, rax - %define block_size 0 - %define strength 16 - %define filter_weight 32 - %define rounding_bit 48 - %define rbp_backup 64 - %define stack_size 80 + %define block_width 0 + %define block_height 16 + %define strength 32 + %define filter_weight 48 + %define rounding_bit 64 + %define rbp_backup 80 + 
%define stack_size 96 sub rsp, stack_size mov [rsp + rbp_backup], rbp ; end prolog - mov rdx, arg(3) - mov [rsp + block_size], rdx - movd xmm6, arg(4) + mov edx, arg(3) + mov [rsp + block_width], rdx + mov edx, arg(4) + mov [rsp + block_height], rdx + movd xmm6, arg(5) movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read ; calculate the rounding bit outside the loop ; 0x8000 >> (16 - strength) mov rdx, 16 - sub rdx, arg(4) ; 16 - strength + sub rdx, arg(5) ; 16 - strength movq xmm4, rdx ; can't use rdx w/ shift movdqa xmm5, [GLOBAL(_const_top_bit)] psrlw xmm5, xmm4 @@ -57,11 +61,11 @@ sym(vp9_temporal_filter_apply_sse2): mov rsi, arg(0) ; src/frame1 mov rdx, arg(2) ; predictor frame - mov rdi, arg(6) ; accumulator - mov rax, arg(7) ; count + mov rdi, arg(7) ; accumulator + mov rax, arg(8) ; count ; dup the filter weight and store for later - movd xmm0, arg(5) ; filter_weight + movd xmm0, arg(6) ; filter_weight pshuflw xmm0, xmm0, 0 punpcklwd xmm0, xmm0 movdqa [rsp + filter_weight], xmm0 @@ -69,10 +73,11 @@ sym(vp9_temporal_filter_apply_sse2): mov rbp, arg(1) ; stride pxor xmm7, xmm7 ; zero for extraction - lea rcx, [rdx + 16*16*1] - cmp dword ptr [rsp + block_size], 8 + mov rcx, [rsp + block_width] + imul rcx, [rsp + block_height] + add rcx, rdx + cmp dword ptr [rsp + block_width], 8 jne .temporal_filter_apply_load_16 - lea rcx, [rdx + 8*8*1] .temporal_filter_apply_load_8: movq xmm0, [rsi] ; first row @@ -178,7 +183,7 @@ sym(vp9_temporal_filter_apply_sse2): cmp rdx, rcx je .temporal_filter_apply_epilog pxor xmm7, xmm7 ; zero for extraction - cmp dword ptr [rsp + block_size], 16 + cmp dword ptr [rsp + block_width], 16 je .temporal_filter_apply_load_16 jmp .temporal_filter_apply_load_8 diff --git a/libvpx/vp9/encoder/x86/vp9_variance_avx2.c b/libvpx/vp9/encoder/x86/vp9_variance_avx2.c index 835c51957..7f81f46b8 100644 --- a/libvpx/vp9/encoder/x86/vp9_variance_avx2.c +++ b/libvpx/vp9/encoder/x86/vp9_variance_avx2.c @@ -10,7 +10,6 @@ #include 
"./vpx_config.h" #include "vp9/encoder/vp9_variance.h" -#include "vp9/common/vp9_pragmas.h" #include "vpx_ports/mem.h" typedef void (*get_var_avx2) ( diff --git a/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm b/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm index 2c5088134..483041278 100644 --- a/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm +++ b/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm @@ -398,337 +398,4 @@ sym(vp9_get8x8var_sse2): pop rbp ret -;void vp9_half_horiz_vert_variance8x_h_sse2 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp9_half_horiz_vert_variance8x_h_sse2) PRIVATE -sym(vp9_half_horiz_vert_variance8x_h_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line - movsxd r9, dword ptr arg(3) ;src_pixels_per_line -%endif - - pxor xmm6, xmm6 ; error accumulator - pxor xmm7, xmm7 ; sse eaccumulator - mov rsi, arg(0) ;ref_ptr ; - - mov rdi, arg(2) ;src_ptr ; - movsxd rcx, dword ptr arg(4) ;Height ; - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - - pxor xmm0, xmm0 ; - - movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8 - movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9 - pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1 - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line ; next source -%else - add rsi, r8 -%endif - -.half_horiz_vert_variance8x_h_1: - - movq xmm1, QWORD PTR [rsi] ; - movq xmm2, QWORD PTR [rsi+1] ; - pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1 - - pavgb xmm5, xmm1 ; xmm = vertical average of the above - punpcklbw xmm5, xmm0 ; xmm5 = words of above - - movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8 - punpcklbw xmm3, xmm0 ; xmm3 = words of above - - psubw xmm5, xmm3 ; xmm5 -= xmm3 - paddw 
xmm6, xmm5 ; xmm6 += accumulated column differences - pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 - paddd xmm7, xmm5 ; xmm7 += accumulated square column differences - - movdqa xmm5, xmm1 ; save xmm1 for use on the next row - -%if ABI_IS_32BIT - add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source - add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination -%else - add rsi, r8 - add rdi, r9 -%endif - - sub rcx, 1 ; - jnz .half_horiz_vert_variance8x_h_1 ; - - movdq2q mm6, xmm6 ; - movdq2q mm7, xmm7 ; - - psrldq xmm6, 8 - psrldq xmm7, 8 - - movdq2q mm2, xmm6 - movdq2q mm3, xmm7 - - paddw mm6, mm2 - paddd mm7, mm3 - - pxor mm3, mm3 ; - pxor mm2, mm2 ; - - punpcklwd mm2, mm6 ; - punpckhwd mm3, mm6 ; - - paddd mm2, mm3 ; - movq mm6, mm2 ; - - psrlq mm6, 32 ; - paddd mm2, mm6 ; - - psrad mm2, 16 ; - movq mm4, mm7 ; - - psrlq mm4, 32 ; - paddd mm4, mm7 ; - - mov rsi, arg(5) ; sum - mov rdi, arg(6) ; sumsquared - - movd [rsi], mm2 ; - movd [rdi], mm4 ; - - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_half_vert_variance8x_h_sse2 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp9_half_vert_variance8x_h_sse2) PRIVATE -sym(vp9_half_vert_variance8x_h_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line - movsxd r9, dword ptr arg(3) ;src_pixels_per_line -%endif - - pxor xmm6, xmm6 ; error accumulator - pxor xmm7, xmm7 ; sse eaccumulator - mov rsi, arg(0) ;ref_ptr ; - - mov rdi, arg(2) ;src_ptr ; - movsxd rcx, dword ptr arg(4) ;Height ; - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - - pxor xmm0, xmm0 ; -.half_vert_variance8x_h_1: - movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8 - movq xmm3, QWORD PTR [rsi+rax] ; xmm3 
= s1,s2,s3..s9 - - pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) - punpcklbw xmm5, xmm0 ; xmm5 = words of above - - movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8 - punpcklbw xmm3, xmm0 ; xmm3 = words of above - - psubw xmm5, xmm3 ; xmm5 -= xmm3 - paddw xmm6, xmm5 ; xmm6 += accumulated column differences - pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 - paddd xmm7, xmm5 ; xmm7 += accumulated square column differences - -%if ABI_IS_32BIT - add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source - add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination -%else - add rsi, r8 - add rdi, r9 -%endif - sub rcx, 1 ; - jnz .half_vert_variance8x_h_1 ; - - movdq2q mm6, xmm6 ; - movdq2q mm7, xmm7 ; - - psrldq xmm6, 8 - psrldq xmm7, 8 - - movdq2q mm2, xmm6 - movdq2q mm3, xmm7 - - paddw mm6, mm2 - paddd mm7, mm3 - - pxor mm3, mm3 ; - pxor mm2, mm2 ; - - punpcklwd mm2, mm6 ; - punpckhwd mm3, mm6 ; - - paddd mm2, mm3 ; - movq mm6, mm2 ; - - psrlq mm6, 32 ; - paddd mm2, mm6 ; - - psrad mm2, 16 ; - movq mm4, mm7 ; - - psrlq mm4, 32 ; - paddd mm4, mm7 ; - - mov rsi, arg(5) ; sum - mov rdi, arg(6) ; sumsquared - - movd [rsi], mm2 ; - movd [rdi], mm4 ; - - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_half_horiz_variance8x_h_sse2 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp9_half_horiz_variance8x_h_sse2) PRIVATE -sym(vp9_half_horiz_variance8x_h_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line - movsxd r9, dword ptr arg(3) ;src_pixels_per_line -%endif - - pxor xmm6, xmm6 ; error accumulator - pxor xmm7, xmm7 ; sse eaccumulator - mov rsi, arg(0) ;ref_ptr ; - - mov rdi, arg(2) ;src_ptr ; - movsxd rcx, dword ptr arg(4) ;Height ; - - 
pxor xmm0, xmm0 ; -.half_horiz_variance8x_h_1: - movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8 - movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9 - - pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) - punpcklbw xmm5, xmm0 ; xmm5 = words of above - - movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8 - punpcklbw xmm3, xmm0 ; xmm3 = words of above - - psubw xmm5, xmm3 ; xmm5 -= xmm3 - paddw xmm6, xmm5 ; xmm6 += accumulated column differences - pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 - paddd xmm7, xmm5 ; xmm7 += accumulated square column differences - -%if ABI_IS_32BIT - add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source - add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination -%else - add rsi, r8 - add rdi, r9 -%endif - sub rcx, 1 ; - jnz .half_horiz_variance8x_h_1 ; - - movdq2q mm6, xmm6 ; - movdq2q mm7, xmm7 ; - - psrldq xmm6, 8 - psrldq xmm7, 8 - - movdq2q mm2, xmm6 - movdq2q mm3, xmm7 - - paddw mm6, mm2 - paddd mm7, mm3 - - pxor mm3, mm3 ; - pxor mm2, mm2 ; - - punpcklwd mm2, mm6 ; - punpckhwd mm3, mm6 ; - - paddd mm2, mm3 ; - movq mm6, mm2 ; - - psrlq mm6, 32 ; - paddd mm2, mm6 ; - - psrad mm2, 16 ; - movq mm4, mm7 ; - - psrlq mm4, 32 ; - paddd mm4, mm7 ; - - mov rsi, arg(5) ; sum - mov rdi, arg(6) ; sumsquared - - movd [rsi], mm2 ; - movd [rdi], mm4 ; - - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/libvpx/vp9/encoder/x86/vp9_variance_mmx.c b/libvpx/vp9/encoder/x86/vp9_variance_mmx.c index c4d17fc0f..ce1c83297 100644 --- a/libvpx/vp9/encoder/x86/vp9_variance_mmx.c +++ b/libvpx/vp9/encoder/x86/vp9_variance_mmx.c @@ -10,144 +10,94 @@ #include "./vpx_config.h" #include "vp9/encoder/vp9_variance.h" -#include "vp9/common/vp9_pragmas.h" #include "vpx_ports/mem.h" -extern unsigned int vp9_get8x8var_mmx -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); -extern unsigned int vp9_get4x4var_mmx -( - const 
unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); - -unsigned int vp9_variance4x4_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - vp9_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 4)); -} +unsigned int vp9_get8x8var_mmx(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum); -unsigned int vp9_variance8x8_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; +unsigned int vp9_get4x4var_mmx(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *SSE, int *sum); - vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg); - *sse = var; +unsigned int vp9_variance4x4_mmx(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + vp9_get4x4var_mmx(src, src_stride, ref, ref_stride, sse, &sum); + return *sse - (((unsigned int)sum * sum) >> 4); +} - return (var - (((unsigned int)avg * avg) >> 6)); +unsigned int vp9_variance8x8_mmx(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, sse, &sum); + return *sse - (((unsigned int)sum * sum) >> 6); } -unsigned int vp9_mse16x16_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int sse0, sse1, sse2, sse3, var; +unsigned int vp9_mse16x16_mmx(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + unsigned int sse0, sse1, sse2, sse3; int sum0, 
sum1, sum2, sum3; + vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0); + vp9_get8x8var_mmx(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1); + vp9_get8x8var_mmx(src + 8 * src_stride, src_stride, + ref + 8 * ref_stride, ref_stride, &sse2, &sum2); + vp9_get8x8var_mmx(src + 8 * src_stride + 8, src_stride, + ref + 8 * ref_stride + 8, ref_stride, &sse3, &sum3); - vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, - &sum0); - vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, - &sse1, &sum1); - vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, - ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2); - vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, - ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3); - - var = sse0 + sse1 + sse2 + sse3; - *sse = var; - return var; + *sse = sse0 + sse1 + sse2 + sse3; + return *sse; } -unsigned int vp9_variance16x16_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int sse0, sse1, sse2, sse3, var; - int sum0, sum1, sum2, sum3, avg; - - vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, - &sum0); - vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, - &sse1, &sum1); - vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, - ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2); - vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, - ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3); - - var = sse0 + sse1 + sse2 + sse3; - avg = sum0 + sum1 + sum2 + sum3; - *sse = var; - return (var - (((unsigned int)avg * avg) >> 8)); -} +unsigned int vp9_variance16x16_mmx(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + unsigned int sse0, sse1, sse2, sse3; + int sum0, sum1, sum2, sum3, sum; -unsigned int vp9_variance16x8_mmx( - const 
unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int sse0, sse1, var; - int sum0, sum1, avg; - - vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, - &sum0); - vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, - &sse1, &sum1); - - var = sse0 + sse1; - avg = sum0 + sum1; - *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); + vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0); + vp9_get8x8var_mmx(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1); + vp9_get8x8var_mmx(src + 8 * src_stride, src_stride, + ref + 8 * ref_stride, ref_stride, &sse2, &sum2); + vp9_get8x8var_mmx(src + 8 * src_stride + 8, src_stride, + ref + 8 * ref_stride + 8, ref_stride, &sse3, &sum3); + + *sse = sse0 + sse1 + sse2 + sse3; + sum = sum0 + sum1 + sum2 + sum3; + return *sse - (((unsigned int)sum * sum) >> 8); } +unsigned int vp9_variance16x8_mmx(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + unsigned int sse0, sse1; + int sum0, sum1, sum; + + vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0); + vp9_get8x8var_mmx(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1); + + *sse = sse0 + sse1; + sum = sum0 + sum1; + return *sse - (((unsigned int)sum * sum) >> 7); +} -unsigned int vp9_variance8x16_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int sse0, sse1, var; - int sum0, sum1, avg; - vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, - &sum0); - vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, - ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1); +unsigned int vp9_variance8x16_mmx(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + unsigned int sse0, sse1; + int sum0, sum1, sum; - var = sse0 
+ sse1; - avg = sum0 + sum1; - *sse = var; + vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0); + vp9_get8x8var_mmx(src + 8 * src_stride, src_stride, + ref + 8 * ref_stride, ref_stride, &sse1, &sum1); - return (var - (((unsigned int)avg * avg) >> 7)); + *sse = sse0 + sse1; + sum = sum0 + sum1; + return *sse - (((unsigned int)sum * sum) >> 7); } diff --git a/libvpx/vp9/encoder/x86/vp9_variance_sse2.c b/libvpx/vp9/encoder/x86/vp9_variance_sse2.c index 9e65694a8..e935a233a 100644 --- a/libvpx/vp9/encoder/x86/vp9_variance_sse2.c +++ b/libvpx/vp9/encoder/x86/vp9_variance_sse2.c @@ -11,113 +11,29 @@ #include "./vpx_config.h" #include "vp9/encoder/vp9_variance.h" -#include "vp9/common/vp9_pragmas.h" #include "vpx_ports/mem.h" -extern unsigned int vp9_get4x4var_mmx -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); - -unsigned int vp9_get16x16var_sse2 -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); -unsigned int vp9_get8x8var_sse2 -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); -void vp9_half_horiz_vert_variance8x_h_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int *sum, - unsigned int *sumsquared -); -void vp9_half_horiz_vert_variance16x_h_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int *sum, - unsigned int *sumsquared -); -void vp9_half_horiz_variance8x_h_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int *sum, - unsigned int *sumsquared -); -void 
vp9_half_horiz_variance16x_h_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int *sum, - unsigned int *sumsquared -); -void vp9_half_vert_variance8x_h_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int *sum, - unsigned int *sumsquared -); -void vp9_half_vert_variance16x_h_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int *sum, - unsigned int *sumsquared -); - -typedef unsigned int (*get_var_sse2) ( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); - -static void variance_sse2(const unsigned char *src_ptr, int source_stride, - const unsigned char *ref_ptr, int recon_stride, - int w, int h, unsigned int *sse, int *sum, - get_var_sse2 var_fn, int block_size) { - unsigned int sse0; - int sum0; +typedef unsigned int (*variance_fn_t) (const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse, int *sum); + +unsigned int vp9_get4x4var_mmx(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse, int *sum); + + +unsigned int vp9_get8x8var_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse, int *sum); + +unsigned int vp9_get16x16var_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse, int *sum); + +static void variance_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + int w, int h, unsigned int *sse, int *sum, + variance_fn_t var_fn, int block_size) { int i, j; *sse = 0; @@ -125,217 +41,139 @@ static void variance_sse2(const 
unsigned char *src_ptr, int source_stride, for (i = 0; i < h; i += block_size) { for (j = 0; j < w; j += block_size) { - var_fn(src_ptr + source_stride * i + j, source_stride, - ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0); + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, &sse0, &sum0); *sse += sse0; *sum += sum0; } } } -unsigned int vp9_variance4x4_sse2( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, - &var, &avg, vp9_get4x4var_mmx, 4); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 4)); +unsigned int vp9_variance4x4_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 4, 4, + sse, &sum, vp9_get4x4var_mmx, 4); + return *sse - (((unsigned int)sum * sum) >> 4); } -unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, +unsigned int vp9_variance8x4_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 4, - &var, &avg, vp9_get4x4var_mmx, 4); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 5)); + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 8, 4, + sse, &sum, vp9_get4x4var_mmx, 4); + return *sse - (((unsigned int)sum * sum) >> 5); } -unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, +unsigned int vp9_variance4x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse) { - unsigned int var; - int avg; - - 
variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 8, - &var, &avg, vp9_get4x4var_mmx, 4); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 5)); + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 4, 8, + sse, &sum, vp9_get4x4var_mmx, 4); + return *sse - (((unsigned int)sum * sum) >> 5); } -unsigned int vp9_variance8x8_sse2 -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, - &var, &avg, vp9_get8x8var_sse2, 8); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 6)); +unsigned int vp9_variance8x8_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 8, 8, + sse, &sum, vp9_get8x8var_sse2, 8); + return *sse - (((unsigned int)sum * sum) >> 6); } -unsigned int vp9_variance16x8_sse2 -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, - &var, &avg, vp9_get8x8var_sse2, 8); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); +unsigned int vp9_variance16x8_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 16, 8, + sse, &sum, vp9_get8x8var_sse2, 8); + return *sse - (((unsigned int)sum * sum) >> 7); } -unsigned int vp9_variance8x16_sse2 -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, - &var, &avg, vp9_get8x8var_sse2, 8); - *sse = var; - 
return (var - (((unsigned int)avg * avg) >> 7)); +unsigned int vp9_variance8x16_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 8, 16, + sse, &sum, vp9_get8x8var_sse2, 8); + return *sse - (((unsigned int)sum * sum) >> 7); } -unsigned int vp9_variance16x16_sse2 -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, - &var, &avg, vp9_get16x16var_sse2, 16); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 8)); +unsigned int vp9_variance16x16_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 16, 16, + sse, &sum, vp9_get16x16var_sse2, 16); + return *sse - (((unsigned int)sum * sum) >> 8); } -unsigned int vp9_mse16x16_sse2( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int sse0; - int sum0; - vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, - &sum0); - *sse = sse0; - return sse0; +unsigned int vp9_mse16x16_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + vp9_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum); + return *sse; } -unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, +unsigned int vp9_variance32x32_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32, - &var, &avg, vp9_get16x16var_sse2, 16); - *sse = 
var; - return (var - (((int64_t)avg * avg) >> 10)); + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 32, 32, + sse, &sum, vp9_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 10); } -unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, +unsigned int vp9_variance32x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16, - &var, &avg, vp9_get16x16var_sse2, 16); - *sse = var; - return (var - (((int64_t)avg * avg) >> 9)); + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 32, 16, + sse, &sum, vp9_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 9); } -unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, +unsigned int vp9_variance16x32_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 32, - &var, &avg, vp9_get16x16var_sse2, 16); - *sse = var; - return (var - (((int64_t)avg * avg) >> 9)); + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 16, 32, + sse, &sum, vp9_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 9); } -unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, +unsigned int vp9_variance64x64_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64, - &var, &avg, vp9_get16x16var_sse2, 16); - *sse = var; - return (var - (((int64_t)avg * avg) >> 12)); + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 64, 64, + sse, &sum, 
vp9_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 12); } -unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, +unsigned int vp9_variance64x32_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32, - &var, &avg, vp9_get16x16var_sse2, 16); - *sse = var; - return (var - (((int64_t)avg * avg) >> 11)); + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 64, 32, + sse, &sum, vp9_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 11); } -unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, +unsigned int vp9_variance32x64_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 64, - &var, &avg, vp9_get16x16var_sse2, 16); - *sse = var; - return (var - (((int64_t)avg * avg) >> 11)); + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 32, 64, + sse, &sum, vp9_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 11); } #define DECL(w, opt) \ @@ -494,58 +332,3 @@ FNS(ssse3, ssse3); #undef FNS #undef FN - -unsigned int vp9_variance_halfpixvar16x16_h_sse2( - const unsigned char *src_ptr, - int src_pixels_per_line, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - int xsum0; - unsigned int xxsum0; - - vp9_half_horiz_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); -} - - -unsigned int vp9_variance_halfpixvar16x16_v_sse2( - const unsigned char *src_ptr, - int src_pixels_per_line, - const unsigned char *dst_ptr, - int 
dst_pixels_per_line, - unsigned int *sse) { - int xsum0; - unsigned int xxsum0; - vp9_half_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); -} - - -unsigned int vp9_variance_halfpixvar16x16_hv_sse2( - const unsigned char *src_ptr, - int src_pixels_per_line, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - int xsum0; - unsigned int xxsum0; - - vp9_half_horiz_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); -} diff --git a/libvpx/vp9/vp9_common.mk b/libvpx/vp9/vp9_common.mk index b1ba0b133..8e3e88522 100644 --- a/libvpx/vp9/vp9_common.mk +++ b/libvpx/vp9/vp9_common.mk @@ -10,7 +10,6 @@ VP9_COMMON_SRCS-yes += vp9_common.mk VP9_COMMON_SRCS-yes += vp9_iface_common.h -VP9_COMMON_SRCS-yes += common/vp9_pragmas.h VP9_COMMON_SRCS-yes += common/vp9_ppflags.h VP9_COMMON_SRCS-yes += common/vp9_alloccommon.c VP9_COMMON_SRCS-yes += common/vp9_blockd.c @@ -51,6 +50,8 @@ VP9_COMMON_SRCS-yes += common/vp9_seg_common.h VP9_COMMON_SRCS-yes += common/vp9_seg_common.c VP9_COMMON_SRCS-yes += common/vp9_systemdependent.h VP9_COMMON_SRCS-yes += common/vp9_textblit.h +VP9_COMMON_SRCS-yes += common/vp9_thread.h +VP9_COMMON_SRCS-yes += common/vp9_thread.c VP9_COMMON_SRCS-yes += common/vp9_tile_common.h VP9_COMMON_SRCS-yes += common/vp9_tile_common.c VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c @@ -66,7 +67,6 @@ VP9_COMMON_SRCS-yes += common/vp9_common_data.h VP9_COMMON_SRCS-yes += common/vp9_scan.c VP9_COMMON_SRCS-yes += common/vp9_scan.h -VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c 
VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_loopfilter_intrin_avx2.c @@ -119,29 +119,34 @@ VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_horiz_loopfilter_d VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.h +VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_intrin_ssse3.c +ifeq ($(ARCH_X86_64), yes) +VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_ssse3_x86_64.asm +endif -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve_neon.c -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct16x16_neon.c -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_16_neon.c -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_avg_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_16_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct4x4_1_add_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct4x4_add_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct8x8_1_add_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct8x8_add_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct16x16_1_add_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct16x16_add_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct32x32_1_add_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct32x32_add_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_NEON) += 
common/arm/neon/vp9_mb_lpf_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_copy_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_avg_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_save_reg_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_reconintra_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_convolve_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct16x16_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_convolve8_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_convolve8_avg_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct4x4_1_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct4x4_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct8x8_1_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct8x8_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct16x16_1_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct16x16_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct32x32_1_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct32x32_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_iht4x4_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_iht8x8_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_mb_lpf_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_copy_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_avg_neon$(ASM) 
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_save_reg_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_reconintra_neon$(ASM) $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl)) diff --git a/libvpx/vp9/vp9_cx_iface.c b/libvpx/vp9/vp9_cx_iface.c index 152e1f46e..bf8eec717 100644 --- a/libvpx/vp9/vp9_cx_iface.c +++ b/libvpx/vp9/vp9_cx_iface.c @@ -11,10 +11,11 @@ #include <stdlib.h> #include <string.h> +#include "./vpx_config.h" #include "vpx/vpx_codec.h" #include "vpx/internal/vpx_codec_internal.h" #include "./vpx_version.h" -#include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/encoder/vp9_encoder.h" #include "vpx/vp8cx.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/vp9_iface_common.h" @@ -39,10 +40,11 @@ struct vp9_extracfg { AQ_MODE aq_mode; unsigned int frame_periodic_boost; BIT_DEPTH bit_depth; + vp9e_tune_content content; }; struct extraconfig_map { - int usage; + unsigned int usage; struct vp9_extracfg cfg; }; @@ -69,6 +71,7 @@ static const struct extraconfig_map extracfg_map[] = { NO_AQ, // aq_mode 0, // frame_periodic_delta_q BITS_8, // Bit depth + VP9E_CONTENT_DEFAULT // content } } }; @@ -77,7 +80,7 @@ struct vpx_codec_alg_priv { vpx_codec_priv_t base; vpx_codec_enc_cfg_t cfg; struct vp9_extracfg extra_cfg; - VP9_CONFIG oxcf; + VP9EncoderConfig oxcf; VP9_COMP *cpi; unsigned char *cx_data; size_t cx_data_sz; @@ -88,8 +91,8 @@ struct vpx_codec_alg_priv { size_t pending_frame_magnitude; vpx_image_t preview_img; vp8_postproc_cfg_t preview_ppcfg; - vpx_codec_pkt_list_decl(64) pkt_list; - unsigned int fixed_kf_cntr; + vpx_codec_pkt_list_decl(256) pkt_list; + unsigned int fixed_kf_cntr; }; static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) { @@ -168,7 +171,25 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100); RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_LAST_PASS); + if (cfg->rc_resize_allowed == 1) { + 
RANGE_CHECK(cfg, rc_scaled_width, 1, cfg->g_w); + RANGE_CHECK(cfg, rc_scaled_height, 1, cfg->g_h); + } + RANGE_CHECK(cfg, ss_number_layers, 1, VPX_SS_MAX_LAYERS); + +#if CONFIG_SPATIAL_SVC + if (cfg->ss_number_layers > 1) { + unsigned int i, alt_ref_sum = 0; + for (i = 0; i < cfg->ss_number_layers; ++i) { + if (cfg->ss_enable_auto_alt_ref[i]) + ++alt_ref_sum; + } + if (alt_ref_sum > REF_FRAMES - cfg->ss_number_layers) + ERROR("Not enough ref buffers for svc alt ref frames"); + } +#endif + RANGE_CHECK(cfg, ts_number_layers, 1, VPX_TS_MAX_LAYERS); if (cfg->ts_number_layers > 1) { unsigned int i; @@ -182,7 +203,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, ERROR("ts_rate_decimator factors are not powers of 2"); } - // VP8 does not support a lower bound on the keyframe interval in + // VP9 does not support a lower bound on the keyframe interval in // automatic keyframe placement mode. if (cfg->kf_mode != VPX_KF_DISABLED && cfg->kf_min_dist != cfg->kf_max_dist && @@ -200,14 +221,16 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK_HI(extra_cfg, arnr_strength, 6); RANGE_CHECK(extra_cfg, arnr_type, 1, 3); RANGE_CHECK(extra_cfg, cq_level, 0, 63); + RANGE_CHECK(extra_cfg, content, + VP9E_CONTENT_DEFAULT, VP9E_CONTENT_INVALID - 1); // TODO(yaowu): remove this when ssim tuning is implemented for vp9 if (extra_cfg->tuning == VP8_TUNE_SSIM) ERROR("Option --tune=ssim is not currently supported in VP9."); if (cfg->g_pass == VPX_RC_LAST_PASS) { - size_t packet_sz = sizeof(FIRSTPASS_STATS); - int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz); + const size_t packet_sz = sizeof(FIRSTPASS_STATS); + const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz); const FIRSTPASS_STATS *stats; if (cfg->rc_twopass_stats_in.buf == NULL) @@ -240,7 +263,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, layer_id = (int)stats->spatial_layer_id; if (layer_id >= cfg->ss_number_layers - 
||(int)(stats->count + 0.5) != n_packets_per_layer[layer_id] - 1) + ||(unsigned int)(stats->count + 0.5) != + n_packets_per_layer[layer_id] - 1) ERROR("rc_twopass_stats_in missing EOS stats packet"); } } else { @@ -254,6 +278,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, ERROR("rc_twopass_stats_in missing EOS stats packet"); } } + if (cfg->g_profile <= (unsigned int)PROFILE_1 && extra_cfg->bit_depth > BITS_8) ERROR("High bit-depth not supported in profile < 2"); @@ -276,6 +301,7 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx, default: ERROR("Invalid image format. Only YV12, I420, I422, I444 images are " "supported."); + break; } if (img->d_w != ctx->cfg.g_w || img->d_h != ctx->cfg.g_h) @@ -284,9 +310,19 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static int get_image_bps(const vpx_image_t *img) { + switch (img->fmt) { + case VPX_IMG_FMT_YV12: + case VPX_IMG_FMT_I420: return 12; + case VPX_IMG_FMT_I422: return 16; + case VPX_IMG_FMT_I444: return 24; + default: assert(0 && "Invalid image format"); break; + } + return 0; +} static vpx_codec_err_t set_encoder_config( - VP9_CONFIG *oxcf, + VP9EncoderConfig *oxcf, const vpx_codec_enc_cfg_t *cfg, const struct vp9_extracfg *extra_cfg) { oxcf->profile = cfg->g_profile; @@ -300,41 +336,44 @@ static vpx_codec_err_t set_encoder_config( switch (cfg->g_pass) { case VPX_RC_ONE_PASS: - oxcf->mode = MODE_GOODQUALITY; + oxcf->mode = ONE_PASS_GOOD; + oxcf->pass = 0; break; case VPX_RC_FIRST_PASS: - oxcf->mode = MODE_FIRSTPASS; + oxcf->mode = TWO_PASS_FIRST; + oxcf->pass = 1; break; case VPX_RC_LAST_PASS: - oxcf->mode = MODE_SECONDPASS_BEST; + oxcf->mode = TWO_PASS_SECOND_BEST; + oxcf->pass = 2; break; } oxcf->lag_in_frames = cfg->g_pass == VPX_RC_FIRST_PASS ? 
0 : cfg->g_lag_in_frames; + oxcf->rc_mode = cfg->rc_end_usage; - oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK; - if (cfg->rc_end_usage == VPX_CQ) - oxcf->end_usage = USAGE_CONSTRAINED_QUALITY; - else if (cfg->rc_end_usage == VPX_Q) - oxcf->end_usage = USAGE_CONSTANT_QUALITY; - else if (cfg->rc_end_usage == VPX_CBR) - oxcf->end_usage = USAGE_STREAM_FROM_SERVER; - - oxcf->target_bandwidth = cfg->rc_target_bitrate; + // Convert target bandwidth from Kbit/s to Bit/s + oxcf->target_bandwidth = 1000 * cfg->rc_target_bitrate; oxcf->rc_max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct; - oxcf->best_allowed_q = q_trans[cfg->rc_min_quantizer]; - oxcf->worst_allowed_q = q_trans[cfg->rc_max_quantizer]; - oxcf->cq_level = q_trans[extra_cfg->cq_level]; + oxcf->best_allowed_q = + extra_cfg->lossless ? 0 : vp9_quantizer_to_qindex(cfg->rc_min_quantizer); + oxcf->worst_allowed_q = + extra_cfg->lossless ? 0 : vp9_quantizer_to_qindex(cfg->rc_max_quantizer); + oxcf->cq_level = vp9_quantizer_to_qindex(extra_cfg->cq_level); oxcf->fixed_q = -1; oxcf->under_shoot_pct = cfg->rc_undershoot_pct; oxcf->over_shoot_pct = cfg->rc_overshoot_pct; - oxcf->maximum_buffer_size = cfg->rc_buf_sz; - oxcf->starting_buffer_level = cfg->rc_buf_initial_sz; - oxcf->optimal_buffer_level = cfg->rc_buf_optimal_sz; + oxcf->allow_spatial_resampling = cfg->rc_resize_allowed; + oxcf->scaled_frame_width = cfg->rc_scaled_width; + oxcf->scaled_frame_height = cfg->rc_scaled_height; + + oxcf->maximum_buffer_size_ms = cfg->rc_buf_sz; + oxcf->starting_buffer_level_ms = cfg->rc_buf_initial_sz; + oxcf->optimal_buffer_level_ms = cfg->rc_buf_optimal_sz; oxcf->drop_frames_water_mark = cfg->rc_dropframe_thresh; @@ -347,7 +386,7 @@ static vpx_codec_err_t set_encoder_config( oxcf->key_freq = cfg->kf_max_dist; - oxcf->cpu_used = extra_cfg->cpu_used; + oxcf->speed = abs(extra_cfg->cpu_used); oxcf->encode_breakout = extra_cfg->static_thresh; oxcf->play_alternate = extra_cfg->enable_auto_alt_ref; oxcf->noise_sensitivity = 
extra_cfg->noise_sensitivity; @@ -356,17 +395,20 @@ static vpx_codec_err_t set_encoder_config( oxcf->two_pass_stats_in = cfg->rc_twopass_stats_in; oxcf->output_pkt_list = extra_cfg->pkt_list; +#if CONFIG_FP_MB_STATS + oxcf->firstpass_mb_stats_in = cfg->rc_firstpass_mb_stats_in; +#endif + oxcf->arnr_max_frames = extra_cfg->arnr_max_frames; oxcf->arnr_strength = extra_cfg->arnr_strength; oxcf->arnr_type = extra_cfg->arnr_type; oxcf->tuning = extra_cfg->tuning; + oxcf->content = extra_cfg->content; oxcf->tile_columns = extra_cfg->tile_columns; oxcf->tile_rows = extra_cfg->tile_rows; - oxcf->lossless = extra_cfg->lossless; - oxcf->error_resilient_mode = cfg->g_error_resilient; oxcf->frame_parallel_decoding_mode = extra_cfg->frame_parallel_decoding_mode; @@ -377,7 +419,13 @@ static vpx_codec_err_t set_encoder_config( oxcf->ss_number_layers = cfg->ss_number_layers; if (oxcf->ss_number_layers > 1) { - vp9_copy(oxcf->ss_target_bitrate, cfg->ss_target_bitrate); + int i; + for (i = 0; i < VPX_SS_MAX_LAYERS; ++i) { + oxcf->ss_target_bitrate[i] = 1000 * cfg->ss_target_bitrate[i]; +#if CONFIG_SPATIAL_SVC + oxcf->ss_play_alternate[i] = cfg->ss_enable_auto_alt_ref[i]; +#endif + } } else if (oxcf->ss_number_layers == 1) { oxcf->ss_target_bitrate[0] = (int)oxcf->target_bandwidth; } @@ -385,8 +433,11 @@ static vpx_codec_err_t set_encoder_config( oxcf->ts_number_layers = cfg->ts_number_layers; if (oxcf->ts_number_layers > 1) { - vp9_copy(oxcf->ts_target_bitrate, cfg->ts_target_bitrate); - vp9_copy(oxcf->ts_rate_decimator, cfg->ts_rate_decimator); + int i; + for (i = 0; i < VPX_TS_MAX_LAYERS; ++i) { + oxcf->ts_target_bitrate[i] = 1000 * cfg->ts_target_bitrate[i]; + oxcf->ts_rate_decimator[i] = cfg->ts_rate_decimator[i]; + } } else if (oxcf->ts_number_layers == 1) { oxcf->ts_target_bitrate[0] = (int)oxcf->target_bandwidth; oxcf->ts_rate_decimator[0] = 1; @@ -410,6 +461,9 @@ static vpx_codec_err_t set_encoder_config( printf("fixed_q: %d\n", oxcf->fixed_q); printf("worst_allowed_q: 
%d\n", oxcf->worst_allowed_q); printf("best_allowed_q: %d\n", oxcf->best_allowed_q); + printf("allow_spatial_resampling: %d\n", oxcf->allow_spatial_resampling); + printf("scaled_frame_width: %d\n", oxcf->scaled_frame_width); + printf("scaled_frame_height: %d\n", oxcf->scaled_frame_height); printf("two_pass_vbrbias: %d\n", oxcf->two_pass_vbrbias); printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section); printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section); @@ -449,79 +503,168 @@ static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, return res; } +static vpx_codec_err_t ctrl_get_quantizer(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) + return VPX_CODEC_INVALID_PARAM; + *arg = vp9_get_quantizer(ctx->cpi); + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_get_quantizer64(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) + return VPX_CODEC_INVALID_PARAM; + *arg = vp9_qindex_to_quantizer(vp9_get_quantizer(ctx->cpi)); + return VPX_CODEC_OK; +} + +static vpx_codec_err_t update_extra_cfg(vpx_codec_alg_priv_t *ctx, + const struct vp9_extracfg *extra_cfg) { + const vpx_codec_err_t res = validate_config(ctx, &ctx->cfg, extra_cfg); + if (res == VPX_CODEC_OK) { + ctx->extra_cfg = *extra_cfg; + set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); + vp9_change_config(ctx->cpi, &ctx->oxcf); + } + return res; +} + +static vpx_codec_err_t ctrl_set_cpuused(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.cpu_used = CAST(VP8E_SET_CPUUSED, args); + return update_extra_cfg(ctx, &extra_cfg); +} -int vp9_reverse_trans(int q); +static vpx_codec_err_t ctrl_set_enable_auto_alt_ref(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_auto_alt_ref = CAST(VP8E_SET_ENABLEAUTOALTREF, args); + return 
update_extra_cfg(ctx, &extra_cfg); +} +static vpx_codec_err_t ctrl_set_noise_sensitivity(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.noise_sensitivity = CAST(VP8E_SET_NOISE_SENSITIVITY, args); + return update_extra_cfg(ctx, &extra_cfg); +} -static vpx_codec_err_t ctrl_get_param(vpx_codec_alg_priv_t *ctx, int ctrl_id, - va_list args) { - void *arg = va_arg(args, void *); +static vpx_codec_err_t ctrl_set_sharpness(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.sharpness = CAST(VP8E_SET_SHARPNESS, args); + return update_extra_cfg(ctx, &extra_cfg); +} -#define MAP(id, var) case id: *(RECAST(id, arg)) = var; break +static vpx_codec_err_t ctrl_set_static_thresh(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.static_thresh = CAST(VP8E_SET_STATIC_THRESHOLD, args); + return update_extra_cfg(ctx, &extra_cfg); +} - if (arg == NULL) - return VPX_CODEC_INVALID_PARAM; +static vpx_codec_err_t ctrl_set_tile_columns(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.tile_columns = CAST(VP9E_SET_TILE_COLUMNS, args); + return update_extra_cfg(ctx, &extra_cfg); +} - switch (ctrl_id) { - MAP(VP8E_GET_LAST_QUANTIZER, vp9_get_quantizer(ctx->cpi)); - MAP(VP8E_GET_LAST_QUANTIZER_64, - vp9_reverse_trans(vp9_get_quantizer(ctx->cpi))); - } +static vpx_codec_err_t ctrl_set_tile_rows(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.tile_rows = CAST(VP9E_SET_TILE_ROWS, args); + return update_extra_cfg(ctx, &extra_cfg); +} - return VPX_CODEC_OK; -#undef MAP +static vpx_codec_err_t ctrl_set_arnr_max_frames(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.arnr_max_frames = CAST(VP8E_SET_ARNR_MAXFRAMES, args); + return update_extra_cfg(ctx, 
&extra_cfg); } +static vpx_codec_err_t ctrl_set_arnr_strength(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.arnr_strength = CAST(VP8E_SET_ARNR_STRENGTH, args); + return update_extra_cfg(ctx, &extra_cfg); +} -static vpx_codec_err_t ctrl_set_param(vpx_codec_alg_priv_t *ctx, int ctrl_id, - va_list args) { - vpx_codec_err_t res = VPX_CODEC_OK; +static vpx_codec_err_t ctrl_set_arnr_type(vpx_codec_alg_priv_t *ctx, + va_list args) { struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.arnr_type = CAST(VP8E_SET_ARNR_TYPE, args); + return update_extra_cfg(ctx, &extra_cfg); +} -#define MAP(id, var) case id: var = CAST(id, args); break; - - switch (ctrl_id) { - MAP(VP8E_SET_CPUUSED, extra_cfg.cpu_used); - MAP(VP8E_SET_ENABLEAUTOALTREF, extra_cfg.enable_auto_alt_ref); - MAP(VP8E_SET_NOISE_SENSITIVITY, extra_cfg.noise_sensitivity); - MAP(VP8E_SET_SHARPNESS, extra_cfg.sharpness); - MAP(VP8E_SET_STATIC_THRESHOLD, extra_cfg.static_thresh); - MAP(VP9E_SET_TILE_COLUMNS, extra_cfg.tile_columns); - MAP(VP9E_SET_TILE_ROWS, extra_cfg.tile_rows); - MAP(VP8E_SET_ARNR_MAXFRAMES, extra_cfg.arnr_max_frames); - MAP(VP8E_SET_ARNR_STRENGTH, extra_cfg.arnr_strength); - MAP(VP8E_SET_ARNR_TYPE, extra_cfg.arnr_type); - MAP(VP8E_SET_TUNING, extra_cfg.tuning); - MAP(VP8E_SET_CQ_LEVEL, extra_cfg.cq_level); - MAP(VP8E_SET_MAX_INTRA_BITRATE_PCT, extra_cfg.rc_max_intra_bitrate_pct); - MAP(VP9E_SET_LOSSLESS, extra_cfg.lossless); - MAP(VP9E_SET_FRAME_PARALLEL_DECODING, - extra_cfg.frame_parallel_decoding_mode); - MAP(VP9E_SET_AQ_MODE, extra_cfg.aq_mode); - MAP(VP9E_SET_FRAME_PERIODIC_BOOST, extra_cfg.frame_periodic_boost); - } +static vpx_codec_err_t ctrl_set_tuning(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.tuning = CAST(VP8E_SET_TUNING, args); + return update_extra_cfg(ctx, &extra_cfg); +} - res = validate_config(ctx, &ctx->cfg, &extra_cfg); +static vpx_codec_err_t 
ctrl_set_cq_level(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.cq_level = CAST(VP8E_SET_CQ_LEVEL, args); + return update_extra_cfg(ctx, &extra_cfg); +} - if (res == VPX_CODEC_OK) { - ctx->extra_cfg = extra_cfg; - set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); - vp9_change_config(ctx->cpi, &ctx->oxcf); - } +static vpx_codec_err_t ctrl_set_rc_max_intra_bitrate_pct( + vpx_codec_alg_priv_t *ctx, va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.rc_max_intra_bitrate_pct = + CAST(VP8E_SET_MAX_INTRA_BITRATE_PCT, args); + return update_extra_cfg(ctx, &extra_cfg); +} - return res; -#undef MAP +static vpx_codec_err_t ctrl_set_lossless(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.lossless = CAST(VP9E_SET_LOSSLESS, args); + return update_extra_cfg(ctx, &extra_cfg); } -static vpx_codec_err_t encoder_common_init(vpx_codec_ctx_t *ctx) { +static vpx_codec_err_t ctrl_set_frame_parallel_decoding_mode( + vpx_codec_alg_priv_t *ctx, va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.frame_parallel_decoding_mode = + CAST(VP9E_SET_FRAME_PARALLEL_DECODING, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_aq_mode(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.aq_mode = CAST(VP9E_SET_AQ_MODE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_frame_periodic_boost(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.frame_periodic_boost = CAST(VP9E_SET_FRAME_PERIODIC_BOOST, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, + vpx_codec_priv_enc_mr_cfg_t *data) { vpx_codec_err_t res = VPX_CODEC_OK; + (void)data; if (ctx->priv == NULL) { int i; 
vpx_codec_enc_cfg_t *cfg; struct vpx_codec_alg_priv *priv = calloc(1, sizeof(*priv)); - if (priv == NULL) return VPX_CODEC_MEM_ERROR; + if (priv == NULL) + return VPX_CODEC_MEM_ERROR; ctx->priv = &priv->base; ctx->priv->sz = sizeof(*ctx->priv); @@ -531,8 +674,7 @@ static vpx_codec_err_t encoder_common_init(vpx_codec_ctx_t *ctx) { ctx->priv->enc.total_encoders = 1; if (ctx->config.enc) { - // Update the reference to the config structure to an - // internal copy. + // Update the reference to the config structure to an internal copy. ctx->priv->alg_priv->cfg = *ctx->config.enc; ctx->config.enc = &ctx->priv->alg_priv->cfg; } @@ -549,15 +691,6 @@ static vpx_codec_err_t encoder_common_init(vpx_codec_ctx_t *ctx) { priv->extra_cfg = extracfg_map[i].cfg; priv->extra_cfg.pkt_list = &priv->pkt_list.head; - // Maximum buffer size approximated based on having multiple ARF. - priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 8; - - if (priv->cx_data_sz < 4096) priv->cx_data_sz = 4096; - - priv->cx_data = (unsigned char *)malloc(priv->cx_data_sz); - if (priv->cx_data == NULL) - return VPX_CODEC_MEM_ERROR; - vp9_initialize_enc(); res = validate_config(priv, &priv->cfg, &priv->extra_cfg); @@ -565,8 +698,8 @@ static vpx_codec_err_t encoder_common_init(vpx_codec_ctx_t *ctx) { if (res == VPX_CODEC_OK) { VP9_COMP *cpi; set_encoder_config(&ctx->priv->alg_priv->oxcf, - &ctx->priv->alg_priv->cfg, - &ctx->priv->alg_priv->extra_cfg); + &ctx->priv->alg_priv->cfg, + &ctx->priv->alg_priv->extra_cfg); cpi = vp9_create_compressor(&ctx->priv->alg_priv->oxcf); if (cpi == NULL) res = VPX_CODEC_MEM_ERROR; @@ -578,12 +711,6 @@ static vpx_codec_err_t encoder_common_init(vpx_codec_ctx_t *ctx) { return res; } - -static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, - vpx_codec_priv_enc_mr_cfg_t *data) { - return encoder_common_init(ctx); -} - static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) { free(ctx->cx_data); vp9_remove_compressor(ctx->cpi); @@ -595,7 +722,7 @@ static 
void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, unsigned long duration, unsigned long deadline) { // Use best quality mode if no deadline is given. - MODE new_qc = MODE_BESTQUALITY; + MODE new_qc = ONE_PASS_BEST; if (deadline) { // Convert duration parameter from stream timebase to microseconds @@ -605,14 +732,14 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, // If the deadline is more that the duration this frame is to be shown, // use good quality mode. Otherwise use realtime mode. - new_qc = (deadline > duration_us) ? MODE_GOODQUALITY : MODE_REALTIME; + new_qc = (deadline > duration_us) ? ONE_PASS_GOOD : REALTIME; } if (ctx->cfg.g_pass == VPX_RC_FIRST_PASS) - new_qc = MODE_FIRSTPASS; + new_qc = TWO_PASS_FIRST; else if (ctx->cfg.g_pass == VPX_RC_LAST_PASS) - new_qc = (new_qc == MODE_BESTQUALITY) ? MODE_SECONDPASS_BEST - : MODE_SECONDPASS; + new_qc = (new_qc == ONE_PASS_BEST) ? TWO_PASS_SECOND_BEST + : TWO_PASS_SECOND_GOOD; if (ctx->oxcf.mode != new_qc) { ctx->oxcf.mode = new_qc; @@ -620,7 +747,8 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, } } - +// Turn on to test if supplemental superframe data breaks decoding +// #define TEST_SUPPLEMENTAL_SUPERFRAME_DATA static int write_superframe_index(vpx_codec_alg_priv_t *ctx) { uint8_t marker = 0xc0; unsigned int mask; @@ -646,6 +774,20 @@ static int write_superframe_index(vpx_codec_alg_priv_t *ctx) { if (ctx->pending_cx_data_sz + index_sz < ctx->cx_data_sz) { uint8_t *x = ctx->pending_cx_data + ctx->pending_cx_data_sz; int i, j; +#ifdef TEST_SUPPLEMENTAL_SUPERFRAME_DATA + uint8_t marker_test = 0xc0; + int mag_test = 2; // 1 - 4 + int frames_test = 4; // 1 - 8 + int index_sz_test = 2 + mag_test * frames_test; + marker_test |= frames_test - 1; + marker_test |= (mag_test - 1) << 3; + *x++ = marker_test; + for (i = 0; i < mag_test * frames_test; ++i) + *x++ = 0; // fill up with arbitrary data + *x++ = marker_test; + ctx->pending_cx_data_sz += index_sz_test; + printf("Added 
supplemental superframe data\n"); +#endif *x++ = marker; for (i = 0; i < ctx->pending_frame_count; i++) { @@ -658,10 +800,27 @@ static int write_superframe_index(vpx_codec_alg_priv_t *ctx) { } *x++ = marker; ctx->pending_cx_data_sz += index_sz; +#ifdef TEST_SUPPLEMENTAL_SUPERFRAME_DATA + index_sz += index_sz_test; +#endif } return index_sz; } +// vp9 uses 10,000,000 ticks/second as time stamp +#define TICKS_PER_SEC 10000000LL + +static int64_t timebase_units_to_ticks(const vpx_rational_t *timebase, + int64_t n) { + return n * TICKS_PER_SEC * timebase->num / timebase->den; +} + +static int64_t ticks_to_timebase_units(const vpx_rational_t *timebase, + int64_t n) { + const int64_t round = TICKS_PER_SEC * timebase->num / 2 - 1; + return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC; +} + static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, const vpx_image_t *img, vpx_codec_pts_t pts, @@ -669,9 +828,26 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, vpx_enc_frame_flags_t flags, unsigned long deadline) { vpx_codec_err_t res = VPX_CODEC_OK; + const vpx_rational_t *const timebase = &ctx->cfg.g_timebase; - if (img) + if (img != NULL) { res = validate_img(ctx, img); + // TODO(jzern) the checks related to cpi's validity should be treated as a + // failure condition, encoder setup is done fully in init() currently. + if (res == VPX_CODEC_OK && ctx->cpi != NULL && ctx->cx_data == NULL) { + // There's no codec control for multiple alt-refs so check the encoder + // instance for its status to determine the compressed data size. + ctx->cx_data_sz = ctx->cfg.g_w * ctx->cfg.g_h * + get_image_bps(img) / 8 * + (ctx->cpi->multi_arf_allowed ? 
8 : 2); + if (ctx->cx_data_sz < 4096) ctx->cx_data_sz = 4096; + + ctx->cx_data = (unsigned char *)malloc(ctx->cx_data_sz); + if (ctx->cx_data == NULL) { + return VPX_CODEC_MEM_ERROR; + } + } + } pick_quickcompress_mode(ctx, duration, deadline); vpx_codec_pkt_list_init(&ctx->pkt_list); @@ -683,42 +859,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_INVALID_PARAM; } - if (flags & (VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_GF | - VP8_EFLAG_NO_REF_ARF)) { - int ref = 7; - - if (flags & VP8_EFLAG_NO_REF_LAST) - ref ^= VP9_LAST_FLAG; - - if (flags & VP8_EFLAG_NO_REF_GF) - ref ^= VP9_GOLD_FLAG; - - if (flags & VP8_EFLAG_NO_REF_ARF) - ref ^= VP9_ALT_FLAG; - - vp9_use_as_reference(ctx->cpi, ref); - } - - if (flags & (VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF | - VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_FORCE_GF | - VP8_EFLAG_FORCE_ARF)) { - int upd = 7; - - if (flags & VP8_EFLAG_NO_UPD_LAST) - upd ^= VP9_LAST_FLAG; - - if (flags & VP8_EFLAG_NO_UPD_GF) - upd ^= VP9_GOLD_FLAG; - - if (flags & VP8_EFLAG_NO_UPD_ARF) - upd ^= VP9_ALT_FLAG; - - vp9_update_reference(ctx->cpi, upd); - } - - if (flags & VP8_EFLAG_NO_UPD_ENTROPY) { - vp9_update_entropy(ctx->cpi, 0); - } + vp9_apply_encoding_flags(ctx->cpi, flags); // Handle fixed keyframe intervals if (ctx->cfg.kf_mode == VPX_KF_AUTO && @@ -731,9 +872,11 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, // Initialize the encoder instance on the first frame. 
if (res == VPX_CODEC_OK && ctx->cpi != NULL) { - unsigned int lib_flags; + unsigned int lib_flags = 0; YV12_BUFFER_CONFIG sd; - int64_t dst_time_stamp, dst_end_time_stamp; + int64_t dst_time_stamp = timebase_units_to_ticks(timebase, pts); + int64_t dst_end_time_stamp = + timebase_units_to_ticks(timebase, pts + duration); size_t size, cx_data_sz; unsigned char *cx_data; @@ -741,19 +884,12 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) ((VP9_COMP *)ctx->cpi)->b_calculate_psnr = 1; - // Convert API flags to internal codec lib flags - lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0; - - /* vp9 use 10,000,000 ticks/second as time stamp */ - dst_time_stamp = (pts * 10000000 * ctx->cfg.g_timebase.num) - / ctx->cfg.g_timebase.den; - dst_end_time_stamp = (pts + duration) * 10000000 * ctx->cfg.g_timebase.num / - ctx->cfg.g_timebase.den; - if (img != NULL) { res = image2yuvconfig(img, &sd); - if (vp9_receive_raw_frame(ctx->cpi, lib_flags, + // Store the original flags in to the frame buffer. Will extract the + // key frame flag when we actually encode this frame. + if (vp9_receive_raw_frame(ctx->cpi, flags, &sd, dst_time_stamp, dst_end_time_stamp)) { VP9_COMP *cpi = (VP9_COMP *)ctx->cpi; res = update_error_state(ctx, &cpi->common.error); @@ -762,7 +898,6 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, cx_data = ctx->cx_data; cx_data_sz = ctx->cx_data_sz; - lib_flags = 0; /* Any pending invisible frames? 
*/ if (ctx->pending_cx_data) { @@ -785,12 +920,21 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, cx_data, &dst_time_stamp, &dst_end_time_stamp, !img)) { if (size) { - vpx_codec_pts_t round, delta; - vpx_codec_cx_pkt_t pkt; VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi; + vpx_codec_cx_pkt_t pkt; + +#if CONFIG_SPATIAL_SVC + if (is_spatial_svc(cpi)) + cpi->svc.layer_context[cpi->svc.spatial_layer_id].layer_size += size; +#endif // Pack invisible frames with the next visible frame - if (cpi->common.show_frame == 0) { + if (cpi->common.show_frame == 0 +#if CONFIG_SPATIAL_SVC + || (is_spatial_svc(cpi) && + cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1) +#endif + ) { if (ctx->pending_cx_data == 0) ctx->pending_cx_data = cx_data; ctx->pending_cx_data_sz += size; @@ -802,18 +946,19 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, } // Add the frame packet to the list of returned packets. - round = (vpx_codec_pts_t)1000000 * ctx->cfg.g_timebase.num / 2 - 1; - delta = (dst_end_time_stamp - dst_time_stamp); pkt.kind = VPX_CODEC_CX_FRAME_PKT; - pkt.data.frame.pts = - (dst_time_stamp * ctx->cfg.g_timebase.den + round) - / ctx->cfg.g_timebase.num / 10000000; - pkt.data.frame.duration = (unsigned long) - ((delta * ctx->cfg.g_timebase.den + round) - / ctx->cfg.g_timebase.num / 10000000); + pkt.data.frame.pts = ticks_to_timebase_units(timebase, dst_time_stamp); + pkt.data.frame.duration = + (unsigned long)ticks_to_timebase_units(timebase, + dst_end_time_stamp - dst_time_stamp); pkt.data.frame.flags = lib_flags << 16; - if (lib_flags & FRAMEFLAGS_KEY) + if (lib_flags & FRAMEFLAGS_KEY +#if CONFIG_SPATIAL_SVC + || (is_spatial_svc(cpi) && + cpi->svc.layer_context[0].is_key_frame) +#endif + ) pkt.data.frame.flags |= VPX_FRAME_IS_KEY; if (cpi->common.show_frame == 0) { @@ -823,9 +968,8 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, // prior PTS so that if a decoder uses pts to schedule when // to do this, we start 
right after last frame was decoded. // Invisible frames have no duration. - pkt.data.frame.pts = ((cpi->last_time_stamp_seen - * ctx->cfg.g_timebase.den + round) - / ctx->cfg.g_timebase.num / 10000000) + 1; + pkt.data.frame.pts = + ticks_to_timebase_units(timebase, cpi->last_time_stamp_seen) + 1; pkt.data.frame.duration = 0; } @@ -851,6 +995,18 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt); cx_data += size; cx_data_sz -= size; +#if CONFIG_SPATIAL_SVC + if (is_spatial_svc(cpi)) { + vpx_codec_cx_pkt_t pkt = {0}; + int i; + pkt.kind = VPX_CODEC_SPATIAL_SVC_LAYER_SIZES; + for (i = 0; i < cpi->svc.number_spatial_layers; ++i) { + pkt.data.layer_sizes[i] = cpi->svc.layer_context[i].layer_size; + cpi->svc.layer_context[i].layer_size = 0; + } + vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt); + } +#endif } } } @@ -858,14 +1014,13 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, return res; } - -static const vpx_codec_cx_pkt_t *encoder_get_cxdata(vpx_codec_alg_priv_t *ctx, +static const vpx_codec_cx_pkt_t *encoder_get_cxdata(vpx_codec_alg_priv_t *ctx, vpx_codec_iter_t *iter) { return vpx_codec_pkt_list_get(&ctx->pkt_list.head, iter); } static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, va_list args) { + va_list args) { vpx_ref_frame_t *const frame = va_arg(args, vpx_ref_frame_t *); if (frame != NULL) { @@ -881,7 +1036,7 @@ static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, va_list args) { + va_list args) { vpx_ref_frame_t *const frame = va_arg(args, vpx_ref_frame_t *); if (frame != NULL) { @@ -897,13 +1052,13 @@ static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, va_list args) { - vp9_ref_frame_t *frame = va_arg(args, vp9_ref_frame_t *); + va_list 
args) { + vp9_ref_frame_t *const frame = va_arg(args, vp9_ref_frame_t *); if (frame != NULL) { - YV12_BUFFER_CONFIG* fb; + YV12_BUFFER_CONFIG *fb = get_ref_frame(&ctx->cpi->common, frame->idx); + if (fb == NULL) return VPX_CODEC_ERROR; - vp9_get_reference_enc(ctx->cpi, frame->idx, &fb); yuvconfig2image(&frame->img, fb, NULL); return VPX_CODEC_OK; } else { @@ -912,11 +1067,9 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t ctrl_set_previewpp(vpx_codec_alg_priv_t *ctx, - int ctr_id, va_list args) { + va_list args) { #if CONFIG_VP9_POSTPROC vp8_postproc_cfg_t *config = va_arg(args, vp8_postproc_cfg_t *); - (void)ctr_id; - if (config != NULL) { ctx->preview_ppcfg = *config; return VPX_CODEC_OK; @@ -925,7 +1078,6 @@ static vpx_codec_err_t ctrl_set_previewpp(vpx_codec_alg_priv_t *ctx, } #else (void)ctx; - (void)ctr_id; (void)args; return VPX_CODEC_INCAPABLE; #endif @@ -934,7 +1086,8 @@ static vpx_codec_err_t ctrl_set_previewpp(vpx_codec_alg_priv_t *ctx, static vpx_image_t *encoder_get_preview(vpx_codec_alg_priv_t *ctx) { YV12_BUFFER_CONFIG sd; - vp9_ppflags_t flags = {0}; + vp9_ppflags_t flags; + vp9_zero(flags); if (ctx->preview_ppcfg.post_proc_flag) { flags.post_proc_flag = ctx->preview_ppcfg.post_proc_flag; @@ -951,39 +1104,46 @@ static vpx_image_t *encoder_get_preview(vpx_codec_alg_priv_t *ctx) { } static vpx_codec_err_t ctrl_update_entropy(vpx_codec_alg_priv_t *ctx, - int ctr_id, va_list args) { + va_list args) { const int update = va_arg(args, int); + vp9_update_entropy(ctx->cpi, update); return VPX_CODEC_OK; } static vpx_codec_err_t ctrl_update_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, va_list args) { + va_list args) { const int ref_frame_flags = va_arg(args, int); + vp9_update_reference(ctx->cpi, ref_frame_flags); return VPX_CODEC_OK; } static vpx_codec_err_t ctrl_use_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, va_list args) { + va_list args) { const int reference_flag = va_arg(args, int); + 
vp9_use_as_reference(ctx->cpi, reference_flag); return VPX_CODEC_OK; } static vpx_codec_err_t ctrl_set_roi_map(vpx_codec_alg_priv_t *ctx, - int ctr_id, va_list args) { + va_list args) { + (void)ctx; + (void)args; + // TODO(yaowu): Need to re-implement and test for VP9. return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_set_active_map(vpx_codec_alg_priv_t *ctx, - int ctr_id, va_list args) { + va_list args) { vpx_active_map_t *const map = va_arg(args, vpx_active_map_t *); if (map) { - if (!vp9_set_active_map(ctx->cpi, map->active_map, map->rows, map->cols)) + if (!vp9_set_active_map(ctx->cpi, map->active_map, + (int)map->rows, (int)map->cols)) return VPX_CODEC_OK; else return VPX_CODEC_INVALID_PARAM; @@ -993,7 +1153,7 @@ static vpx_codec_err_t ctrl_set_active_map(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx, - int ctr_id, va_list args) { + va_list args) { vpx_scaling_mode_t *const mode = va_arg(args, vpx_scaling_mode_t *); if (mode) { @@ -1006,10 +1166,10 @@ static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx, } } -static vpx_codec_err_t ctrl_set_svc(vpx_codec_alg_priv_t *ctx, int ctr_id, - va_list args) { +static vpx_codec_err_t ctrl_set_svc(vpx_codec_alg_priv_t *ctx, va_list args) { int data = va_arg(args, int); const vpx_codec_enc_cfg_t *cfg = &ctx->cfg; + vp9_set_svc(ctx->cpi, data); // CBR or two pass mode for SVC with both temporal and spatial layers // not yet supported. @@ -1025,11 +1185,11 @@ static vpx_codec_err_t ctrl_set_svc(vpx_codec_alg_priv_t *ctx, int ctr_id, } static vpx_codec_err_t ctrl_set_svc_layer_id(vpx_codec_alg_priv_t *ctx, - int ctr_id, va_list args) { vpx_svc_layer_id_t *const data = va_arg(args, vpx_svc_layer_id_t *); VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi; SVC *const svc = &cpi->svc; + svc->spatial_layer_id = data->spatial_layer_id; svc->temporal_layer_id = data->temporal_layer_id; // Checks on valid layer_id input. 
@@ -1045,32 +1205,34 @@ static vpx_codec_err_t ctrl_set_svc_layer_id(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t ctrl_set_svc_parameters(vpx_codec_alg_priv_t *ctx, - int ctr_id, va_list args) { + va_list args) { VP9_COMP *const cpi = ctx->cpi; vpx_svc_parameters_t *const params = va_arg(args, vpx_svc_parameters_t *); - if (params == NULL) + if (params == NULL || params->spatial_layer < 0 || + params->spatial_layer >= cpi->svc.number_spatial_layers) return VPX_CODEC_INVALID_PARAM; - cpi->svc.spatial_layer_id = params->spatial_layer; - cpi->svc.temporal_layer_id = params->temporal_layer; - - cpi->lst_fb_idx = params->lst_fb_idx; - cpi->gld_fb_idx = params->gld_fb_idx; - cpi->alt_fb_idx = params->alt_fb_idx; - - if (vp9_set_size_literal(ctx->cpi, params->width, params->height) != 0) - return VPX_CODEC_INVALID_PARAM; - - ctx->cfg.rc_max_quantizer = params->max_quantizer; - ctx->cfg.rc_min_quantizer = params->min_quantizer; + if (params->spatial_layer == 0) { + int i; + for (i = 0; i < cpi->svc.number_spatial_layers; ++i) { + cpi->svc.layer_context[i].svc_params_received.spatial_layer = -1; + } + } - set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); - vp9_change_config(ctx->cpi, &ctx->oxcf); + cpi->svc.layer_context[params->spatial_layer].svc_params_received = + *params; return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_set_tune_content(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.content = CAST(VP9E_SET_TUNE_CONTENT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { {VP8_COPY_REFERENCE, ctrl_copy_reference}, {VP8E_UPD_ENTROPY, ctrl_update_entropy}, @@ -1083,30 +1245,31 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { {VP8E_SET_ROI_MAP, ctrl_set_roi_map}, {VP8E_SET_ACTIVEMAP, ctrl_set_active_map}, {VP8E_SET_SCALEMODE, ctrl_set_scale_mode}, - {VP8E_SET_CPUUSED, ctrl_set_param}, - {VP8E_SET_NOISE_SENSITIVITY, 
ctrl_set_param}, - {VP8E_SET_ENABLEAUTOALTREF, ctrl_set_param}, - {VP8E_SET_SHARPNESS, ctrl_set_param}, - {VP8E_SET_STATIC_THRESHOLD, ctrl_set_param}, - {VP9E_SET_TILE_COLUMNS, ctrl_set_param}, - {VP9E_SET_TILE_ROWS, ctrl_set_param}, - {VP8E_SET_ARNR_MAXFRAMES, ctrl_set_param}, - {VP8E_SET_ARNR_STRENGTH, ctrl_set_param}, - {VP8E_SET_ARNR_TYPE, ctrl_set_param}, - {VP8E_SET_TUNING, ctrl_set_param}, - {VP8E_SET_CQ_LEVEL, ctrl_set_param}, - {VP8E_SET_MAX_INTRA_BITRATE_PCT, ctrl_set_param}, - {VP9E_SET_LOSSLESS, ctrl_set_param}, - {VP9E_SET_FRAME_PARALLEL_DECODING, ctrl_set_param}, - {VP9E_SET_AQ_MODE, ctrl_set_param}, - {VP9E_SET_FRAME_PERIODIC_BOOST, ctrl_set_param}, + {VP8E_SET_CPUUSED, ctrl_set_cpuused}, + {VP8E_SET_NOISE_SENSITIVITY, ctrl_set_noise_sensitivity}, + {VP8E_SET_ENABLEAUTOALTREF, ctrl_set_enable_auto_alt_ref}, + {VP8E_SET_SHARPNESS, ctrl_set_sharpness}, + {VP8E_SET_STATIC_THRESHOLD, ctrl_set_static_thresh}, + {VP9E_SET_TILE_COLUMNS, ctrl_set_tile_columns}, + {VP9E_SET_TILE_ROWS, ctrl_set_tile_rows}, + {VP8E_SET_ARNR_MAXFRAMES, ctrl_set_arnr_max_frames}, + {VP8E_SET_ARNR_STRENGTH, ctrl_set_arnr_strength}, + {VP8E_SET_ARNR_TYPE, ctrl_set_arnr_type}, + {VP8E_SET_TUNING, ctrl_set_tuning}, + {VP8E_SET_CQ_LEVEL, ctrl_set_cq_level}, + {VP8E_SET_MAX_INTRA_BITRATE_PCT, ctrl_set_rc_max_intra_bitrate_pct}, + {VP9E_SET_LOSSLESS, ctrl_set_lossless}, + {VP9E_SET_FRAME_PARALLEL_DECODING, ctrl_set_frame_parallel_decoding_mode}, + {VP9E_SET_AQ_MODE, ctrl_set_aq_mode}, + {VP9E_SET_FRAME_PERIODIC_BOOST, ctrl_set_frame_periodic_boost}, {VP9E_SET_SVC, ctrl_set_svc}, {VP9E_SET_SVC_PARAMETERS, ctrl_set_svc_parameters}, {VP9E_SET_SVC_LAYER_ID, ctrl_set_svc_layer_id}, + {VP9E_SET_TUNE_CONTENT, ctrl_set_tune_content}, // Getters - {VP8E_GET_LAST_QUANTIZER, ctrl_get_param}, - {VP8E_GET_LAST_QUANTIZER_64, ctrl_get_param}, + {VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer}, + {VP8E_GET_LAST_QUANTIZER_64, ctrl_get_quantizer64}, {VP9_GET_REFERENCE, ctrl_get_reference}, { -1, NULL}, @@ 
-1132,12 +1295,15 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { 0, // rc_dropframe_thresh 0, // rc_resize_allowed + 1, // rc_scaled_width + 1, // rc_scaled_height 60, // rc_resize_down_thresold 30, // rc_resize_up_thresold VPX_VBR, // rc_end_usage #if VPX_ENCODER_ABI_VERSION > (1 + VPX_CODEC_ABI_VERSION) - {0}, // rc_twopass_stats_in + {NULL, 0}, // rc_twopass_stats_in + {NULL, 0}, // rc_firstpass_mb_stats_in #endif 256, // rc_target_bandwidth 0, // rc_min_quantizer @@ -1159,6 +1325,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { 9999, // kf_max_dist VPX_SS_DEFAULT_LAYERS, // ss_number_layers + {0}, {0}, // ss_target_bitrate 1, // ts_number_layers {0}, // ts_target_bitrate @@ -1170,7 +1337,6 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { #endif } }, - { -1, {NOT_IMPLEMENTED}} }; #ifndef VERSION_STRING @@ -1183,21 +1349,21 @@ CODEC_INTERFACE(vpx_codec_vp9_cx) = { encoder_init, // vpx_codec_init_fn_t encoder_destroy, // vpx_codec_destroy_fn_t encoder_ctrl_maps, // vpx_codec_ctrl_fn_map_t - NOT_IMPLEMENTED, // vpx_codec_get_mmap_fn_t - NOT_IMPLEMENTED, // vpx_codec_set_mmap_fn_t { // NOLINT NOT_IMPLEMENTED, // vpx_codec_peek_si_fn_t NOT_IMPLEMENTED, // vpx_codec_get_si_fn_t NOT_IMPLEMENTED, // vpx_codec_decode_fn_t NOT_IMPLEMENTED, // vpx_codec_frame_get_fn_t + NOT_IMPLEMENTED // vpx_codec_set_fb_fn_t }, { // NOLINT + 1, // 1 cfg map encoder_usage_cfg_map, // vpx_codec_enc_cfg_map_t encoder_encode, // vpx_codec_encode_fn_t encoder_get_cxdata, // vpx_codec_get_cx_data_fn_t encoder_set_config, // vpx_codec_enc_config_set_fn_t NOT_IMPLEMENTED, // vpx_codec_get_global_headers_fn_t encoder_get_preview, // vpx_codec_get_preview_frame_fn_t - NOT_IMPLEMENTED , // vpx_codec_enc_mr_get_mem_loc_fn_t + NOT_IMPLEMENTED // vpx_codec_enc_mr_get_mem_loc_fn_t } }; diff --git a/libvpx/vp9/vp9_dx_iface.c b/libvpx/vp9/vp9_dx_iface.c index 5ed7484ab..4372ac9e5 100644 --- a/libvpx/vp9/vp9_dx_iface.c +++ b/libvpx/vp9/vp9_dx_iface.c @@ -20,6 
+20,7 @@ #include "vp9/common/vp9_frame_buffers.h" #include "vp9/decoder/vp9_decoder.h" +#include "vp9/decoder/vp9_decodeframe.h" #include "vp9/decoder/vp9_read_bit_buffer.h" #include "vp9/vp9_iface_common.h" @@ -32,21 +33,16 @@ struct vpx_codec_alg_priv { vpx_codec_priv_t base; vpx_codec_dec_cfg_t cfg; vp9_stream_info_t si; - int decoder_init; struct VP9Decoder *pbi; int postproc_cfg_set; vp8_postproc_cfg_t postproc_cfg; -#if CONFIG_POSTPROC_VISUALIZER - unsigned int dbg_postproc_flag; - int dbg_color_ref_frame_flag; - int dbg_color_mb_modes_flag; - int dbg_color_b_modes_flag; - int dbg_display_mv_flag; -#endif + vpx_decrypt_cb decrypt_cb; + void *decrypt_state; vpx_image_t img; - int img_setup; int img_avail; + int flushed; int invert_tile_order; + int frame_parallel_decode; // frame-based threading. // External frame buffer info to save for VP9 common. void *ext_priv; // Private data associated with the external frame buffers. @@ -55,10 +51,12 @@ struct vpx_codec_alg_priv { }; static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx, - vpx_codec_priv_enc_mr_cfg_t *data) { + vpx_codec_priv_enc_mr_cfg_t *data) { // This function only allocates space for the vpx_codec_alg_priv_t // structure. More memory may be required at the time the stream // information becomes known. + (void)data; + if (!ctx->priv) { vpx_codec_alg_priv_t *alg_priv = vpx_memalign(32, sizeof(*alg_priv)); if (alg_priv == NULL) @@ -72,6 +70,12 @@ static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx, ctx->priv->alg_priv = alg_priv; ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si); ctx->priv->init_flags = ctx->init_flags; + ctx->priv->alg_priv->flushed = 0; + ctx->priv->alg_priv->frame_parallel_decode = + (ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING); + + // Disable frame parallel decoding for now. + ctx->priv->alg_priv->frame_parallel_decode = 0; if (ctx->config.dec) { // Update the reference to the config structure to an internal copy. 
@@ -94,11 +98,38 @@ static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) { return VPX_CODEC_OK; } -static vpx_codec_err_t decoder_peek_si(const uint8_t *data, - unsigned int data_sz, - vpx_codec_stream_info_t *si) { - if (data_sz <= 8) - return VPX_CODEC_UNSUP_BITSTREAM; +static int parse_bitdepth_colorspace_sampling( + BITSTREAM_PROFILE profile, struct vp9_read_bit_buffer *rb) { + const int sRGB = 7; + int colorspace; + if (profile >= PROFILE_2) + rb->bit_offset += 1; // Bit-depth 10 or 12. + colorspace = vp9_rb_read_literal(rb, 3); + if (colorspace != sRGB) { + rb->bit_offset += 1; // [16,235] (including xvycc) vs [0,255] range. + if (profile == PROFILE_1 || profile == PROFILE_3) { + rb->bit_offset += 2; // subsampling x/y. + rb->bit_offset += 1; // unused. + } + } else { + if (profile == PROFILE_1 || profile == PROFILE_3) { + rb->bit_offset += 1; // unused + } else { + // RGB is only available in version 1. + return 0; + } + } + return 1; +} + +static vpx_codec_err_t decoder_peek_si_internal(const uint8_t *data, + unsigned int data_sz, + vpx_codec_stream_info_t *si, + int *is_intra_only, + vpx_decrypt_cb decrypt_cb, + void *decrypt_state) { + int intra_only_flag = 0; + uint8_t clear_buffer[9]; if (data + data_sz <= data) return VPX_CODEC_INVALID_PARAM; @@ -106,59 +137,71 @@ static vpx_codec_err_t decoder_peek_si(const uint8_t *data, si->is_kf = 0; si->w = si->h = 0; + if (decrypt_cb) { + data_sz = MIN(sizeof(clear_buffer), data_sz); + decrypt_cb(decrypt_state, data, clear_buffer, data_sz); + data = clear_buffer; + } + { + int show_frame; + int error_resilient; struct vp9_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL }; const int frame_marker = vp9_rb_read_literal(&rb, 2); - const int version = vp9_rb_read_bit(&rb); - (void) vp9_rb_read_bit(&rb); // unused version bit + const BITSTREAM_PROFILE profile = vp9_read_profile(&rb); if (frame_marker != VP9_FRAME_MARKER) return VPX_CODEC_UNSUP_BITSTREAM; - if (version > 1) return 
VPX_CODEC_UNSUP_BITSTREAM; + + if (profile >= MAX_PROFILES) return VPX_CODEC_UNSUP_BITSTREAM; if (vp9_rb_read_bit(&rb)) { // show an existing frame + vp9_rb_read_literal(&rb, 3); // Frame buffer to show. return VPX_CODEC_OK; } + if (data_sz <= 8) + return VPX_CODEC_UNSUP_BITSTREAM; + si->is_kf = !vp9_rb_read_bit(&rb); - if (si->is_kf) { - const int sRGB = 7; - int colorspace; + show_frame = vp9_rb_read_bit(&rb); + error_resilient = vp9_rb_read_bit(&rb); - rb.bit_offset += 1; // show frame - rb.bit_offset += 1; // error resilient + if (si->is_kf) { + if (!vp9_read_sync_code(&rb)) + return VPX_CODEC_UNSUP_BITSTREAM; - if (vp9_rb_read_literal(&rb, 8) != VP9_SYNC_CODE_0 || - vp9_rb_read_literal(&rb, 8) != VP9_SYNC_CODE_1 || - vp9_rb_read_literal(&rb, 8) != VP9_SYNC_CODE_2) { + if (!parse_bitdepth_colorspace_sampling(profile, &rb)) return VPX_CODEC_UNSUP_BITSTREAM; - } + vp9_read_frame_size(&rb, (int *)&si->w, (int *)&si->h); + } else { + intra_only_flag = show_frame ? 0 : vp9_rb_read_bit(&rb); - colorspace = vp9_rb_read_literal(&rb, 3); - if (colorspace != sRGB) { - rb.bit_offset += 1; // [16,235] (including xvycc) vs [0,255] range - if (version == 1) { - rb.bit_offset += 2; // subsampling x/y - rb.bit_offset += 1; // has extra plane - } - } else { - if (version == 1) { - rb.bit_offset += 1; // has extra plane - } else { - // RGB is only available in version 1 + rb.bit_offset += error_resilient ? 0 : 2; // reset_frame_context + + if (intra_only_flag) { + if (!vp9_read_sync_code(&rb)) return VPX_CODEC_UNSUP_BITSTREAM; + if (profile > PROFILE_0) { + if (!parse_bitdepth_colorspace_sampling(profile, &rb)) + return VPX_CODEC_UNSUP_BITSTREAM; } + rb.bit_offset += REF_FRAMES; // refresh_frame_flags + vp9_read_frame_size(&rb, (int *)&si->w, (int *)&si->h); } - - // TODO(jzern): these are available on non-keyframes in intra only mode. 
- si->w = vp9_rb_read_literal(&rb, 16) + 1; - si->h = vp9_rb_read_literal(&rb, 16) + 1; } } - + if (is_intra_only != NULL) + *is_intra_only = intra_only_flag; return VPX_CODEC_OK; } +static vpx_codec_err_t decoder_peek_si(const uint8_t *data, + unsigned int data_sz, + vpx_codec_stream_info_t *si) { + return decoder_peek_si_internal(data, data_sz, si, NULL, NULL, NULL); +} + static vpx_codec_err_t decoder_get_si(vpx_codec_alg_priv_t *ctx, vpx_codec_stream_info_t *si) { const size_t sz = (si->sz >= sizeof(vp9_stream_info_t)) @@ -208,37 +251,20 @@ static void set_default_ppflags(vp8_postproc_cfg_t *cfg) { static void set_ppflags(const vpx_codec_alg_priv_t *ctx, vp9_ppflags_t *flags) { flags->post_proc_flag = -#if CONFIG_POSTPROC_VISUALIZER - (ctx->dbg_color_ref_frame_flag ? VP9D_DEBUG_CLR_FRM_REF_BLKS : 0) | - (ctx->dbg_color_mb_modes_flag ? VP9D_DEBUG_CLR_BLK_MODES : 0) | - (ctx->dbg_color_b_modes_flag ? VP9D_DEBUG_CLR_BLK_MODES : 0) | - (ctx->dbg_display_mv_flag ? VP9D_DEBUG_DRAW_MV : 0) | -#endif ctx->postproc_cfg.post_proc_flag; flags->deblocking_level = ctx->postproc_cfg.deblocking_level; flags->noise_level = ctx->postproc_cfg.noise_level; -#if CONFIG_POSTPROC_VISUALIZER - flags->display_ref_frame_flag = ctx->dbg_color_ref_frame_flag; - flags->display_mb_modes_flag = ctx->dbg_color_mb_modes_flag; - flags->display_b_modes_flag = ctx->dbg_color_b_modes_flag; - flags->display_mv_flag = ctx->dbg_display_mv_flag; -#endif } static void init_decoder(vpx_codec_alg_priv_t *ctx) { - VP9D_CONFIG oxcf; - oxcf.width = ctx->si.w; - oxcf.height = ctx->si.h; - oxcf.version = 9; - oxcf.max_threads = ctx->cfg.threads; - oxcf.inv_tile_order = ctx->invert_tile_order; - - ctx->pbi = vp9_decoder_create(&oxcf); + ctx->pbi = vp9_decoder_create(); if (ctx->pbi == NULL) return; - vp9_initialize_dec(); + ctx->pbi->max_threads = ctx->cfg.threads; + ctx->pbi->inv_tile_order = ctx->invert_tile_order; + ctx->pbi->frame_parallel_decode = ctx->frame_parallel_decode; // If postprocessing was 
enabled by the application and a // configuration has not been provided, default it. @@ -252,41 +278,51 @@ static void init_decoder(vpx_codec_alg_priv_t *ctx) { static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, const uint8_t **data, unsigned int data_sz, void *user_priv, int64_t deadline) { - YV12_BUFFER_CONFIG sd = { 0 }; - int64_t time_stamp = 0, time_end_stamp = 0; - vp9_ppflags_t flags = {0}; + YV12_BUFFER_CONFIG sd; + vp9_ppflags_t flags = {0, 0, 0}; VP9_COMMON *cm = NULL; + (void)deadline; + + vp9_zero(sd); ctx->img_avail = 0; // Determine the stream parameters. Note that we rely on peek_si to // validate that we have a buffer that does not wrap around the top // of the heap. if (!ctx->si.h) { + int is_intra_only = 0; const vpx_codec_err_t res = - ctx->base.iface->dec.peek_si(*data, data_sz, &ctx->si); + decoder_peek_si_internal(*data, data_sz, &ctx->si, &is_intra_only, + ctx->decrypt_cb, ctx->decrypt_state); if (res != VPX_CODEC_OK) return res; + + if (!ctx->si.is_kf && !is_intra_only) + return VPX_CODEC_ERROR; } // Initialize the decoder instance on the first frame - if (!ctx->decoder_init) { + if (ctx->pbi == NULL) { init_decoder(ctx); if (ctx->pbi == NULL) return VPX_CODEC_ERROR; - - ctx->decoder_init = 1; } + // Set these even if already initialized. The caller may have changed the + // decrypt config between frames. 
+ ctx->pbi->decrypt_cb = ctx->decrypt_cb; + ctx->pbi->decrypt_state = ctx->decrypt_state; + cm = &ctx->pbi->common; - if (vp9_receive_compressed_data(ctx->pbi, data_sz, data, deadline)) + if (vp9_receive_compressed_data(ctx->pbi, data_sz, data)) return update_error_state(ctx, &cm->error); if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) set_ppflags(ctx, &flags); - if (vp9_get_raw_frame(ctx->pbi, &sd, &time_stamp, &time_end_stamp, &flags)) + if (vp9_get_raw_frame(ctx->pbi, &sd, &flags)) return update_error_state(ctx, &cm->error); yuvconfig2image(&ctx->img, &sd, user_priv); @@ -296,12 +332,32 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } -static void parse_superframe_index(const uint8_t *data, size_t data_sz, - uint32_t sizes[8], int *count) { +static INLINE uint8_t read_marker(vpx_decrypt_cb decrypt_cb, + void *decrypt_state, + const uint8_t *data) { + if (decrypt_cb) { + uint8_t marker; + decrypt_cb(decrypt_state, data, &marker, 1); + return marker; + } + return *data; +} + +static vpx_codec_err_t parse_superframe_index(const uint8_t *data, + size_t data_sz, + uint32_t sizes[8], int *count, + vpx_decrypt_cb decrypt_cb, + void *decrypt_state) { + // A chunk ending with a byte matching 0xc0 is an invalid chunk unless + // it is a super frame index. If the last byte of real video compression + // data is 0xc0 the encoder must add a 0 byte. If we have the marker but + // not the associated matching marker byte at the front of the index we have + // an invalid bitstream and need to return an error. 
+ uint8_t marker; assert(data_sz); - marker = data[data_sz - 1]; + marker = read_marker(decrypt_cb, decrypt_state, data + data_sz - 1); *count = 0; if ((marker & 0xe0) == 0xc0) { @@ -309,85 +365,149 @@ static void parse_superframe_index(const uint8_t *data, size_t data_sz, const uint32_t mag = ((marker >> 3) & 0x3) + 1; const size_t index_sz = 2 + mag * frames; - if (data_sz >= index_sz && data[data_sz - index_sz] == marker) { - // found a valid superframe index + // This chunk is marked as having a superframe index but doesn't have + // enough data for it, thus it's an invalid superframe index. + if (data_sz < index_sz) + return VPX_CODEC_CORRUPT_FRAME; + + { + const uint8_t marker2 = read_marker(decrypt_cb, decrypt_state, + data + data_sz - index_sz); + + // This chunk is marked as having a superframe index but doesn't have + // the matching marker byte at the front of the index therefore it's an + // invalid chunk. + if (marker != marker2) + return VPX_CODEC_CORRUPT_FRAME; + } + + { + // Found a valid superframe index. uint32_t i, j; const uint8_t *x = &data[data_sz - index_sz + 1]; - for (i = 0; i < frames; i++) { + // Frames has a maximum of 8 and mag has a maximum of 4. 
+ uint8_t clear_buffer[32]; + assert(sizeof(clear_buffer) >= frames * mag); + if (decrypt_cb) { + decrypt_cb(decrypt_state, x, clear_buffer, frames * mag); + x = clear_buffer; + } + + for (i = 0; i < frames; ++i) { uint32_t this_sz = 0; - for (j = 0; j < mag; j++) + for (j = 0; j < mag; ++j) this_sz |= (*x++) << (j * 8); sizes[i] = this_sz; } - *count = frames; } } + return VPX_CODEC_OK; } static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, const uint8_t *data, unsigned int data_sz, void *user_priv, long deadline) { const uint8_t *data_start = data; - const uint8_t *data_end = data + data_sz; - vpx_codec_err_t res = VPX_CODEC_OK; - uint32_t sizes[8]; - int frames_this_pts, frame_count = 0; + const uint8_t * const data_end = data + data_sz; + vpx_codec_err_t res; + uint32_t frame_sizes[8]; + int frame_count; - if (data == NULL || data_sz == 0) - return VPX_CODEC_INVALID_PARAM; + if (data == NULL && data_sz == 0) { + ctx->flushed = 1; + return VPX_CODEC_OK; + } - parse_superframe_index(data, data_sz, sizes, &frames_this_pts); - - do { - // Skip over the superframe index, if present - if (data_sz && (*data_start & 0xe0) == 0xc0) { - const uint8_t marker = *data_start; - const uint32_t frames = (marker & 0x7) + 1; - const uint32_t mag = ((marker >> 3) & 0x3) + 1; - const uint32_t index_sz = 2 + mag * frames; - - if (data_sz >= index_sz && data_start[index_sz - 1] == marker) { - data_start += index_sz; - data_sz -= index_sz; - if (data_start < data_end) - continue; - else - break; - } - } + // Reset flushed when receiving a valid frame. + ctx->flushed = 0; + + res = parse_superframe_index(data, data_sz, frame_sizes, &frame_count, + ctx->decrypt_cb, ctx->decrypt_state); + if (res != VPX_CODEC_OK) + return res; + + if (ctx->frame_parallel_decode) { + // Decode in frame parallel mode. 
When decoding in this mode, the frame + // passed to the decoder must be either a normal frame or a superframe with + // superframe index so the decoder could get each frame's start position + // in the superframe. + if (frame_count > 0) { + int i; + + for (i = 0; i < frame_count; ++i) { + const uint8_t *data_start_copy = data_start; + const uint32_t frame_size = frame_sizes[i]; + vpx_codec_err_t res; + if (data_start < data + || frame_size > (uint32_t) (data_end - data_start)) { + ctx->base.err_detail = "Invalid frame size in index"; + return VPX_CODEC_CORRUPT_FRAME; + } - // Use the correct size for this frame, if an index is present. - if (frames_this_pts) { - uint32_t this_sz = sizes[frame_count]; + res = decode_one(ctx, &data_start_copy, frame_size, user_priv, + deadline); + if (res != VPX_CODEC_OK) + return res; - if (data_sz < this_sz) { - ctx->base.err_detail = "Invalid frame size in index"; - return VPX_CODEC_CORRUPT_FRAME; + data_start += frame_size; + } + } else { + res = decode_one(ctx, &data_start, data_sz, user_priv, deadline); + if (res != VPX_CODEC_OK) + return res; + + // Extra data detected after the frame. + if (data_start < data_end - 1) { + ctx->base.err_detail = "Fail to decode frame in parallel mode"; + return VPX_CODEC_INCAPABLE; } - - data_sz = this_sz; - frame_count++; } + } else { + // Decode in serial mode. + if (frame_count > 0) { + int i; + + for (i = 0; i < frame_count; ++i) { + const uint8_t *data_start_copy = data_start; + const uint32_t frame_size = frame_sizes[i]; + vpx_codec_err_t res; + if (data_start < data + || frame_size > (uint32_t) (data_end - data_start)) { + ctx->base.err_detail = "Invalid frame size in index"; + return VPX_CODEC_CORRUPT_FRAME; + } - res = decode_one(ctx, &data_start, data_sz, user_priv, deadline); - assert(data_start >= data); - assert(data_start <= data_end); - - // Early exit if there was a decode error - if (res) - break; - - // Account for suboptimal termination by the encoder. 
- while (data_start < data_end && *data_start == 0) - data_start++; + res = decode_one(ctx, &data_start_copy, frame_size, user_priv, + deadline); + if (res != VPX_CODEC_OK) + return res; - data_sz = (unsigned int)(data_end - data_start); - } while (data_start < data_end); + data_start += frame_size; + } + } else { + while (data_start < data_end) { + const uint32_t frame_size = (uint32_t) (data_end - data_start); + const vpx_codec_err_t res = decode_one(ctx, &data_start, frame_size, + user_priv, deadline); + if (res != VPX_CODEC_OK) + return res; + + // Account for suboptimal termination by the encoder. + while (data_start < data_end) { + const uint8_t marker = read_marker(ctx->decrypt_cb, + ctx->decrypt_state, data_start); + if (marker) + break; + ++data_start; + } + } + } + } - return res; + return VPX_CODEC_OK; } static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx, @@ -426,7 +546,7 @@ static vpx_codec_err_t decoder_set_fb_fn( } static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, va_list args) { + va_list args) { vpx_ref_frame_t *const data = va_arg(args, vpx_ref_frame_t *); if (data) { @@ -442,7 +562,7 @@ static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, va_list args) { + va_list args) { vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); if (data) { @@ -459,13 +579,13 @@ static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, va_list args) { + va_list args) { vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *); if (data) { - YV12_BUFFER_CONFIG* fb; + YV12_BUFFER_CONFIG* fb = get_ref_frame(&ctx->pbi->common, data->idx); + if (fb == NULL) return VPX_CODEC_ERROR; - vp9_get_reference_dec(ctx->pbi, data->idx, &fb); yuvconfig2image(&data->img, fb, NULL); return VPX_CODEC_OK; } else { @@ -474,7 +594,7 @@ static 
vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t ctrl_set_postproc(vpx_codec_alg_priv_t *ctx, - int ctr_id, va_list args) { + va_list args) { #if CONFIG_VP9_POSTPROC vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *); @@ -486,32 +606,21 @@ static vpx_codec_err_t ctrl_set_postproc(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_INVALID_PARAM; } #else + (void)ctx; + (void)args; return VPX_CODEC_INCAPABLE; #endif } static vpx_codec_err_t ctrl_set_dbg_options(vpx_codec_alg_priv_t *ctx, - int ctrl_id, va_list args) { -#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC - int data = va_arg(args, int); - -#define MAP(id, var) case id: var = data; break; - - switch (ctrl_id) { - MAP(VP8_SET_DBG_COLOR_REF_FRAME, ctx->dbg_color_ref_frame_flag); - MAP(VP8_SET_DBG_COLOR_MB_MODES, ctx->dbg_color_mb_modes_flag); - MAP(VP8_SET_DBG_COLOR_B_MODES, ctx->dbg_color_b_modes_flag); - MAP(VP8_SET_DBG_DISPLAY_MV, ctx->dbg_display_mv_flag); - } - - return VPX_CODEC_OK; -#else + va_list args) { + (void)ctx; + (void)args; return VPX_CODEC_INCAPABLE; -#endif } static vpx_codec_err_t ctrl_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, - int ctrl_id, va_list args) { + va_list args) { int *const update_info = va_arg(args, int *); if (update_info) { @@ -527,14 +636,13 @@ static vpx_codec_err_t ctrl_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t ctrl_get_frame_corrupted(vpx_codec_alg_priv_t *ctx, - int ctrl_id, va_list args) { + va_list args) { int *corrupted = va_arg(args, int *); - if (corrupted) { - if (ctx->pbi) - *corrupted = ctx->pbi->common.frame_to_show->corrupted; - else - return VPX_CODEC_ERROR; + if (corrupted != NULL && ctx->pbi != NULL) { + const YV12_BUFFER_CONFIG *const frame = ctx->pbi->common.frame_to_show; + if (frame == NULL) return VPX_CODEC_ERROR; + *corrupted = frame->corrupted; return VPX_CODEC_OK; } else { return VPX_CODEC_INVALID_PARAM; @@ -542,7 +650,7 @@ static vpx_codec_err_t 
ctrl_get_frame_corrupted(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t ctrl_get_display_size(vpx_codec_alg_priv_t *ctx, - int ctrl_id, va_list args) { + va_list args) { int *const display_size = va_arg(args, int *); if (display_size) { @@ -560,11 +668,19 @@ static vpx_codec_err_t ctrl_get_display_size(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t ctrl_set_invert_tile_order(vpx_codec_alg_priv_t *ctx, - int ctr_id, va_list args) { + va_list args) { ctx->invert_tile_order = va_arg(args, int); return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_set_decryptor(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_decrypt_init *init = va_arg(args, vpx_decrypt_init *); + ctx->decrypt_cb = init ? init->decrypt_cb : NULL; + ctx->decrypt_state = init ? init->decrypt_state : NULL; + return VPX_CODEC_OK; +} + static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { {VP8_COPY_REFERENCE, ctrl_copy_reference}, @@ -576,6 +692,7 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { {VP8_SET_DBG_COLOR_B_MODES, ctrl_set_dbg_options}, {VP8_SET_DBG_DISPLAY_MV, ctrl_set_dbg_options}, {VP9_INVERT_TILE_DECODE_ORDER, ctrl_set_invert_tile_order}, + {VPXD_SET_DECRYPTOR, ctrl_set_decryptor}, // Getters {VP8D_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates}, @@ -597,8 +714,6 @@ CODEC_INTERFACE(vpx_codec_vp9_dx) = { decoder_init, // vpx_codec_init_fn_t decoder_destroy, // vpx_codec_destroy_fn_t decoder_ctrl_maps, // vpx_codec_ctrl_fn_map_t - NOT_IMPLEMENTED, // vpx_codec_get_mmap_fn_t - NOT_IMPLEMENTED, // vpx_codec_set_mmap_fn_t { // NOLINT decoder_peek_si, // vpx_codec_peek_si_fn_t decoder_get_si, // vpx_codec_get_si_fn_t @@ -607,11 +722,13 @@ CODEC_INTERFACE(vpx_codec_vp9_dx) = { decoder_set_fb_fn, // vpx_codec_set_fb_fn_t }, { // NOLINT - NOT_IMPLEMENTED, - NOT_IMPLEMENTED, - NOT_IMPLEMENTED, - NOT_IMPLEMENTED, - NOT_IMPLEMENTED, - NOT_IMPLEMENTED + 0, + NOT_IMPLEMENTED, // vpx_codec_enc_cfg_map_t + NOT_IMPLEMENTED, // vpx_codec_encode_fn_t + NOT_IMPLEMENTED, // 
vpx_codec_get_cx_data_fn_t + NOT_IMPLEMENTED, // vpx_codec_enc_config_set_fn_t + NOT_IMPLEMENTED, // vpx_codec_get_global_headers_fn_t + NOT_IMPLEMENTED, // vpx_codec_get_preview_frame_fn_t + NOT_IMPLEMENTED // vpx_codec_enc_mr_get_mem_loc_fn_t } }; diff --git a/libvpx/vp9/vp9_iface_common.h b/libvpx/vp9/vp9_iface_common.h index 58256b22b..fc98b62c5 100644 --- a/libvpx/vp9/vp9_iface_common.h +++ b/libvpx/vp9/vp9_iface_common.h @@ -16,9 +16,11 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12, * the Y, U, and V planes, nor other alignment adjustments that * might be representable by a YV12_BUFFER_CONFIG, so we just * initialize all the fields.*/ - int bps = 12; - if (yv12->uv_height == yv12->y_height) { - if (yv12->uv_width == yv12->y_width) { + const int ss_x = yv12->uv_crop_width < yv12->y_crop_width; + const int ss_y = yv12->uv_crop_height < yv12->y_crop_height; + int bps; + if (!ss_y) { + if (!ss_x) { img->fmt = VPX_IMG_FMT_I444; bps = 24; } else { @@ -27,21 +29,23 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12, } } else { img->fmt = VPX_IMG_FMT_I420; + bps = 12; } + img->bit_depth = 8; img->w = yv12->y_stride; img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9_ENC_BORDER_IN_PIXELS, 3); img->d_w = yv12->y_crop_width; img->d_h = yv12->y_crop_height; - img->x_chroma_shift = yv12->uv_width < yv12->y_width; - img->y_chroma_shift = yv12->uv_height < yv12->y_height; + img->x_chroma_shift = ss_x; + img->y_chroma_shift = ss_y; img->planes[VPX_PLANE_Y] = yv12->y_buffer; img->planes[VPX_PLANE_U] = yv12->u_buffer; img->planes[VPX_PLANE_V] = yv12->v_buffer; - img->planes[VPX_PLANE_ALPHA] = yv12->alpha_buffer; + img->planes[VPX_PLANE_ALPHA] = NULL; img->stride[VPX_PLANE_Y] = yv12->y_stride; img->stride[VPX_PLANE_U] = yv12->uv_stride; img->stride[VPX_PLANE_V] = yv12->uv_stride; - img->stride[VPX_PLANE_ALPHA] = yv12->alpha_stride; + img->stride[VPX_PLANE_ALPHA] = yv12->y_stride; img->bps = bps; img->user_priv = 
user_priv; img->img_data = yv12->buffer_alloc; @@ -54,7 +58,6 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, yv12->y_buffer = img->planes[VPX_PLANE_Y]; yv12->u_buffer = img->planes[VPX_PLANE_U]; yv12->v_buffer = img->planes[VPX_PLANE_V]; - yv12->alpha_buffer = img->planes[VPX_PLANE_ALPHA]; yv12->y_crop_width = img->d_w; yv12->y_crop_height = img->d_h; @@ -66,21 +69,10 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, yv12->uv_height = img->y_chroma_shift == 1 ? (1 + yv12->y_height) / 2 : yv12->y_height; - yv12->alpha_width = yv12->alpha_buffer ? img->d_w : 0; - yv12->alpha_height = yv12->alpha_buffer ? img->d_h : 0; - yv12->y_stride = img->stride[VPX_PLANE_Y]; yv12->uv_stride = img->stride[VPX_PLANE_U]; - yv12->alpha_stride = yv12->alpha_buffer ? img->stride[VPX_PLANE_ALPHA] : 0; yv12->border = (img->stride[VPX_PLANE_Y] - img->w) / 2; -#if CONFIG_ALPHA - // For development purposes, force alpha to hold the same data as Y for now. - yv12->alpha_buffer = yv12->y_buffer; - yv12->alpha_width = yv12->y_width; - yv12->alpha_height = yv12->y_height; - yv12->alpha_stride = yv12->y_stride; -#endif return VPX_CODEC_OK; } diff --git a/libvpx/vp9/vp9cx.mk b/libvpx/vp9/vp9cx.mk index da6c0f8b6..dc46c4e35 100644 --- a/libvpx/vp9/vp9cx.mk +++ b/libvpx/vp9/vp9cx.mk @@ -18,9 +18,13 @@ VP9_CX_SRCS_REMOVE-no += $(VP9_COMMON_SRCS_REMOVE-no) VP9_CX_SRCS-yes += vp9_cx_iface.c VP9_CX_SRCS-yes += encoder/vp9_bitstream.c +VP9_CX_SRCS-yes += encoder/vp9_context_tree.c +VP9_CX_SRCS-yes += encoder/vp9_context_tree.h VP9_CX_SRCS-yes += encoder/vp9_cost.h VP9_CX_SRCS-yes += encoder/vp9_cost.c VP9_CX_SRCS-yes += encoder/vp9_dct.c +VP9_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/vp9_denoiser.c +VP9_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/vp9_denoiser.h VP9_CX_SRCS-yes += encoder/vp9_encodeframe.c VP9_CX_SRCS-yes += encoder/vp9_encodeframe.h VP9_CX_SRCS-yes += encoder/vp9_encodemb.c @@ -40,9 +44,10 @@ VP9_CX_SRCS-yes += encoder/vp9_firstpass.h 
VP9_CX_SRCS-yes += encoder/vp9_lookahead.c VP9_CX_SRCS-yes += encoder/vp9_lookahead.h VP9_CX_SRCS-yes += encoder/vp9_mcomp.h -VP9_CX_SRCS-yes += encoder/vp9_onyx_int.h +VP9_CX_SRCS-yes += encoder/vp9_encoder.h VP9_CX_SRCS-yes += encoder/vp9_quantize.h VP9_CX_SRCS-yes += encoder/vp9_ratectrl.h +VP9_CX_SRCS-yes += encoder/vp9_rd.h VP9_CX_SRCS-yes += encoder/vp9_rdopt.h VP9_CX_SRCS-yes += encoder/vp9_pickmode.h VP9_CX_SRCS-yes += encoder/vp9_svc_layercontext.h @@ -50,11 +55,12 @@ VP9_CX_SRCS-yes += encoder/vp9_tokenize.h VP9_CX_SRCS-yes += encoder/vp9_treewriter.h VP9_CX_SRCS-yes += encoder/vp9_variance.h VP9_CX_SRCS-yes += encoder/vp9_mcomp.c -VP9_CX_SRCS-yes += encoder/vp9_onyx_if.c +VP9_CX_SRCS-yes += encoder/vp9_encoder.c VP9_CX_SRCS-yes += encoder/vp9_picklpf.c VP9_CX_SRCS-yes += encoder/vp9_picklpf.h VP9_CX_SRCS-yes += encoder/vp9_quantize.c VP9_CX_SRCS-yes += encoder/vp9_ratectrl.c +VP9_CX_SRCS-yes += encoder/vp9_rd.c VP9_CX_SRCS-yes += encoder/vp9_rdopt.c VP9_CX_SRCS-yes += encoder/vp9_pickmode.c VP9_CX_SRCS-yes += encoder/vp9_sad.c @@ -87,8 +93,6 @@ VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h - -VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_mcomp_x86.h VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_mmx.c VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_impl_mmx.asm VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_sad_mmx.asm @@ -96,31 +100,39 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad4d_intrin_avx2.c -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm VP9_CX_SRCS-$(HAVE_SSE3) += 
encoder/x86/vp9_sad_sse3.asm ifeq ($(CONFIG_USE_X86INC),yes) +VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c -VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm endif ifeq ($(ARCH_X86_64),yes) -VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.asm +VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm +VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3_x86_64.asm endif VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm -VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm +VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt_x86_64.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct32x32_sse2.c -VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct32x32_avx2.c +VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c +VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c +VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c + +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_subtract_neon.c VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes)) diff --git a/libvpx/vp9/vp9dx.mk b/libvpx/vp9/vp9dx.mk index 92ec6fd16..1fcb36f66 100644 --- a/libvpx/vp9/vp9dx.mk +++ b/libvpx/vp9/vp9dx.mk @@ -31,8 +31,6 @@ VP9_DX_SRCS-yes += decoder/vp9_decodemv.h VP9_DX_SRCS-yes += 
decoder/vp9_detokenize.h VP9_DX_SRCS-yes += decoder/vp9_decoder.c VP9_DX_SRCS-yes += decoder/vp9_decoder.h -VP9_DX_SRCS-yes += decoder/vp9_thread.c -VP9_DX_SRCS-yes += decoder/vp9_thread.h VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c VP9_DX_SRCS-yes += decoder/vp9_dsubexp.h |