diff options
Diffstat (limited to 'libvpx/vp9/common')
-rw-r--r-- | libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c | 9 | ||||
-rw-r--r-- | libvpx/vp9/common/vp9_alloccommon.c | 34 | ||||
-rw-r--r-- | libvpx/vp9/common/vp9_entropymode.c | 4 | ||||
-rw-r--r-- | libvpx/vp9/common/vp9_loopfilter.c | 8 | ||||
-rw-r--r-- | libvpx/vp9/common/vp9_loopfilter.h | 4 | ||||
-rw-r--r-- | libvpx/vp9/common/vp9_onyxc_int.h | 40 | ||||
-rw-r--r-- | libvpx/vp9/common/vp9_postproc.c | 6 | ||||
-rw-r--r-- | libvpx/vp9/common/vp9_reconinter.h | 10 | ||||
-rw-r--r-- | libvpx/vp9/common/vp9_rtcd_defs.pl | 147 | ||||
-rw-r--r-- | libvpx/vp9/common/vp9_thread_common.c | 7 | ||||
-rw-r--r-- | libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c | 114 | ||||
-rw-r--r-- | libvpx/vp9/common/x86/vp9_mfqe_sse2.asm | 2 |
12 files changed, 195 insertions, 190 deletions
diff --git a/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c index dd1ea03b6..025254c3f 100644 --- a/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c +++ b/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c @@ -14,14 +14,7 @@ #include "./vp9_rtcd.h" #include "./vpx_config.h" #include "vp9/common/vp9_common.h" - -static int16_t sinpi_1_9 = 0x14a3; -static int16_t sinpi_2_9 = 0x26c9; -static int16_t sinpi_3_9 = 0x3441; -static int16_t sinpi_4_9 = 0x3b6c; -static int16_t cospi_8_64 = 0x3b21; -static int16_t cospi_16_64 = 0x2d41; -static int16_t cospi_24_64 = 0x187e; +#include "vpx_dsp/txfm_common.h" static INLINE void TRANSPOSE4X4(int16x8_t *q8s16, int16x8_t *q9s16) { int32x4_t q8s32, q9s32; diff --git a/libvpx/vp9/common/vp9_alloccommon.c b/libvpx/vp9/common/vp9_alloccommon.c index 66aa733b9..7345e259b 100644 --- a/libvpx/vp9/common/vp9_alloccommon.c +++ b/libvpx/vp9/common/vp9_alloccommon.c @@ -17,24 +17,6 @@ #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_onyxc_int.h" -// TODO(hkuang): Don't need to lock the whole pool after implementing atomic -// frame reference count. -void lock_buffer_pool(BufferPool *const pool) { -#if CONFIG_MULTITHREAD - pthread_mutex_lock(&pool->pool_mutex); -#else - (void)pool; -#endif -} - -void unlock_buffer_pool(BufferPool *const pool) { -#if CONFIG_MULTITHREAD - pthread_mutex_unlock(&pool->pool_mutex); -#else - (void)pool; -#endif -} - void vp9_set_mb_mi(VP9_COMMON *cm, int width, int height) { const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2); const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2); @@ -62,8 +44,7 @@ static int alloc_seg_map(VP9_COMMON *cm, int seg_map_size) { cm->prev_seg_map_idx = 1; cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx]; - if (!cm->frame_parallel_decode) - cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx]; + cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx]; return 0; } @@ -77,20 +58,18 @@ static void free_seg_map(VP9_COMMON *cm) { } cm->current_frame_seg_map = NULL; - - if (!cm->frame_parallel_decode) { - cm->last_frame_seg_map = NULL; - } + cm->last_frame_seg_map = NULL; } void vp9_free_ref_frame_buffers(BufferPool *pool) { int i; for (i = 0; i < FRAME_BUFFERS; ++i) { - if (pool->frame_bufs[i].ref_count > 0 && + if (!pool->frame_bufs[i].released && pool->frame_bufs[i].raw_frame_buffer.data != NULL) { pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer); pool->frame_bufs[i].ref_count = 0; + pool->frame_bufs[i].released = 1; } vpx_free(pool->frame_bufs[i].mvs); pool->frame_bufs[i].mvs = NULL; @@ -176,6 +155,9 @@ fail: } void vp9_remove_common(VP9_COMMON *cm) { +#if CONFIG_VP9_POSTPROC + vp9_free_postproc_buffers(cm); +#endif vp9_free_context_buffers(cm); vpx_free(cm->fc); @@ -186,7 +168,7 @@ void vp9_remove_common(VP9_COMMON *cm) { void vp9_init_context_buffers(VP9_COMMON *cm) { cm->setup_mi(cm); - if (cm->last_frame_seg_map && !cm->frame_parallel_decode) + if (cm->last_frame_seg_map) memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols); } diff --git a/libvpx/vp9/common/vp9_entropymode.c b/libvpx/vp9/common/vp9_entropymode.c index bcb9e8f29..47cd63e94 100644 --- a/libvpx/vp9/common/vp9_entropymode.c +++ b/libvpx/vp9/common/vp9_entropymode.c @@ -428,7 +428,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm) { vp9_clearall_segfeatures(&cm->seg); cm->seg.abs_delta = SEGMENT_DELTADATA; - if (cm->last_frame_seg_map && !cm->frame_parallel_decode) + if (cm->last_frame_seg_map) memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols)); if (cm->current_frame_seg_map) @@ -457,7 +457,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm) { } // prev_mip will only be allocated in encoder. - if (frame_is_intra_only(cm) && cm->prev_mip && !cm->frame_parallel_decode) + if (frame_is_intra_only(cm) && cm->prev_mip) memset(cm->prev_mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->prev_mip)); diff --git a/libvpx/vp9/common/vp9_loopfilter.c b/libvpx/vp9/common/vp9_loopfilter.c index ef0297dd5..c7c343aed 100644 --- a/libvpx/vp9/common/vp9_loopfilter.c +++ b/libvpx/vp9/common/vp9_loopfilter.c @@ -1612,12 +1612,14 @@ void vp9_loop_filter_data_reset( void vp9_reset_lfm(VP9_COMMON *const cm) { if (cm->lf.filter_level) { - memset(cm->lf.lfm, 0, ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) * - cm->lf.lfm_stride * sizeof(*cm->lf.lfm)); + memset(cm->lf.lfm, 0, + ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) * cm->lf.lfm_stride * + sizeof(*cm->lf.lfm)); } } -int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused) { +int vp9_loop_filter_worker(void *arg1, void *unused) { + LFWorkerData *const lf_data = (LFWorkerData *)arg1; (void)unused; loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->start, lf_data->stop, lf_data->y_only); diff --git a/libvpx/vp9/common/vp9_loopfilter.h b/libvpx/vp9/common/vp9_loopfilter.h index da37a6ebd..481a6cdc6 100644 --- a/libvpx/vp9/common/vp9_loopfilter.h +++ b/libvpx/vp9/common/vp9_loopfilter.h @@ -151,8 +151,8 @@ void vp9_loop_filter_data_reset( LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer, struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]); -// Operates on the rows described by 'lf_data'. -int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused); +// Operates on the rows described by 'arg1' (cast to LFWorkerData *). +int vp9_loop_filter_worker(void *arg1, void *unused); #ifdef __cplusplus } // extern "C" #endif diff --git a/libvpx/vp9/common/vp9_onyxc_int.h b/libvpx/vp9/common/vp9_onyxc_int.h index 32db7b7aa..1d96d92c2 100644 --- a/libvpx/vp9/common/vp9_onyxc_int.h +++ b/libvpx/vp9/common/vp9_onyxc_int.h @@ -37,13 +37,10 @@ extern "C" { #define REF_FRAMES_LOG2 3 #define REF_FRAMES (1 << REF_FRAMES_LOG2) -// 4 scratch frames for the new frames to support a maximum of 4 cores decoding -// in parallel, 3 for scaled references on the encoder. -// TODO(hkuang): Add ondemand frame buffers instead of hardcoding the number -// of framebuffers. +// 1 scratch frame for the new frame, 3 for scaled references on the encoder. // TODO(jkoleszar): These 3 extra references could probably come from the // normal reference pool. -#define FRAME_BUFFERS (REF_FRAMES + 7) +#define FRAME_BUFFERS (REF_FRAMES + 4) #define FRAME_CONTEXTS_LOG2 2 #define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2) @@ -72,30 +69,12 @@ typedef struct { MV_REF *mvs; int mi_rows; int mi_cols; + uint8_t released; vpx_codec_frame_buffer_t raw_frame_buffer; YV12_BUFFER_CONFIG buf; - - // The Following variables will only be used in frame parallel decode. - - // frame_worker_owner indicates which FrameWorker owns this buffer. NULL means - // that no FrameWorker owns, or is decoding, this buffer. - VPxWorker *frame_worker_owner; - - // row and col indicate which position frame has been decoded to in real - // pixel unit. They are reset to -1 when decoding begins and set to INT_MAX - // when the frame is fully decoded. - int row; - int col; } RefCntBuffer; typedef struct BufferPool { -// Protect BufferPool from being accessed by several FrameWorkers at -// the same time during frame parallel decode. -// TODO(hkuang): Try to use atomic variable instead of locking the whole pool. -#if CONFIG_MULTITHREAD - pthread_mutex_t pool_mutex; -#endif - // Private data associated with the frame buffer callbacks. void *cb_priv; @@ -235,10 +214,6 @@ typedef struct VP9Common { struct loopfilter lf; struct segmentation seg; - // TODO(hkuang): Remove this as it is the same as frame_parallel_decode - // in pbi. - int frame_parallel_decode; // frame-based threading. - // Context probabilities for reference frame prediction MV_REFERENCE_FRAME comp_fixed_ref; MV_REFERENCE_FRAME comp_var_ref[2]; @@ -283,11 +258,6 @@ typedef struct VP9Common { int above_context_alloc_cols; } VP9_COMMON; -// TODO(hkuang): Don't need to lock the whole pool after implementing atomic -// frame reference count. -void lock_buffer_pool(BufferPool *const pool); -void unlock_buffer_pool(BufferPool *const pool); - static INLINE YV12_BUFFER_CONFIG *get_ref_frame(VP9_COMMON *cm, int index) { if (index < 0 || index >= REF_FRAMES) return NULL; if (cm->ref_frame_map[index] < 0) return NULL; @@ -303,7 +273,6 @@ static INLINE int get_free_fb(VP9_COMMON *cm) { RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; int i; - lock_buffer_pool(cm->buffer_pool); for (i = 0; i < FRAME_BUFFERS; ++i) if (frame_bufs[i].ref_count == 0) break; @@ -314,7 +283,6 @@ static INLINE int get_free_fb(VP9_COMMON *cm) { i = INVALID_IDX; } - unlock_buffer_pool(cm->buffer_pool); return i; } @@ -342,7 +310,7 @@ static INLINE void set_partition_probs(const VP9_COMMON *const cm, xd->partition_probs = frame_is_intra_only(cm) ? &vp9_kf_partition_probs[0] - : (const vpx_prob(*)[PARTITION_TYPES - 1])cm->fc->partition_prob; + : (const vpx_prob(*)[PARTITION_TYPES - 1]) cm->fc->partition_prob; } static INLINE void vp9_init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd, diff --git a/libvpx/vp9/common/vp9_postproc.c b/libvpx/vp9/common/vp9_postproc.c index b105e5d45..dfc315eea 100644 --- a/libvpx/vp9/common/vp9_postproc.c +++ b/libvpx/vp9/common/vp9_postproc.c @@ -380,7 +380,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, // if mfqe is enabled. Need to take both the quality and the speed // into consideration. if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) { - vp8_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int); + vpx_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int); } if ((flags & VP9D_DEMACROBLOCK) && cm->post_proc_buffer_int.buffer_alloc) { deblock_and_de_macro_block(&cm->post_proc_buffer_int, ppbuf, @@ -390,7 +390,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, vp9_deblock(&cm->post_proc_buffer_int, ppbuf, q, cm->postproc_state.limits); } else { - vp8_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf); + vpx_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf); } } else if (flags & VP9D_DEMACROBLOCK) { deblock_and_de_macro_block(cm->frame_to_show, ppbuf, @@ -399,7 +399,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, } else if (flags & VP9D_DEBLOCK) { vp9_deblock(cm->frame_to_show, ppbuf, q, cm->postproc_state.limits); } else { - vp8_yv12_copy_frame(cm->frame_to_show, ppbuf); + vpx_yv12_copy_frame(cm->frame_to_show, ppbuf); } ppstate->last_base_qindex = cm->base_qindex; diff --git a/libvpx/vp9/common/vp9_reconinter.h b/libvpx/vp9/common/vp9_reconinter.h index 1b09b380d..bb9291a26 100644 --- a/libvpx/vp9/common/vp9_reconinter.h +++ b/libvpx/vp9/common/vp9_reconinter.h @@ -26,9 +26,9 @@ static INLINE void inter_predictor(const uint8_t *src, int src_stride, const struct scale_factors *sf, int w, int h, int ref, const InterpKernel *kernel, int xs, int ys) { - sf->predict[subpel_x != 0][subpel_y != 0][ref]( - src, src_stride, dst, dst_stride, kernel[subpel_x], xs, kernel[subpel_y], - ys, w, h); + sf->predict[subpel_x != 0][subpel_y != 0][ref](src, src_stride, dst, + dst_stride, kernel, subpel_x, + xs, subpel_y, ys, w, h); } #if CONFIG_VP9_HIGHBITDEPTH @@ -37,8 +37,8 @@ static INLINE void highbd_inter_predictor( const int subpel_x, const int subpel_y, const struct scale_factors *sf, int w, int h, int ref, const InterpKernel *kernel, int xs, int ys, int bd) { sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref]( - src, src_stride, dst, dst_stride, kernel[subpel_x], xs, kernel[subpel_y], - ys, w, h, bd); + src, src_stride, dst, dst_stride, kernel, subpel_x, xs, subpel_y, ys, w, + h, bd); } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libvpx/vp9/common/vp9_rtcd_defs.pl b/libvpx/vp9/common/vp9_rtcd_defs.pl index baf63e97f..22b67ecac 100644 --- a/libvpx/vp9/common/vp9_rtcd_defs.pl +++ b/libvpx/vp9/common/vp9_rtcd_defs.pl @@ -1,3 +1,13 @@ +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + sub vp9_common_forward_decls() { print <<EOF /* @@ -30,6 +40,7 @@ if ($opts{arch} eq "x86_64") { $ssse3_x86_64 = 'ssse3'; $avx_x86_64 = 'avx'; $avx2_x86_64 = 'avx2'; + $avx512_x86_64 = 'avx512'; } # @@ -46,41 +57,24 @@ specialize qw/vp9_filter_by_weight8x8 sse2 msa/; # # dct # -if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - # Force C versions if CONFIG_EMULATE_HARDWARE is 1 - if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") { - add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; - - add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; - - add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; - } else { - add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; - specialize qw/vp9_iht4x4_16_add sse2/; - - add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; - specialize qw/vp9_iht8x8_64_add sse2/; - - add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; - specialize qw/vp9_iht16x16_256_add sse2/; - } -} else { - # Force C versions if CONFIG_EMULATE_HARDWARE is 1 - if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") { - add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; - - add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; - - add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; - } else { - add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; - specialize qw/vp9_iht4x4_16_add sse2 neon dspr2 msa/; - - add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; - specialize qw/vp9_iht8x8_64_add sse2 neon dspr2 msa/; - - add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; - specialize qw/vp9_iht16x16_256_add sse2 dspr2 msa/; +# Force C versions if CONFIG_EMULATE_HARDWARE is 1 +add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; + +add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; + +add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; + +if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { + # Note that there are more specializations appended when + # CONFIG_VP9_HIGHBITDEPTH is off. + specialize qw/vp9_iht4x4_16_add sse2/; + specialize qw/vp9_iht8x8_64_add sse2/; + specialize qw/vp9_iht16x16_256_add sse2/; + if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { + # Note that these specializations are appended to the above ones. + specialize qw/vp9_iht4x4_16_add neon dspr2 msa/; + specialize qw/vp9_iht8x8_64_add neon dspr2 msa/; + specialize qw/vp9_iht16x16_256_add dspr2 msa/; } } @@ -124,82 +118,69 @@ if (vpx_config("CONFIG_VP9_TEMPORAL_DENOISING") eq "yes") { specialize qw/vp9_denoiser_filter neon sse2/; } -if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz"; - specialize qw/vp9_block_error avx2 sse2/; +add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz"; - add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd"; - specialize qw/vp9_highbd_block_error sse2/; +add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size"; - add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size"; - specialize qw/vp9_block_error_fp sse2/; +add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; +specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64"; - add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64"; +add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; +specialize qw/vp9_quantize_fp_32x32 neon/, "$ssse3_x86_64"; - add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64"; +add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + +if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { + specialize qw/vp9_block_error avx2 sse2/; + + specialize qw/vp9_block_error_fp avx2 sse2/; - add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_fdct8x8_quant neon ssse3/; + + add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd"; + specialize qw/vp9_highbd_block_error sse2/; } else { - add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz"; specialize qw/vp9_block_error avx2 msa sse2/; - add_proto qw/int64_t vp9_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size"; - specialize qw/vp9_block_error_fp neon sse2/; - - add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64"; + specialize qw/vp9_block_error_fp neon avx2 sse2/; - add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64"; - - add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_fdct8x8_quant sse2 ssse3 neon/; } # fdct functions -if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/vp9_fht4x4 sse2/; - - add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/vp9_fht8x8 sse2/; +add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/vp9_fht16x16 sse2/; +add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fwht4x4 sse2/; -} else { - add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/vp9_fht4x4 sse2 msa/; +add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/vp9_fht8x8 sse2 msa/; +add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; - add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/vp9_fht16x16 sse2 msa/; - - add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fwht4x4 msa sse2/; +# Note that there are more specializations appended when CONFIG_VP9_HIGHBITDEPTH +# is off. +specialize qw/vp9_fht4x4 sse2/; +specialize qw/vp9_fht8x8 sse2/; +specialize qw/vp9_fht16x16 sse2/; +specialize qw/vp9_fwht4x4 sse2/; +if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { + # Note that these specializations are appended to the above ones. + specialize qw/vp9_fht4x4 msa/; + specialize qw/vp9_fht8x8 msa/; + specialize qw/vp9_fht16x16 msa/; + specialize qw/vp9_fwht4x4 msa/; } # # Motion search # -add_proto qw/int vp9_full_search_sad/, "const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv"; -specialize qw/vp9_full_search_sad sse3 sse4_1/; -$vp9_full_search_sad_sse3=vp9_full_search_sadx3; -$vp9_full_search_sad_sse4_1=vp9_full_search_sadx8; - add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv"; specialize qw/vp9_diamond_search_sad avx/; +if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") { add_proto qw/void vp9_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count"; specialize qw/vp9_temporal_filter_apply sse4_1/; +} if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { @@ -227,7 +208,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # frame based scale # add_proto qw/void vp9_scale_and_extend_frame/, "const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler"; -specialize qw/vp9_scale_and_extend_frame ssse3/; +specialize qw/vp9_scale_and_extend_frame neon ssse3/; } # end encoder functions diff --git a/libvpx/vp9/common/vp9_thread_common.c b/libvpx/vp9/common/vp9_thread_common.c index 07e659d23..8d44e91f2 100644 --- a/libvpx/vp9/common/vp9_thread_common.c +++ b/libvpx/vp9/common/vp9_thread_common.c @@ -140,8 +140,9 @@ static INLINE void thread_loop_filter_rows( } // Row-based multi-threaded loopfilter hook -static int loop_filter_row_worker(VP9LfSync *const lf_sync, - LFWorkerData *const lf_data) { +static int loop_filter_row_worker(void *arg1, void *arg2) { + VP9LfSync *const lf_sync = (VP9LfSync *)arg1; + LFWorkerData *const lf_data = (LFWorkerData *)arg2; thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->start, lf_data->stop, lf_data->y_only, lf_sync); @@ -183,7 +184,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm, VPxWorker *const worker = &workers[i]; LFWorkerData *const lf_data = &lf_sync->lfdata[i]; - worker->hook = (VPxWorkerHook)loop_filter_row_worker; + worker->hook = loop_filter_row_worker; worker->data1 = lf_sync; worker->data2 = lf_data; diff --git a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c index bb2dcf52b..6996260e2 100644 --- a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -18,8 +18,8 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, __m128i in[2]; const __m128i eight = _mm_set1_epi16(8); - in[0] = load_input_data(input); - in[1] = load_input_data(input + 8); + in[0] = load_input_data8(input); + in[1] = load_input_data8(input + 8); switch (tx_type) { case 0: // DCT_DCT @@ -54,18 +54,17 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { __m128i in[8]; - const __m128i zero = _mm_setzero_si128(); const __m128i final_rounding = _mm_set1_epi16(1 << 4); // load input data - in[0] = load_input_data(input); - in[1] = load_input_data(input + 8 * 1); - in[2] = load_input_data(input + 8 * 2); - in[3] = load_input_data(input + 8 * 3); - in[4] = load_input_data(input + 8 * 4); - in[5] = load_input_data(input + 8 * 5); - in[6] = load_input_data(input + 8 * 6); - in[7] = load_input_data(input + 8 * 7); + in[0] = load_input_data8(input); + in[1] = load_input_data8(input + 8 * 1); + in[2] = load_input_data8(input + 8 * 2); + in[3] = load_input_data8(input + 8 * 3); + in[4] = load_input_data8(input + 8 * 4); + in[5] = load_input_data8(input + 8 * 5); + in[6] = load_input_data8(input + 8 * 6); + in[7] = load_input_data8(input + 8 * 7); switch (tx_type) { case 0: // DCT_DCT @@ -106,14 +105,91 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, in[6] = _mm_srai_epi16(in[6], 5); in[7] = _mm_srai_epi16(in[7], 5); - RECON_AND_STORE(dest + 0 * stride, in[0]); - RECON_AND_STORE(dest + 1 * stride, in[1]); - RECON_AND_STORE(dest + 2 * stride, in[2]); - RECON_AND_STORE(dest + 3 * stride, in[3]); - RECON_AND_STORE(dest + 4 * stride, in[4]); - RECON_AND_STORE(dest + 5 * stride, in[5]); - RECON_AND_STORE(dest + 6 * stride, in[6]); - RECON_AND_STORE(dest + 7 * stride, in[7]); + recon_and_store(dest + 0 * stride, in[0]); + recon_and_store(dest + 1 * stride, in[1]); + recon_and_store(dest + 2 * stride, in[2]); + recon_and_store(dest + 3 * stride, in[3]); + recon_and_store(dest + 4 * stride, in[4]); + recon_and_store(dest + 5 * stride, in[5]); + recon_and_store(dest + 6 * stride, in[6]); + recon_and_store(dest + 7 * stride, in[7]); +} + +static INLINE void load_buffer_8x16(const tran_low_t *const input, + __m128i *const in) { + in[0] = load_input_data8(input + 0 * 16); + in[1] = load_input_data8(input + 1 * 16); + in[2] = load_input_data8(input + 2 * 16); + in[3] = load_input_data8(input + 3 * 16); + in[4] = load_input_data8(input + 4 * 16); + in[5] = load_input_data8(input + 5 * 16); + in[6] = load_input_data8(input + 6 * 16); + in[7] = load_input_data8(input + 7 * 16); + + in[8] = load_input_data8(input + 8 * 16); + in[9] = load_input_data8(input + 9 * 16); + in[10] = load_input_data8(input + 10 * 16); + in[11] = load_input_data8(input + 11 * 16); + in[12] = load_input_data8(input + 12 * 16); + in[13] = load_input_data8(input + 13 * 16); + in[14] = load_input_data8(input + 14 * 16); + in[15] = load_input_data8(input + 15 * 16); +} + +static INLINE void write_buffer_8x16(uint8_t *const dest, __m128i *const in, + const int stride) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); + + recon_and_store(dest + 0 * stride, in[0]); + recon_and_store(dest + 1 * stride, in[1]); + recon_and_store(dest + 2 * stride, in[2]); + recon_and_store(dest + 3 * stride, in[3]); + recon_and_store(dest + 4 * stride, in[4]); + recon_and_store(dest + 5 * stride, in[5]); + recon_and_store(dest + 6 * stride, in[6]); + recon_and_store(dest + 7 * stride, in[7]); + recon_and_store(dest + 8 * stride, in[8]); + recon_and_store(dest + 9 * stride, in[9]); + recon_and_store(dest + 10 * stride, in[10]); + recon_and_store(dest + 11 * stride, in[11]); + recon_and_store(dest + 12 * stride, in[12]); + recon_and_store(dest + 13 * stride, in[13]); + recon_and_store(dest + 14 * stride, in[14]); + recon_and_store(dest + 15 * stride, in[15]); } void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, diff --git a/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm b/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm index 30852049b..ca0897ab9 100644 --- a/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm +++ b/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm @@ -12,6 +12,8 @@ ; TODO(jackychen): Find a way to fix the duplicate. %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;void vp9_filter_by_weight16x16_sse2 ;( ; unsigned char *src, |