aboutsummaryrefslogtreecommitdiff
path: root/libvpx/vp9/common
diff options
context:
space:
mode:
Diffstat (limited to 'libvpx/vp9/common')
-rw-r--r--libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c9
-rw-r--r--libvpx/vp9/common/vp9_alloccommon.c34
-rw-r--r--libvpx/vp9/common/vp9_entropymode.c4
-rw-r--r--libvpx/vp9/common/vp9_loopfilter.c8
-rw-r--r--libvpx/vp9/common/vp9_loopfilter.h4
-rw-r--r--libvpx/vp9/common/vp9_onyxc_int.h40
-rw-r--r--libvpx/vp9/common/vp9_postproc.c6
-rw-r--r--libvpx/vp9/common/vp9_reconinter.h10
-rw-r--r--libvpx/vp9/common/vp9_rtcd_defs.pl147
-rw-r--r--libvpx/vp9/common/vp9_thread_common.c7
-rw-r--r--libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c114
-rw-r--r--libvpx/vp9/common/x86/vp9_mfqe_sse2.asm2
12 files changed, 195 insertions, 190 deletions
diff --git a/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
index dd1ea03b6..025254c3f 100644
--- a/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
+++ b/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
@@ -14,14 +14,7 @@
#include "./vp9_rtcd.h"
#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"
-
-static int16_t sinpi_1_9 = 0x14a3;
-static int16_t sinpi_2_9 = 0x26c9;
-static int16_t sinpi_3_9 = 0x3441;
-static int16_t sinpi_4_9 = 0x3b6c;
-static int16_t cospi_8_64 = 0x3b21;
-static int16_t cospi_16_64 = 0x2d41;
-static int16_t cospi_24_64 = 0x187e;
+#include "vpx_dsp/txfm_common.h"
static INLINE void TRANSPOSE4X4(int16x8_t *q8s16, int16x8_t *q9s16) {
int32x4_t q8s32, q9s32;
diff --git a/libvpx/vp9/common/vp9_alloccommon.c b/libvpx/vp9/common/vp9_alloccommon.c
index 66aa733b9..7345e259b 100644
--- a/libvpx/vp9/common/vp9_alloccommon.c
+++ b/libvpx/vp9/common/vp9_alloccommon.c
@@ -17,24 +17,6 @@
#include "vp9/common/vp9_entropymv.h"
#include "vp9/common/vp9_onyxc_int.h"
-// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
-// frame reference count.
-void lock_buffer_pool(BufferPool *const pool) {
-#if CONFIG_MULTITHREAD
- pthread_mutex_lock(&pool->pool_mutex);
-#else
- (void)pool;
-#endif
-}
-
-void unlock_buffer_pool(BufferPool *const pool) {
-#if CONFIG_MULTITHREAD
- pthread_mutex_unlock(&pool->pool_mutex);
-#else
- (void)pool;
-#endif
-}
-
void vp9_set_mb_mi(VP9_COMMON *cm, int width, int height) {
const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
@@ -62,8 +44,7 @@ static int alloc_seg_map(VP9_COMMON *cm, int seg_map_size) {
cm->prev_seg_map_idx = 1;
cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx];
- if (!cm->frame_parallel_decode)
- cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx];
+ cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx];
return 0;
}
@@ -77,20 +58,18 @@ static void free_seg_map(VP9_COMMON *cm) {
}
cm->current_frame_seg_map = NULL;
-
- if (!cm->frame_parallel_decode) {
- cm->last_frame_seg_map = NULL;
- }
+ cm->last_frame_seg_map = NULL;
}
void vp9_free_ref_frame_buffers(BufferPool *pool) {
int i;
for (i = 0; i < FRAME_BUFFERS; ++i) {
- if (pool->frame_bufs[i].ref_count > 0 &&
+ if (!pool->frame_bufs[i].released &&
pool->frame_bufs[i].raw_frame_buffer.data != NULL) {
pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer);
pool->frame_bufs[i].ref_count = 0;
+ pool->frame_bufs[i].released = 1;
}
vpx_free(pool->frame_bufs[i].mvs);
pool->frame_bufs[i].mvs = NULL;
@@ -176,6 +155,9 @@ fail:
}
void vp9_remove_common(VP9_COMMON *cm) {
+#if CONFIG_VP9_POSTPROC
+ vp9_free_postproc_buffers(cm);
+#endif
vp9_free_context_buffers(cm);
vpx_free(cm->fc);
@@ -186,7 +168,7 @@ void vp9_remove_common(VP9_COMMON *cm) {
void vp9_init_context_buffers(VP9_COMMON *cm) {
cm->setup_mi(cm);
- if (cm->last_frame_seg_map && !cm->frame_parallel_decode)
+ if (cm->last_frame_seg_map)
memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols);
}
diff --git a/libvpx/vp9/common/vp9_entropymode.c b/libvpx/vp9/common/vp9_entropymode.c
index bcb9e8f29..47cd63e94 100644
--- a/libvpx/vp9/common/vp9_entropymode.c
+++ b/libvpx/vp9/common/vp9_entropymode.c
@@ -428,7 +428,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm) {
vp9_clearall_segfeatures(&cm->seg);
cm->seg.abs_delta = SEGMENT_DELTADATA;
- if (cm->last_frame_seg_map && !cm->frame_parallel_decode)
+ if (cm->last_frame_seg_map)
memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
if (cm->current_frame_seg_map)
@@ -457,7 +457,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm) {
}
// prev_mip will only be allocated in encoder.
- if (frame_is_intra_only(cm) && cm->prev_mip && !cm->frame_parallel_decode)
+ if (frame_is_intra_only(cm) && cm->prev_mip)
memset(cm->prev_mip, 0,
cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->prev_mip));
diff --git a/libvpx/vp9/common/vp9_loopfilter.c b/libvpx/vp9/common/vp9_loopfilter.c
index ef0297dd5..c7c343aed 100644
--- a/libvpx/vp9/common/vp9_loopfilter.c
+++ b/libvpx/vp9/common/vp9_loopfilter.c
@@ -1612,12 +1612,14 @@ void vp9_loop_filter_data_reset(
void vp9_reset_lfm(VP9_COMMON *const cm) {
if (cm->lf.filter_level) {
- memset(cm->lf.lfm, 0, ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) *
- cm->lf.lfm_stride * sizeof(*cm->lf.lfm));
+ memset(cm->lf.lfm, 0,
+ ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) * cm->lf.lfm_stride *
+ sizeof(*cm->lf.lfm));
}
}
-int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused) {
+int vp9_loop_filter_worker(void *arg1, void *unused) {
+ LFWorkerData *const lf_data = (LFWorkerData *)arg1;
(void)unused;
loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
lf_data->start, lf_data->stop, lf_data->y_only);
diff --git a/libvpx/vp9/common/vp9_loopfilter.h b/libvpx/vp9/common/vp9_loopfilter.h
index da37a6ebd..481a6cdc6 100644
--- a/libvpx/vp9/common/vp9_loopfilter.h
+++ b/libvpx/vp9/common/vp9_loopfilter.h
@@ -151,8 +151,8 @@ void vp9_loop_filter_data_reset(
LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer,
struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]);
-// Operates on the rows described by 'lf_data'.
-int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused);
+// Operates on the rows described by 'arg1' (cast to LFWorkerData *).
+int vp9_loop_filter_worker(void *arg1, void *unused);
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/libvpx/vp9/common/vp9_onyxc_int.h b/libvpx/vp9/common/vp9_onyxc_int.h
index 32db7b7aa..1d96d92c2 100644
--- a/libvpx/vp9/common/vp9_onyxc_int.h
+++ b/libvpx/vp9/common/vp9_onyxc_int.h
@@ -37,13 +37,10 @@ extern "C" {
#define REF_FRAMES_LOG2 3
#define REF_FRAMES (1 << REF_FRAMES_LOG2)
-// 4 scratch frames for the new frames to support a maximum of 4 cores decoding
-// in parallel, 3 for scaled references on the encoder.
-// TODO(hkuang): Add ondemand frame buffers instead of hardcoding the number
-// of framebuffers.
+// 1 scratch frame for the new frame, 3 for scaled references on the encoder.
// TODO(jkoleszar): These 3 extra references could probably come from the
// normal reference pool.
-#define FRAME_BUFFERS (REF_FRAMES + 7)
+#define FRAME_BUFFERS (REF_FRAMES + 4)
#define FRAME_CONTEXTS_LOG2 2
#define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2)
@@ -72,30 +69,12 @@ typedef struct {
MV_REF *mvs;
int mi_rows;
int mi_cols;
+ uint8_t released;
vpx_codec_frame_buffer_t raw_frame_buffer;
YV12_BUFFER_CONFIG buf;
-
- // The Following variables will only be used in frame parallel decode.
-
- // frame_worker_owner indicates which FrameWorker owns this buffer. NULL means
- // that no FrameWorker owns, or is decoding, this buffer.
- VPxWorker *frame_worker_owner;
-
- // row and col indicate which position frame has been decoded to in real
- // pixel unit. They are reset to -1 when decoding begins and set to INT_MAX
- // when the frame is fully decoded.
- int row;
- int col;
} RefCntBuffer;
typedef struct BufferPool {
-// Protect BufferPool from being accessed by several FrameWorkers at
-// the same time during frame parallel decode.
-// TODO(hkuang): Try to use atomic variable instead of locking the whole pool.
-#if CONFIG_MULTITHREAD
- pthread_mutex_t pool_mutex;
-#endif
-
// Private data associated with the frame buffer callbacks.
void *cb_priv;
@@ -235,10 +214,6 @@ typedef struct VP9Common {
struct loopfilter lf;
struct segmentation seg;
- // TODO(hkuang): Remove this as it is the same as frame_parallel_decode
- // in pbi.
- int frame_parallel_decode; // frame-based threading.
-
// Context probabilities for reference frame prediction
MV_REFERENCE_FRAME comp_fixed_ref;
MV_REFERENCE_FRAME comp_var_ref[2];
@@ -283,11 +258,6 @@ typedef struct VP9Common {
int above_context_alloc_cols;
} VP9_COMMON;
-// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
-// frame reference count.
-void lock_buffer_pool(BufferPool *const pool);
-void unlock_buffer_pool(BufferPool *const pool);
-
static INLINE YV12_BUFFER_CONFIG *get_ref_frame(VP9_COMMON *cm, int index) {
if (index < 0 || index >= REF_FRAMES) return NULL;
if (cm->ref_frame_map[index] < 0) return NULL;
@@ -303,7 +273,6 @@ static INLINE int get_free_fb(VP9_COMMON *cm) {
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
int i;
- lock_buffer_pool(cm->buffer_pool);
for (i = 0; i < FRAME_BUFFERS; ++i)
if (frame_bufs[i].ref_count == 0) break;
@@ -314,7 +283,6 @@ static INLINE int get_free_fb(VP9_COMMON *cm) {
i = INVALID_IDX;
}
- unlock_buffer_pool(cm->buffer_pool);
return i;
}
@@ -342,7 +310,7 @@ static INLINE void set_partition_probs(const VP9_COMMON *const cm,
xd->partition_probs =
frame_is_intra_only(cm)
? &vp9_kf_partition_probs[0]
- : (const vpx_prob(*)[PARTITION_TYPES - 1])cm->fc->partition_prob;
+ : (const vpx_prob(*)[PARTITION_TYPES - 1]) cm->fc->partition_prob;
}
static INLINE void vp9_init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd,
diff --git a/libvpx/vp9/common/vp9_postproc.c b/libvpx/vp9/common/vp9_postproc.c
index b105e5d45..dfc315eea 100644
--- a/libvpx/vp9/common/vp9_postproc.c
+++ b/libvpx/vp9/common/vp9_postproc.c
@@ -380,7 +380,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest,
// if mfqe is enabled. Need to take both the quality and the speed
// into consideration.
if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) {
- vp8_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int);
+ vpx_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int);
}
if ((flags & VP9D_DEMACROBLOCK) && cm->post_proc_buffer_int.buffer_alloc) {
deblock_and_de_macro_block(&cm->post_proc_buffer_int, ppbuf,
@@ -390,7 +390,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest,
vp9_deblock(&cm->post_proc_buffer_int, ppbuf, q,
cm->postproc_state.limits);
} else {
- vp8_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf);
+ vpx_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf);
}
} else if (flags & VP9D_DEMACROBLOCK) {
deblock_and_de_macro_block(cm->frame_to_show, ppbuf,
@@ -399,7 +399,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest,
} else if (flags & VP9D_DEBLOCK) {
vp9_deblock(cm->frame_to_show, ppbuf, q, cm->postproc_state.limits);
} else {
- vp8_yv12_copy_frame(cm->frame_to_show, ppbuf);
+ vpx_yv12_copy_frame(cm->frame_to_show, ppbuf);
}
ppstate->last_base_qindex = cm->base_qindex;
diff --git a/libvpx/vp9/common/vp9_reconinter.h b/libvpx/vp9/common/vp9_reconinter.h
index 1b09b380d..bb9291a26 100644
--- a/libvpx/vp9/common/vp9_reconinter.h
+++ b/libvpx/vp9/common/vp9_reconinter.h
@@ -26,9 +26,9 @@ static INLINE void inter_predictor(const uint8_t *src, int src_stride,
const struct scale_factors *sf, int w, int h,
int ref, const InterpKernel *kernel, int xs,
int ys) {
- sf->predict[subpel_x != 0][subpel_y != 0][ref](
- src, src_stride, dst, dst_stride, kernel[subpel_x], xs, kernel[subpel_y],
- ys, w, h);
+ sf->predict[subpel_x != 0][subpel_y != 0][ref](src, src_stride, dst,
+ dst_stride, kernel, subpel_x,
+ xs, subpel_y, ys, w, h);
}
#if CONFIG_VP9_HIGHBITDEPTH
@@ -37,8 +37,8 @@ static INLINE void highbd_inter_predictor(
const int subpel_x, const int subpel_y, const struct scale_factors *sf,
int w, int h, int ref, const InterpKernel *kernel, int xs, int ys, int bd) {
sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref](
- src, src_stride, dst, dst_stride, kernel[subpel_x], xs, kernel[subpel_y],
- ys, w, h, bd);
+ src, src_stride, dst, dst_stride, kernel, subpel_x, xs, subpel_y, ys, w,
+ h, bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libvpx/vp9/common/vp9_rtcd_defs.pl b/libvpx/vp9/common/vp9_rtcd_defs.pl
index baf63e97f..22b67ecac 100644
--- a/libvpx/vp9/common/vp9_rtcd_defs.pl
+++ b/libvpx/vp9/common/vp9_rtcd_defs.pl
@@ -1,3 +1,13 @@
+##
+## Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+
sub vp9_common_forward_decls() {
print <<EOF
/*
@@ -30,6 +40,7 @@ if ($opts{arch} eq "x86_64") {
$ssse3_x86_64 = 'ssse3';
$avx_x86_64 = 'avx';
$avx2_x86_64 = 'avx2';
+ $avx512_x86_64 = 'avx512';
}
#
@@ -46,41 +57,24 @@ specialize qw/vp9_filter_by_weight8x8 sse2 msa/;
#
# dct
#
-if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
- # Force C versions if CONFIG_EMULATE_HARDWARE is 1
- if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
- add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
-
- add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
-
- add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
- } else {
- add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
- specialize qw/vp9_iht4x4_16_add sse2/;
-
- add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
- specialize qw/vp9_iht8x8_64_add sse2/;
-
- add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
- specialize qw/vp9_iht16x16_256_add sse2/;
- }
-} else {
- # Force C versions if CONFIG_EMULATE_HARDWARE is 1
- if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
- add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
-
- add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
-
- add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
- } else {
- add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
- specialize qw/vp9_iht4x4_16_add sse2 neon dspr2 msa/;
-
- add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
- specialize qw/vp9_iht8x8_64_add sse2 neon dspr2 msa/;
-
- add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
- specialize qw/vp9_iht16x16_256_add sse2 dspr2 msa/;
+# Force C versions if CONFIG_EMULATE_HARDWARE is 1
+add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
+
+add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
+
+add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+
+if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
+ # Note that there are more specializations appended when
+ # CONFIG_VP9_HIGHBITDEPTH is off.
+ specialize qw/vp9_iht4x4_16_add sse2/;
+ specialize qw/vp9_iht8x8_64_add sse2/;
+ specialize qw/vp9_iht16x16_256_add sse2/;
+ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") {
+ # Note that these specializations are appended to the above ones.
+ specialize qw/vp9_iht4x4_16_add neon dspr2 msa/;
+ specialize qw/vp9_iht8x8_64_add neon dspr2 msa/;
+ specialize qw/vp9_iht16x16_256_add dspr2 msa/;
}
}
@@ -124,82 +118,69 @@ if (vpx_config("CONFIG_VP9_TEMPORAL_DENOISING") eq "yes") {
specialize qw/vp9_denoiser_filter neon sse2/;
}
-if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
- add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
- specialize qw/vp9_block_error avx2 sse2/;
+add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
- add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
- specialize qw/vp9_highbd_block_error sse2/;
+add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";
- add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";
- specialize qw/vp9_block_error_fp sse2/;
+add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64";
- add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64";
+add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+specialize qw/vp9_quantize_fp_32x32 neon/, "$ssse3_x86_64";
- add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";
+add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ specialize qw/vp9_block_error avx2 sse2/;
+
+ specialize qw/vp9_block_error_fp avx2 sse2/;
- add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_fdct8x8_quant neon ssse3/;
+
+ add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
+ specialize qw/vp9_highbd_block_error sse2/;
} else {
- add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
specialize qw/vp9_block_error avx2 msa sse2/;
- add_proto qw/int64_t vp9_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
- specialize qw/vp9_block_error_fp neon sse2/;
-
- add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64";
+ specialize qw/vp9_block_error_fp neon avx2 sse2/;
- add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";
-
- add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_fdct8x8_quant sse2 ssse3 neon/;
}
# fdct functions
-if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
- add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/vp9_fht4x4 sse2/;
-
- add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/vp9_fht8x8 sse2/;
+add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/vp9_fht16x16 sse2/;
+add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fwht4x4 sse2/;
-} else {
- add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/vp9_fht4x4 sse2 msa/;
+add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/vp9_fht8x8 sse2 msa/;
+add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
- add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/vp9_fht16x16 sse2 msa/;
-
- add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fwht4x4 msa sse2/;
+# Note that there are more specializations appended when CONFIG_VP9_HIGHBITDEPTH
+# is off.
+specialize qw/vp9_fht4x4 sse2/;
+specialize qw/vp9_fht8x8 sse2/;
+specialize qw/vp9_fht16x16 sse2/;
+specialize qw/vp9_fwht4x4 sse2/;
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") {
+ # Note that these specializations are appended to the above ones.
+ specialize qw/vp9_fht4x4 msa/;
+ specialize qw/vp9_fht8x8 msa/;
+ specialize qw/vp9_fht16x16 msa/;
+ specialize qw/vp9_fwht4x4 msa/;
}
#
# Motion search
#
-add_proto qw/int vp9_full_search_sad/, "const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv";
-specialize qw/vp9_full_search_sad sse3 sse4_1/;
-$vp9_full_search_sad_sse3=vp9_full_search_sadx3;
-$vp9_full_search_sad_sse4_1=vp9_full_search_sadx8;
-
add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
specialize qw/vp9_diamond_search_sad avx/;
+if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") {
add_proto qw/void vp9_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count";
specialize qw/vp9_temporal_filter_apply sse4_1/;
+}
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
@@ -227,7 +208,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# frame based scale
#
add_proto qw/void vp9_scale_and_extend_frame/, "const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler";
-specialize qw/vp9_scale_and_extend_frame ssse3/;
+specialize qw/vp9_scale_and_extend_frame neon ssse3/;
}
# end encoder functions
diff --git a/libvpx/vp9/common/vp9_thread_common.c b/libvpx/vp9/common/vp9_thread_common.c
index 07e659d23..8d44e91f2 100644
--- a/libvpx/vp9/common/vp9_thread_common.c
+++ b/libvpx/vp9/common/vp9_thread_common.c
@@ -140,8 +140,9 @@ static INLINE void thread_loop_filter_rows(
}
// Row-based multi-threaded loopfilter hook
-static int loop_filter_row_worker(VP9LfSync *const lf_sync,
- LFWorkerData *const lf_data) {
+static int loop_filter_row_worker(void *arg1, void *arg2) {
+ VP9LfSync *const lf_sync = (VP9LfSync *)arg1;
+ LFWorkerData *const lf_data = (LFWorkerData *)arg2;
thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
lf_data->start, lf_data->stop, lf_data->y_only,
lf_sync);
@@ -183,7 +184,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm,
VPxWorker *const worker = &workers[i];
LFWorkerData *const lf_data = &lf_sync->lfdata[i];
- worker->hook = (VPxWorkerHook)loop_filter_row_worker;
+ worker->hook = loop_filter_row_worker;
worker->data1 = lf_sync;
worker->data2 = lf_data;
diff --git a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
index bb2dcf52b..6996260e2 100644
--- a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -18,8 +18,8 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
__m128i in[2];
const __m128i eight = _mm_set1_epi16(8);
- in[0] = load_input_data(input);
- in[1] = load_input_data(input + 8);
+ in[0] = load_input_data8(input);
+ in[1] = load_input_data8(input + 8);
switch (tx_type) {
case 0: // DCT_DCT
@@ -54,18 +54,17 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
__m128i in[8];
- const __m128i zero = _mm_setzero_si128();
const __m128i final_rounding = _mm_set1_epi16(1 << 4);
// load input data
- in[0] = load_input_data(input);
- in[1] = load_input_data(input + 8 * 1);
- in[2] = load_input_data(input + 8 * 2);
- in[3] = load_input_data(input + 8 * 3);
- in[4] = load_input_data(input + 8 * 4);
- in[5] = load_input_data(input + 8 * 5);
- in[6] = load_input_data(input + 8 * 6);
- in[7] = load_input_data(input + 8 * 7);
+ in[0] = load_input_data8(input);
+ in[1] = load_input_data8(input + 8 * 1);
+ in[2] = load_input_data8(input + 8 * 2);
+ in[3] = load_input_data8(input + 8 * 3);
+ in[4] = load_input_data8(input + 8 * 4);
+ in[5] = load_input_data8(input + 8 * 5);
+ in[6] = load_input_data8(input + 8 * 6);
+ in[7] = load_input_data8(input + 8 * 7);
switch (tx_type) {
case 0: // DCT_DCT
@@ -106,14 +105,91 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
in[6] = _mm_srai_epi16(in[6], 5);
in[7] = _mm_srai_epi16(in[7], 5);
- RECON_AND_STORE(dest + 0 * stride, in[0]);
- RECON_AND_STORE(dest + 1 * stride, in[1]);
- RECON_AND_STORE(dest + 2 * stride, in[2]);
- RECON_AND_STORE(dest + 3 * stride, in[3]);
- RECON_AND_STORE(dest + 4 * stride, in[4]);
- RECON_AND_STORE(dest + 5 * stride, in[5]);
- RECON_AND_STORE(dest + 6 * stride, in[6]);
- RECON_AND_STORE(dest + 7 * stride, in[7]);
+ recon_and_store(dest + 0 * stride, in[0]);
+ recon_and_store(dest + 1 * stride, in[1]);
+ recon_and_store(dest + 2 * stride, in[2]);
+ recon_and_store(dest + 3 * stride, in[3]);
+ recon_and_store(dest + 4 * stride, in[4]);
+ recon_and_store(dest + 5 * stride, in[5]);
+ recon_and_store(dest + 6 * stride, in[6]);
+ recon_and_store(dest + 7 * stride, in[7]);
+}
+
+static INLINE void load_buffer_8x16(const tran_low_t *const input,
+ __m128i *const in) {
+ in[0] = load_input_data8(input + 0 * 16);
+ in[1] = load_input_data8(input + 1 * 16);
+ in[2] = load_input_data8(input + 2 * 16);
+ in[3] = load_input_data8(input + 3 * 16);
+ in[4] = load_input_data8(input + 4 * 16);
+ in[5] = load_input_data8(input + 5 * 16);
+ in[6] = load_input_data8(input + 6 * 16);
+ in[7] = load_input_data8(input + 7 * 16);
+
+ in[8] = load_input_data8(input + 8 * 16);
+ in[9] = load_input_data8(input + 9 * 16);
+ in[10] = load_input_data8(input + 10 * 16);
+ in[11] = load_input_data8(input + 11 * 16);
+ in[12] = load_input_data8(input + 12 * 16);
+ in[13] = load_input_data8(input + 13 * 16);
+ in[14] = load_input_data8(input + 14 * 16);
+ in[15] = load_input_data8(input + 15 * 16);
+}
+
+static INLINE void write_buffer_8x16(uint8_t *const dest, __m128i *const in,
+ const int stride) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ // Final rounding and shift
+ in[0] = _mm_adds_epi16(in[0], final_rounding);
+ in[1] = _mm_adds_epi16(in[1], final_rounding);
+ in[2] = _mm_adds_epi16(in[2], final_rounding);
+ in[3] = _mm_adds_epi16(in[3], final_rounding);
+ in[4] = _mm_adds_epi16(in[4], final_rounding);
+ in[5] = _mm_adds_epi16(in[5], final_rounding);
+ in[6] = _mm_adds_epi16(in[6], final_rounding);
+ in[7] = _mm_adds_epi16(in[7], final_rounding);
+ in[8] = _mm_adds_epi16(in[8], final_rounding);
+ in[9] = _mm_adds_epi16(in[9], final_rounding);
+ in[10] = _mm_adds_epi16(in[10], final_rounding);
+ in[11] = _mm_adds_epi16(in[11], final_rounding);
+ in[12] = _mm_adds_epi16(in[12], final_rounding);
+ in[13] = _mm_adds_epi16(in[13], final_rounding);
+ in[14] = _mm_adds_epi16(in[14], final_rounding);
+ in[15] = _mm_adds_epi16(in[15], final_rounding);
+
+ in[0] = _mm_srai_epi16(in[0], 6);
+ in[1] = _mm_srai_epi16(in[1], 6);
+ in[2] = _mm_srai_epi16(in[2], 6);
+ in[3] = _mm_srai_epi16(in[3], 6);
+ in[4] = _mm_srai_epi16(in[4], 6);
+ in[5] = _mm_srai_epi16(in[5], 6);
+ in[6] = _mm_srai_epi16(in[6], 6);
+ in[7] = _mm_srai_epi16(in[7], 6);
+ in[8] = _mm_srai_epi16(in[8], 6);
+ in[9] = _mm_srai_epi16(in[9], 6);
+ in[10] = _mm_srai_epi16(in[10], 6);
+ in[11] = _mm_srai_epi16(in[11], 6);
+ in[12] = _mm_srai_epi16(in[12], 6);
+ in[13] = _mm_srai_epi16(in[13], 6);
+ in[14] = _mm_srai_epi16(in[14], 6);
+ in[15] = _mm_srai_epi16(in[15], 6);
+
+ recon_and_store(dest + 0 * stride, in[0]);
+ recon_and_store(dest + 1 * stride, in[1]);
+ recon_and_store(dest + 2 * stride, in[2]);
+ recon_and_store(dest + 3 * stride, in[3]);
+ recon_and_store(dest + 4 * stride, in[4]);
+ recon_and_store(dest + 5 * stride, in[5]);
+ recon_and_store(dest + 6 * stride, in[6]);
+ recon_and_store(dest + 7 * stride, in[7]);
+ recon_and_store(dest + 8 * stride, in[8]);
+ recon_and_store(dest + 9 * stride, in[9]);
+ recon_and_store(dest + 10 * stride, in[10]);
+ recon_and_store(dest + 11 * stride, in[11]);
+ recon_and_store(dest + 12 * stride, in[12]);
+ recon_and_store(dest + 13 * stride, in[13]);
+ recon_and_store(dest + 14 * stride, in[14]);
+ recon_and_store(dest + 15 * stride, in[15]);
}
void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
diff --git a/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm b/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm
index 30852049b..ca0897ab9 100644
--- a/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm
+++ b/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm
@@ -12,6 +12,8 @@
; TODO(jackychen): Find a way to fix the duplicate.
%include "vpx_ports/x86_abi_support.asm"
+SECTION .text
+
;void vp9_filter_by_weight16x16_sse2
;(
; unsigned char *src,